From 08a33805518e7845486f88287e8aace6f8439391 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Wed, 9 Mar 2016 11:30:12 -0800 Subject: iio: core: implement iio_device_{claim|release}_direct_mode() It is often the case that the driver wants to be sure a device stays in direct mode while it is executing a task or series of tasks. To accomplish this today, the driver performs this sequence: 1) take the device state lock, 2) verify it is not in a buffered mode, 3) execute some tasks, and 4) release that lock. This patch introduces a pair of helper functions that simplify these steps and make the code more semantically expressive. iio_device_claim_direct_mode() If the device is not in any buffered mode it is guaranteed to stay that way until iio_device_release_direct_mode() is called. iio_device_release_direct_mode() Release the claim. The device is no longer guaranteed to stay in direct mode. Signed-off-by: Alison Schofield Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index b2b16772c651..0b2773ada0ba 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -527,6 +527,8 @@ void iio_device_unregister(struct iio_dev *indio_dev); int devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev); void devm_iio_device_unregister(struct device *dev, struct iio_dev *indio_dev); int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp); +int iio_device_claim_direct_mode(struct iio_dev *indio_dev); +void iio_device_release_direct_mode(struct iio_dev *indio_dev); extern struct bus_type iio_bus_type; -- cgit v1.2.3 From 49db08c358873af11ba3c25401de88156fa5d365 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 19 Feb 2016 15:36:07 +0100 Subject: chrdev: emit a warning when we go below dynamic major range Currently a dynamically allocated character device major is taken from 254 and downward. This mechanism is used for RTC, IIO and a few other subsystems. The kernel currently has no check preventing these dynamic allocations from eating into the assigned numbers at 233 and downward. In a recent test it was reported that so many dynamic device majors were used on a test server that the major number for infiniband (231) was stolen. This occurred when allocating a new major number for GPIO chips. The error messages from the kernel were not helpful. (See: https://lkml.org/lkml/2016/2/14/124) This patch adds a defined lower limit to the dynamic major allocation region and henceforth emits a warning if we start to eat into the assigned numbers. It does not make any semantic changes and will not change the kernel's behaviour: numbers will still continue to be stolen, but we will know from dmesg what is going on. This also updates Documentation/devices.txt to clearly reflect that we are using this range of major numbers for dynamic allocation. Reported-by: Ying Huang Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Alan Cox Cc: Arnd Bergmann Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- Documentation/devices.txt | 6 +++--- fs/char_dev.c | 4 ++++ include/linux/fs.h | 2 ++ 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 87b4c5e82d39..0a3588a9798d 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt @@ -3099,9 +3099,9 @@ Your cooperation is appreciated. 
129 = /dev/ipath_sma Device used by Subnet Management Agent 130 = /dev/ipath_diag Device used by diagnostics programs -234-239 UNASSIGNED - -240-254 char LOCAL/EXPERIMENTAL USE +234-254 char RESERVED FOR DYNAMIC ASSIGNMENT + Character devices that request a dynamic allocation of major number will + take numbers starting from 254 and downward. 240-254 block LOCAL/EXPERIMENTAL USE Allocated for local/experimental use. For devices not diff --git a/fs/char_dev.c b/fs/char_dev.c index 24b142569ca9..687471dc04a0 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -91,6 +91,10 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, break; } + if (i < CHRDEV_MAJOR_DYN_END) + pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range", + name, i); + if (i == 0) { ret = -EBUSY; goto out; diff --git a/include/linux/fs.h b/include/linux/fs.h index 14a97194b34b..60082be96de8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2385,6 +2385,8 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, /* fs/char_dev.c */ #define CHRDEV_MAJOR_HASH_SIZE 255 +/* Marks the bottom of the first segment of free char majors */ +#define CHRDEV_MAJOR_DYN_END 234 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); extern int register_chrdev_region(dev_t, unsigned, const char *); extern int __register_chrdev(unsigned int major, unsigned int baseminor, -- cgit v1.2.3 From b3c1be1b789cca6d3e39c950dfed690f0511fe76 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Fri, 22 Jan 2016 11:28:07 -0500 Subject: base: isa: Remove X86_32 dependency Many motherboards utilize an LPC-to-ISA bridge in order to decode ISA-style port-mapped I/O addresses. This is particularly true for embedded motherboards supporting the PC/104 bus (a bus specification derived from ISA). These motherboards are now commonly running 64-bit x86 processors. The X86_32 dependency should be removed from the ISA bus configuration option in order to support these newer motherboards. A new config option, CONFIG_ISA_BUS, is introduced to allow the ISA bus driver to be compiled independently of the CONFIG_ISA option. Devices which communicate via ISA-compatible buses can now be supported independently of the dependencies of the CONFIG_ISA option. Signed-off-by: William Breathitt Gray Reviewed-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 6 ++++++ drivers/base/Makefile | 2 +- include/linux/isa.h | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2dc18605831f..a5977986f38b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2472,10 +2472,16 @@ config ISA_DMA_API Enables ISA-style DMA support for devices requiring such controllers. If unsure, say Y. +config ISA_BUS + bool "ISA bus support" + help + Enables ISA bus support for devices requiring such controllers. + if X86_32 config ISA bool "ISA support" + depends on ISA_BUS ---help--- Find out whether you have ISA slots on your motherboard. ISA is the name of a bus system, i.e. 
the way the CPU talks to the other stuff diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 6b2a84e7f2be..4ebfb81cc7e9 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_DMA_CMA) += dma-contiguous.o obj-y += power/ obj-$(CONFIG_HAS_DMA) += dma-mapping.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o -obj-$(CONFIG_ISA) += isa.o +obj-$(CONFIG_ISA_BUS) += isa.o obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o diff --git a/include/linux/isa.h b/include/linux/isa.h index b0270e3814c8..2a02862775eb 100644 --- a/include/linux/isa.h +++ b/include/linux/isa.h @@ -22,7 +22,7 @@ struct isa_driver { #define to_isa_driver(x) container_of((x), struct isa_driver, driver) -#ifdef CONFIG_ISA +#ifdef CONFIG_ISA_BUS int isa_register_driver(struct isa_driver *, unsigned int); void isa_unregister_driver(struct isa_driver *); #else -- cgit v1.2.3 From f235541699bcf14fb8be797c6bc1d7106df0eb64 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 22 Jan 2016 01:32:26 -0500 Subject: export.h: allow for per-symbol configurable EXPORT_SYMBOL() Similar to include/generated/autoconf.h, include/generated/autoksyms.h will contain a list of defines for each EXPORT_SYMBOL() that we want active. The format is: #define __KSYM_<sym> 1 This list will be auto-generated with another patch. For now we only include the preprocessor magic to automatically create or omit the corresponding struct kernel_symbol declaration. Given that the content of include/generated/autoksyms.h may not be known in advance, an empty file is created early on to let the build proceed. Signed-off-by: Nicolas Pitre Acked-by: Rusty Russell --- Makefile | 2 ++ include/linux/export.h | 22 ++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Makefile b/Makefile index 916b26e999d8..451acbebee97 100644 --- a/Makefile +++ b/Makefile @@ -998,6 +998,8 @@ prepare2: prepare3 outputmakefile asm-generic prepare1: prepare2 $(version_h) include/generated/utsrelease.h \ include/config/auto.conf $(cmd_crmodverdir) + $(Q)test -e include/generated/autoksyms.h || \ + touch include/generated/autoksyms.h archprepare: archheaders archscripts prepare1 scripts_basic diff --git a/include/linux/export.h b/include/linux/export.h index 96e45ea463e7..77afdb2a2506 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -38,7 +38,7 @@ extern struct module __this_module; #ifdef CONFIG_MODULES -#ifndef __GENKSYMS__ +#if defined(__KERNEL__) && !defined(__GENKSYMS__) #ifdef CONFIG_MODVERSIONS /* Mark the CRC weak since genksyms apparently decides not to * generate a checksums for some symbols */ @@ -53,7 +53,7 @@ extern struct module __this_module; #endif /* For every exported symbol, place a struct in the __ksymtab section */ -#define __EXPORT_SYMBOL(sym, sec) \ +#define ___EXPORT_SYMBOL(sym, sec) \ extern typeof(sym) sym; \ __CRC_SYMBOL(sym, sec) \ static const char __kstrtab_##sym[] \ @@ -65,6 +65,24 @@ extern struct module __this_module; __attribute__((section("___ksymtab" sec "+" #sym), unused)) \ = { (unsigned long)&sym, __kstrtab_##sym } +#ifdef CONFIG_TRIM_UNUSED_KSYMS + +#include <generated/autoksyms.h> +#include <linux/kconfig.h> + +#define __EXPORT_SYMBOL(sym, sec) \ + __cond_export_sym(sym, sec, config_enabled(__KSYM_##sym)) +#define __cond_export_sym(sym, sec, conf) \ + ___cond_export_sym(sym, sec, conf) +#define ___cond_export_sym(sym, sec, enabled) \ + __cond_export_sym_##enabled(sym, sec) +#define 
__cond_export_sym_1(sym, sec) ___EXPORT_SYMBOL(sym, sec) +#define __cond_export_sym_0(sym, sec) /* nothing */ + +#else +#define __EXPORT_SYMBOL ___EXPORT_SYMBOL +#endif + #define EXPORT_SYMBOL(sym) \ __EXPORT_SYMBOL(sym, "") -- cgit v1.2.3 From c1a95fda2a40ae8c7aad3fa44fa7718a3710eb2d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 22 Jan 2016 13:41:57 -0500 Subject: kbuild: add fine grained build dependencies for exported symbols Like with kconfig options, we now have the ability to compile in and out individual EXPORT_SYMBOL() declarations based on the content of include/generated/autoksyms.h. However we don't want the entire world to be rebuilt whenever that file is touched. Let's apply the same build dependency trick used for CONFIG_* symbols where the time stamp of empty files whose paths match those symbols is used to trigger fine grained rebuilds. In our case the key is the symbol name passed to EXPORT_SYMBOL(). However, unlike config options, we cannot just use fixdep to parse the source code for EXPORT_SYMBOL(ksym) because several variants exist and parsing them all in a separate tool, and keeping it in sync, is not trivially maintainable. Furthermore, there are variants such as EXPORT_SYMBOL_GPL(pci_user_read_config_##size); that are instantiated via a macro for which we can't easily determine the actual exported symbol name(s) short of actually running the preprocessor on them. Storing the symbol name string in a special ELF section doesn't work for targets that output assembly or preprocessed source. So the best way is really to leverage the preprocessor by having it output actual symbol names anchored by a special sequence that can be easily filtered out. Then the list of symbols is simply fed to fixdep to be merged with the other dependencies. That implies the preprocessor is executed twice for each source file. A previous attempt relied on a warning pragma for each EXPORT_SYMBOL() instance that was filtered apart from stderr by the build system with a sed script during the actual compilation pass. Unfortunately the preprocessor/compiler diagnostic output isn't stable between versions and this solution, although more efficient, was deemed too fragile. Because of the lowercasing performed by fixdep, there might be name collisions triggering spurious rebuilds for similar symbols. But this shouldn't be a big issue in practice. (This is the case for CONFIG_* symbols and I didn't want to be different here, whatever the original reason for doing so.) To avoid needless build overhead, the exported symbol name gathering is performed only when CONFIG_TRIM_UNUSED_KSYMS is selected. Signed-off-by: Nicolas Pitre Acked-by: Rusty Russell --- include/linux/export.h | 13 ++++++++++++- scripts/Kbuild.include | 27 +++++++++++++++++++++++++++ scripts/basic/fixdep.c | 1 + 3 files changed, 40 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index 77afdb2a2506..2f9ccbe6a639 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -65,7 +65,18 @@ extern struct module __this_module; __attribute__((section("___ksymtab" sec "+" #sym), unused)) \ = { (unsigned long)&sym, __kstrtab_##sym } -#ifdef CONFIG_TRIM_UNUSED_KSYMS +#if defined(__KSYM_DEPS__) + +/* + * For fine grained build dependencies, we want to tell the build system + * about each possible exported symbol even if they're not actually exported. 
+ * We use a string pattern that is unlikely to be valid code that the build + * system filters out from the preprocessor output (see ksym_dep_filter + * in scripts/Kbuild.include). + */ +#define __EXPORT_SYMBOL(sym, sec) === __KSYM_##sym === + +#elif defined(CONFIG_TRIM_UNUSED_KSYMS) #include <generated/autoksyms.h> #include <linux/kconfig.h> diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 80ca538bfba9..a09927e02713 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -258,12 +258,39 @@ if_changed_dep = $(if $(strip $(any-prereq) $(arg-check) ), \ @set -e; \ $(cmd_and_fixdep), @:) +ifndef CONFIG_TRIM_UNUSED_KSYMS + cmd_and_fixdep = \ $(echo-cmd) $(cmd_$(1)); \ scripts/basic/fixdep $(depfile) $@ '$(make-cmd)' > $(dot-target).tmp;\ rm -f $(depfile); \ mv -f $(dot-target).tmp $(dot-target).cmd; +else + +# Filter out exported kernel symbol names from the preprocessor output. +# See also __KSYM_DEPS__ in include/linux/export.h. +# We disable the depfile generation here, so as not to overwrite the existing +# depfile while fixdep is parsing it. +flags_nodeps = $(filter-out -Wp$(comma)-M%, $($(1))) +ksym_dep_filter = \ + case "$(1)" in \ + cc_*_c) $(CPP) $(call flags_nodeps,c_flags) -D__KSYM_DEPS__ $< ;; \ + as_*_S) $(CPP) $(call flags_nodeps,a_flags) -D__KSYM_DEPS__ $< ;; \ + boot*|build*|*cpp_lds_S|dtc|host*|vdso*) : ;; \ + *) echo "Don't know how to preprocess $(1)" >&2; false ;; \ + esac | sed -rn 's/^.*=== __KSYM_(.*) ===.*$$/KSYM_\1/p' + +cmd_and_fixdep = \ + $(echo-cmd) $(cmd_$(1)); \ + $(ksym_dep_filter) | \ + scripts/basic/fixdep -e $(depfile) $@ '$(make-cmd)' \ + > $(dot-target).tmp; \ + rm -f $(depfile); \ + mv -f $(dot-target).tmp $(dot-target).cmd; + +endif + # Usage: $(call if_changed_rule,foo) # Will check if $(cmd_foo) or any of the prerequisites changed, # and if so will execute $(rule_foo). diff --git a/scripts/basic/fixdep.c b/scripts/basic/fixdep.c index 7e90a1f7de0f..746ec1ece614 100644 --- a/scripts/basic/fixdep.c +++ b/scripts/basic/fixdep.c @@ -358,6 +358,7 @@ static void parse_dep_file(void *map, size_t len) /* Ignore certain dependencies */ if (strrcmp(s, "include/generated/autoconf.h") && + strrcmp(s, "include/generated/autoksyms.h") && strrcmp(s, "arch/um/include/uml-config.h") && strrcmp(s, "include/linux/kconfig.h") && strrcmp(s, ".ver")) { -- cgit v1.2.3 From 44debe7a123cc760fc90ccbe253210798c917fa7 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 30 Mar 2016 11:26:35 +0200 Subject: vgacon: dummy implementation for vgacon_text_force MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows us to ditch a ton of ugly #ifdefs from a bunch of drm modeset drivers. v2: Make the dummy function actually return a sane value, spotted by Ville. v3: Because the patch is still in limbo there are no more drivers to convert, noticed by Emil. v4: Rebase once more, because hooray. I'll just go ahead and apply this one later on to drm-misc. 
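The conversion boils down to the usual config-stub idiom: the header declares the real function when the option is built in and supplies a static inline no-op returning a safe default otherwise, so callers need no #ifdef. A minimal sketch of the pattern follows; the example_* names are illustrative placeholders, not one of the converted drivers:

	/* header: real query when the VGA console is built in, stub otherwise */
	#ifdef CONFIG_VGA_CONSOLE
	extern bool vgacon_text_force(void);
	#else
	static inline bool vgacon_text_force(void) { return false; }
	#endif

	/* hypothetical driver init: the #ifdef around the check goes away */
	static int __init example_init(void)
	{
		if (vgacon_text_force() && example_modeset == -1)
			return -EINVAL;	/* VGA text console owns the display */
		return 0;
	}

When CONFIG_VGA_CONSOLE is off, the stub makes the condition compile-time false and the compiler discards the branch, which is why every #ifdef CONFIG_VGA_CONSOLE block in the diffs below can simply be deleted.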
Cc: Emil Velikov Cc: Ville Syrjälä Cc: Andrew Morton Cc: Greg Kroah-Hartman Reviewed-by: Emil Velikov Reviewed-by: Alex Deucher Signed-off-by: Daniel Vetter --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 -- drivers/gpu/drm/ast/ast_drv.c | 2 -- drivers/gpu/drm/cirrus/cirrus_drv.c | 2 -- drivers/gpu/drm/i915/i915_drv.c | 2 -- drivers/gpu/drm/mgag200/mgag200_drv.c | 2 -- drivers/gpu/drm/nouveau/nouveau_drm.c | 2 -- drivers/gpu/drm/qxl/qxl_drv.c | 2 -- drivers/gpu/drm/radeon/radeon_drv.c | 2 -- drivers/gpu/drm/virtio/virtgpu_drv.c | 2 -- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 2 -- include/linux/console.h | 2 ++ 11 files changed, 2 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index ce79a8b605a0..fba20bd59cfa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -539,12 +539,10 @@ static struct pci_driver amdgpu_kms_pci_driver = { static int __init amdgpu_init(void) { -#ifdef CONFIG_VGA_CONSOLE if (vgacon_text_force()) { DRM_ERROR("VGACON disables amdgpu kernel modesetting.\n"); return -EINVAL; } -#endif DRM_INFO("amdgpu kernel modesetting enabled.\n"); driver = &kms_driver; pdriver = &amdgpu_kms_pci_driver; diff --git a/drivers/gpu/drm/ast/ast_drv.c b/drivers/gpu/drm/ast/ast_drv.c index 9a32d9dfdd26..fcd9c0714836 100644 --- a/drivers/gpu/drm/ast/ast_drv.c +++ b/drivers/gpu/drm/ast/ast_drv.c @@ -218,10 +218,8 @@ static struct drm_driver driver = { static int __init ast_init(void) { -#ifdef CONFIG_VGA_CONSOLE if (vgacon_text_force() && ast_modeset == -1) return -EINVAL; -#endif if (ast_modeset == 0) return -EINVAL; diff --git a/drivers/gpu/drm/cirrus/cirrus_drv.c b/drivers/gpu/drm/cirrus/cirrus_drv.c index b1619e29a564..b394e6d8f01e 100644 --- a/drivers/gpu/drm/cirrus/cirrus_drv.c +++ b/drivers/gpu/drm/cirrus/cirrus_drv.c @@ -162,10 +162,8 @@ static struct pci_driver cirrus_pci_driver = { static int __init cirrus_init(void) { -#ifdef CONFIG_VGA_CONSOLE if (vgacon_text_force() && cirrus_modeset == -1) return -EINVAL; -#endif if (cirrus_modeset == 0) return -EINVAL; diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 44912ecebc1a..8a62690e6513 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1750,10 +1750,8 @@ static int __init i915_init(void) if (i915.modeset == 0) driver.driver_features &= ~DRIVER_MODESET; -#ifdef CONFIG_VGA_CONSOLE if (vgacon_text_force() && i915.modeset == -1) driver.driver_features &= ~DRIVER_MODESET; -#endif if (!(driver.driver_features & DRIVER_MODESET)) { /* Silently fail loading to not upset userspace. 
*/ diff --git a/drivers/gpu/drm/mgag200/mgag200_drv.c b/drivers/gpu/drm/mgag200/mgag200_drv.c index b0af77454d52..ebb470ff7200 100644 --- a/drivers/gpu/drm/mgag200/mgag200_drv.c +++ b/drivers/gpu/drm/mgag200/mgag200_drv.c @@ -116,10 +116,8 @@ static struct pci_driver mgag200_pci_driver = { static int __init mgag200_init(void) { -#ifdef CONFIG_VGA_CONSOLE if (vgacon_text_force() && mgag200_modeset == -1) return -EINVAL; -#endif if (mgag200_modeset == 0) return -EINVAL; diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index bb8498c9b13e..731c5c2a8933 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -1082,10 +1082,8 @@ nouveau_drm_init(void) nouveau_display_options(); if (nouveau_modeset == -1) { -#ifdef CONFIG_VGA_CONSOLE if (vgacon_text_force()) nouveau_modeset = 0; -#endif } if (!nouveau_modeset) diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c index 7307b07fe06b..dc9df5fe50ba 100644 --- a/drivers/gpu/drm/qxl/qxl_drv.c +++ b/drivers/gpu/drm/qxl/qxl_drv.c @@ -272,10 +272,8 @@ static struct drm_driver qxl_driver = { static int __init qxl_init(void) { -#ifdef CONFIG_VGA_CONSOLE if (vgacon_text_force() && qxl_modeset == -1) return -EINVAL; -#endif if (qxl_modeset == 0) return -EINVAL; diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index cad25557650f..ad136fc081c8 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -558,12 +558,10 @@ static struct pci_driver radeon_kms_pci_driver = { static int __init radeon_init(void) { -#ifdef CONFIG_VGA_CONSOLE if (vgacon_text_force() && radeon_modeset == -1) { DRM_INFO("VGACON disable radeon kernel modesetting.\n"); radeon_modeset = 0; } -#endif /* set to modesetting by default if not nomodeset */ if (radeon_modeset == -1) radeon_modeset = 1; diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c index 7f898cfdc746..3cc7afa77a35 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.c +++ b/drivers/gpu/drm/virtio/virtgpu_drv.c @@ -42,10 +42,8 @@ module_param_named(modeset, virtio_gpu_modeset, int, 0400); static int virtio_gpu_probe(struct virtio_device *vdev) { -#ifdef CONFIG_VGA_CONSOLE if (vgacon_text_force() && virtio_gpu_modeset == -1) return -EINVAL; -#endif if (virtio_gpu_modeset == 0) return -EINVAL; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 0ee76e523a90..fa10395e2a18 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -1529,10 +1529,8 @@ static int __init vmwgfx_init(void) { int ret; -#ifdef CONFIG_VGA_CONSOLE if (vgacon_text_force()) return -EINVAL; -#endif ret = drm_pci_init(&driver, &vmw_pci_driver); if (ret) diff --git a/include/linux/console.h b/include/linux/console.h index ea731af2451e..e49cc1ef19be 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -191,6 +191,8 @@ void vcs_remove_sysfs(int index); #ifdef CONFIG_VGA_CONSOLE extern bool vgacon_text_force(void); +#else +static inline bool vgacon_text_force(void) { return false; } #endif #endif /* _LINUX_CONSOLE_H */ -- cgit v1.2.3 From 8cb359e3a1f6318f971bec281623613f48b711be Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Wed, 23 Mar 2016 12:34:41 +0000 Subject: iio: buffer: add missing descriptions in iio_buffer_access_funcs The members buffer_group and attrs of iio_buffer_access_funcs have no descriptions for the documentation. Adding them. 
Fixes: 08e7e0adaa17 ("iio: buffer: Allocate standard attributes in the core") Signed-off-by: Luis de Bethencourt Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 2ec3ad58e8a0..70a5164f4728 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -83,10 +83,12 @@ struct iio_buffer_access_funcs { * @access: [DRIVER] buffer access functions associated with the * implementation. * @scan_el_dev_attr_list:[INTERN] list of scan element related attributes. + * @buffer_group: [INTERN] attributes of the buffer group * @scan_el_group: [DRIVER] attribute group for those attributes not * created from the iio_chan_info array. * @pollq: [INTERN] wait queue to allow for polling on the buffer. * @stufftoread: [INTERN] flag to indicate new data. + * @attrs: [INTERN] standard attributes of the buffer * @demux_list: [INTERN] list of operations required to demux the scan. * @demux_bounce: [INTERN] buffer for doing gather from incoming scan. * @buffer_list: [INTERN] entry in the devices list of current buffers. -- cgit v1.2.3 From fddcca5107051adf9e4481d2a79ae0616577fd2c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 29 Feb 2016 13:20:28 +0100 Subject: mtd: avoid stack overflow in MTD CFI code When map_word gets too large, we use a lot of kernel stack, and for MTD_MAP_BANK_WIDTH_32, this means we use more than the recommended 1024 bytes in a number of functions: drivers/mtd/chips/cfi_cmdset_0020.c: In function 'cfi_staa_write_buffers': drivers/mtd/chips/cfi_cmdset_0020.c:651:1: warning: the frame size of 1336 bytes is larger than 1024 bytes [-Wframe-larger-than=] drivers/mtd/chips/cfi_cmdset_0020.c: In function 'cfi_staa_erase_varsize': drivers/mtd/chips/cfi_cmdset_0020.c:972:1: warning: the frame size of 1208 bytes is larger than 1024 bytes [-Wframe-larger-than=] drivers/mtd/chips/cfi_cmdset_0001.c: In function 'do_write_buffer': drivers/mtd/chips/cfi_cmdset_0001.c:1835:1: warning: the frame size of 1240 bytes is larger than 1024 bytes [-Wframe-larger-than=] This can be avoided if all operations on the map word are done indirectly and the stack gets reused between the calls. We can mostly achieve this by selecting MTD_COMPLEX_MAPPINGS whenever MTD_MAP_BANK_WIDTH_32 is set, but for the case that no other bank width is enabled, we also need to use a non-constant map_bankwidth() to convince the compiler to use less stack. Signed-off-by: Arnd Bergmann [Brian: this patch mostly achieves its goal by forcing MTD_COMPLEX_MAPPINGS (and the accompanying indirection) for 256-bit mappings; the rest of the change is mostly a wash, though it helps reduce stack size slightly. If we really care about supporting 256-bit mappings though, we should consider rewriting some of this code to avoid keeping and assigning so many 256-bit objects on the stack.] 
Signed-off-by: Brian Norris --- drivers/mtd/chips/Kconfig | 1 + include/linux/mtd/map.h | 19 +++++++------------ 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/chips/Kconfig b/drivers/mtd/chips/Kconfig index 3b3dabce58de..bbfa1f129266 100644 --- a/drivers/mtd/chips/Kconfig +++ b/drivers/mtd/chips/Kconfig @@ -115,6 +115,7 @@ config MTD_MAP_BANK_WIDTH_16 config MTD_MAP_BANK_WIDTH_32 bool "Support 256-bit buswidth" if MTD_CFI_GEOMETRY + select MTD_COMPLEX_MAPPINGS if HAS_IOMEM default n help If you wish to support CFI devices on a physical bus which is diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h index 5e0eb7ccabd4..3aa56e3104bb 100644 --- a/include/linux/mtd/map.h +++ b/include/linux/mtd/map.h @@ -122,18 +122,13 @@ #endif #ifdef CONFIG_MTD_MAP_BANK_WIDTH_32 -# ifdef map_bankwidth -# undef map_bankwidth -# define map_bankwidth(map) ((map)->bankwidth) -# undef map_bankwidth_is_large -# define map_bankwidth_is_large(map) (map_bankwidth(map) > BITS_PER_LONG/8) -# undef map_words -# define map_words(map) map_calc_words(map) -# else -# define map_bankwidth(map) 32 -# define map_bankwidth_is_large(map) (1) -# define map_words(map) map_calc_words(map) -# endif +/* always use indirect access for 256-bit to preserve kernel stack */ +# undef map_bankwidth +# define map_bankwidth(map) ((map)->bankwidth) +# undef map_bankwidth_is_large +# define map_bankwidth_is_large(map) (map_bankwidth(map) > BITS_PER_LONG/8) +# undef map_words +# define map_words(map) map_calc_words(map) #define map_bankwidth_is_32(map) (map_bankwidth(map) == 32) #undef MAX_MAP_BANKWIDTH #define MAX_MAP_BANKWIDTH 32 -- cgit v1.2.3 From 5651d6aaf489d1db48c253cf884b40214e91c2c5 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 26 Feb 2016 11:50:28 +0100 Subject: mtd: bcm47xxsflash: use ioremap_cache() instead of KSEG0ADDR() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using KSEG0ADDR makes code highly MIPS dependent and not portable. Thanks to the fix a68f376 ("MIPS: io.h: Define `ioremap_cache'") we can use ioremap_cache, which is generic and now supported on MIPS as well. KSEG0ADDR was translating 0x1c000000 into 0x9c000000. With ioremap_cache we use MIPS's __ioremap (and then remap_area_pages). This results in a different address (e.g. 0xc0080000) but it still should be cached as expected and it was successfully tested with BCM47186B0. Other than that drivers/bcma/driver_chipcommon_sflash.c nicely sets up a struct resource for the access window, but we weren't using it. Use it now and drop duplicated info. 
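Condensed, the probe-time idiom this patch adopts looks like the sketch below (error messages trimmed; the complete version is in the bcm47xxsflash_bcma_probe() hunk that follows):

	struct resource *res;
	void __iomem *window;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);	/* set up by the bcma core */
	if (!res)
		return -EINVAL;
	if (!devm_request_mem_region(&pdev->dev, res->start,
				     resource_size(res), res->name))
		return -EBUSY;
	window = ioremap_cache(res->start, resource_size(res));	/* cached, portable */
	if (!window)
		return -ENOMEM;
	/* reads then go through memcpy_fromio(buf, window + from, len);
	 * iounmap(window) on remove and on the error path */

Unlike KSEG0ADDR(), which hardcodes a MIPS address-space translation, this honours the struct resource the bus driver already describes.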
Signed-off-by: Brian Norris Signed-off-by: Rafał Miłecki --- drivers/bcma/driver_chipcommon_sflash.c | 1 - drivers/mtd/devices/bcm47xxsflash.c | 29 ++++++++++++++++++++++++----- drivers/mtd/devices/bcm47xxsflash.h | 3 ++- include/linux/bcma/bcma_driver_chipcommon.h | 1 - 4 files changed, 26 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/driver_chipcommon_sflash.c b/drivers/bcma/driver_chipcommon_sflash.c index 04d706ca5f43..35b13a08ca3e 100644 --- a/drivers/bcma/driver_chipcommon_sflash.c +++ b/drivers/bcma/driver_chipcommon_sflash.c @@ -146,7 +146,6 @@ int bcma_sflash_init(struct bcma_drv_cc *cc) return -ENOTSUPP; } - sflash->window = BCMA_SOC_FLASH2; sflash->blocksize = e->blocksize; sflash->numblocks = e->numblocks; sflash->size = sflash->blocksize * sflash->numblocks; diff --git a/drivers/mtd/devices/bcm47xxsflash.c b/drivers/mtd/devices/bcm47xxsflash.c index 347bb83db864..1c65c15b31a1 100644 --- a/drivers/mtd/devices/bcm47xxsflash.c +++ b/drivers/mtd/devices/bcm47xxsflash.c @@ -2,6 +2,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/ioport.h> #include <linux/mtd/mtd.h> #include <linux/platform_device.h> #include <linux/bcma/bcma.h> @@ -109,8 +110,7 @@ static int bcm47xxsflash_read(struct mtd_info *mtd, loff_t from, size_t len, if ((from + len) > mtd->size) return -EINVAL; - memcpy_fromio(buf, (void __iomem *)KSEG0ADDR(b47s->window + from), - len); + memcpy_fromio(buf, b47s->window + from, len); *retlen = len; return len; @@ -275,15 +275,33 @@ static void bcm47xxsflash_bcma_cc_write(struct bcm47xxsflash *b47s, u16 offset, static int bcm47xxsflash_bcma_probe(struct platform_device *pdev) { - struct bcma_sflash *sflash = dev_get_platdata(&pdev->dev); + struct device *dev = &pdev->dev; + struct bcma_sflash *sflash = dev_get_platdata(dev); struct bcm47xxsflash *b47s; + struct resource *res; int err; - b47s = devm_kzalloc(&pdev->dev, sizeof(*b47s), GFP_KERNEL); + b47s = devm_kzalloc(dev, sizeof(*b47s), GFP_KERNEL); if (!b47s) return -ENOMEM; sflash->priv = b47s; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(dev, "invalid resource\n"); + return -EINVAL; + } + if (!devm_request_mem_region(dev, res->start, resource_size(res), + res->name)) { + dev_err(dev, "can't request region for resource %pR\n", res); + return -EBUSY; + } + b47s->window = ioremap_cache(res->start, resource_size(res)); + if (!b47s->window) { + dev_err(dev, "ioremap failed for resource %pR\n", res); + return -ENOMEM; + } + b47s->bcma_cc = container_of(sflash, struct bcma_drv_cc, sflash); b47s->cc_read = bcm47xxsflash_bcma_cc_read; b47s->cc_write = bcm47xxsflash_bcma_cc_write; @@ -297,7 +315,6 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev) break; } - b47s->window = sflash->window; b47s->blocksize = sflash->blocksize; b47s->numblocks = sflash->numblocks; b47s->size = sflash->size; @@ -306,6 +323,7 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev) err = mtd_device_parse_register(&b47s->mtd, probes, NULL, NULL, 0); if (err) { pr_err("Failed to register MTD device: %d\n", err); + iounmap(b47s->window); return err; } @@ -321,6 +339,7 @@ static int bcm47xxsflash_bcma_remove(struct platform_device *pdev) struct bcm47xxsflash *b47s = sflash->priv; mtd_device_unregister(&b47s->mtd); + iounmap(b47s->window); return 0; } diff --git a/drivers/mtd/devices/bcm47xxsflash.h b/drivers/mtd/devices/bcm47xxsflash.h index fe93daf4f489..1564b62b412e 100644 --- a/drivers/mtd/devices/bcm47xxsflash.h +++ b/drivers/mtd/devices/bcm47xxsflash.h @@ -65,7 +65,8 @@ struct bcm47xxsflash { enum bcm47xxsflash_type type; - u32 
window; + void __iomem *window; + u32 blocksize; u16 numblocks; u32 size; diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 846513c73606..a5ac2cad5cb7 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -587,7 +587,6 @@ struct mtd_info; struct bcma_sflash { bool present; - u32 window; u32 blocksize; u16 numblocks; u32 size; -- cgit v1.2.3 From 5b01e4b9efa0b78672cbbea830c9fbcc7f239e29 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 4 Apr 2016 11:43:54 +0200 Subject: libata: Implement NCQ autosense Some newer devices support NCQ autosense (cf ACS-4), so we should be using it to retrieve the sense code and speed up recovery. Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-eh.c | 12 ++++++++++++ drivers/ata/libata-scsi.c | 7 ++++++- drivers/ata/libata.h | 1 + include/linux/ata.h | 2 ++ 4 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 961acc788f44..8c8355f0792e 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -1600,6 +1600,8 @@ static int ata_eh_read_log_10h(struct ata_device *dev, tf->hob_lbah = buf[10]; tf->nsect = buf[12]; tf->hob_nsect = buf[13]; + if (ata_id_has_ncq_autosense(dev->id)) + tf->auxiliary = buf[14] << 16 | buf[15] << 8 | buf[16]; return 0; } @@ -1797,6 +1799,16 @@ void ata_eh_analyze_ncq_error(struct ata_link *link) memcpy(&qc->result_tf, &tf, sizeof(tf)); qc->result_tf.flags = ATA_TFLAG_ISADDR | ATA_TFLAG_LBA | ATA_TFLAG_LBA48; qc->err_mask |= AC_ERR_DEV | AC_ERR_NCQ; + if (qc->result_tf.auxiliary) { + char sense_key, asc, ascq; + + sense_key = (qc->result_tf.auxiliary >> 16) & 0xff; + asc = (qc->result_tf.auxiliary >> 8) & 0xff; + ascq = qc->result_tf.auxiliary & 0xff; + ata_scsi_set_sense(qc->scsicmd, sense_key, asc, ascq); + qc->flags |= ATA_QCFLAG_SENSE_VALID; + } + ehc->i.err_mask &= ~AC_ERR_DEV; } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 567859ce0512..6dc2fadfd7c5 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -270,8 +270,11 @@ DEVICE_ATTR(unload_heads, S_IRUGO | S_IWUSR, ata_scsi_park_show, ata_scsi_park_store); EXPORT_SYMBOL_GPL(dev_attr_unload_heads); -static void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq) +void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq) { + if (!cmd) + return; + cmd->result = (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION; scsi_build_sense_buffer(0, cmd->sense_buffer, sk, asc, ascq); @@ -1784,6 +1787,8 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc) if (((cdb[0] == ATA_16) || (cdb[0] == ATA_12)) && ((cdb[2] & 0x20) || need_sense)) ata_gen_passthru_sense(qc); + else if (qc->flags & ATA_QCFLAG_SENSE_VALID) + cmd->result = SAM_STAT_CHECK_CONDITION; else if (need_sense) ata_gen_ata_sense(qc); else diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index f840ca18a7c0..8cfdd9616d16 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -137,6 +137,7 @@ extern int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht); extern void ata_scsi_scan_host(struct ata_port *ap, int sync); extern int ata_scsi_offline_dev(struct ata_device *dev); +extern void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq); extern void ata_scsi_media_change_notify(struct ata_device *dev); extern void ata_scsi_hotplug(struct work_struct 
*work); extern void ata_schedule_scsi_eh(struct Scsi_Host *shost); diff --git a/include/linux/ata.h b/include/linux/ata.h index c1a2f345cbe6..e797e1b53006 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -528,6 +528,8 @@ struct ata_bmdma_prd { #define ata_id_cdb_intr(id) (((id)[ATA_ID_CONFIG] & 0x60) == 0x20) #define ata_id_has_da(id) ((id)[ATA_ID_SATA_CAPABILITY_2] & (1 << 4)) #define ata_id_has_devslp(id) ((id)[ATA_ID_FEATURE_SUPP] & (1 << 8)) +#define ata_id_has_ncq_autosense(id) \ + ((id)[ATA_ID_FEATURE_SUPP] & (1 << 7)) static inline bool ata_id_has_hipm(const u16 *id) { -- cgit v1.2.3 From e87fd28cf9a2d9018ac4b6dd92f0b417714bc18d Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 4 Apr 2016 11:43:55 +0200 Subject: libata: Implement support for sense data reporting ACS-4 defines a sense data reporting feature set. This patch implements support for it. tj: Cosmetic formatting updates. Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 20 +++++++++++++- drivers/ata/libata-eh.c | 68 ++++++++++++++++++++++++++++++++++++++++++++--- include/linux/ata.h | 16 +++++++++++ 3 files changed, 99 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 55e257c268dd..f991f786227e 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2148,6 +2148,24 @@ static int ata_dev_config_ncq(struct ata_device *dev, return 0; } +static void ata_dev_config_sense_reporting(struct ata_device *dev) +{ + unsigned int err_mask; + + if (!ata_id_has_sense_reporting(dev->id)) + return; + + if (ata_id_sense_reporting_enabled(dev->id)) + return; + + err_mask = ata_dev_set_feature(dev, SETFEATURE_SENSE_DATA, 0x1); + if (err_mask) { + ata_dev_dbg(dev, + "failed to enable Sense Data Reporting, Emask 0x%x\n", + err_mask); + } +} + /** * ata_dev_configure - Configure the specified ATA/ATAPI device * @dev: Target device to configure @@ -2370,7 +2388,7 @@ int ata_dev_configure(struct ata_device *dev) dev->devslp_timing[i] = sata_setting[j]; } } - + ata_dev_config_sense_reporting(dev); dev->cdb_len = 16; } diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 8c8355f0792e..170e891e79af 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -1637,6 +1637,56 @@ unsigned int atapi_eh_tur(struct ata_device *dev, u8 *r_sense_key) return err_mask; } +/** + * ata_eh_request_sense - perform REQUEST_SENSE_DATA_EXT + * @dev: device to perform REQUEST_SENSE_SENSE_DATA_EXT to + * @cmd: scsi command for which the sense code should be set + * + * Perform REQUEST_SENSE_DATA_EXT after the device reported CHECK + * SENSE. This function is an EH helper. + * + * LOCKING: + * Kernel thread context (may sleep). 
+ */ +static void ata_eh_request_sense(struct ata_queued_cmd *qc, + struct scsi_cmnd *cmd) +{ + struct ata_device *dev = qc->dev; + struct ata_taskfile tf; + unsigned int err_mask; + + if (qc->ap->pflags & ATA_PFLAG_FROZEN) { + ata_dev_warn(dev, "sense data available but port frozen\n"); + return; + } + + if (!cmd) + return; + + if (!ata_id_sense_reporting_enabled(dev->id)) { + ata_dev_warn(qc->dev, "sense data reporting disabled\n"); + return; + } + + DPRINTK("ATA request sense\n"); + + ata_tf_init(dev, &tf); + tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE; + tf.flags |= ATA_TFLAG_LBA | ATA_TFLAG_LBA48; + tf.command = ATA_CMD_REQ_SENSE_DATA; + tf.protocol = ATA_PROT_NODATA; + + err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); + /* Ignore err_mask; ATA_ERR might be set */ + if (tf.command & ATA_SENSE) { + ata_scsi_set_sense(cmd, tf.lbah, tf.lbam, tf.lbal); + qc->flags |= ATA_QCFLAG_SENSE_VALID; + } else { + ata_dev_warn(dev, "request sense failed stat %02x emask %x\n", + tf.command, err_mask); + } +} + /** * atapi_eh_request_sense - perform ATAPI REQUEST_SENSE * @dev: device to perform REQUEST_SENSE to @@ -1838,14 +1888,23 @@ static unsigned int ata_eh_analyze_tf(struct ata_queued_cmd *qc, return ATA_EH_RESET; } - if (stat & (ATA_ERR | ATA_DF)) + if (stat & (ATA_ERR | ATA_DF)) { qc->err_mask |= AC_ERR_DEV; - else + /* + * Sense data reporting does not work if the + * device fault bit is set. + */ + if (stat & ATA_DF) + stat &= ~ATA_SENSE; + } else { return 0; + } switch (qc->dev->class) { case ATA_DEV_ATA: case ATA_DEV_ZAC: + if (stat & ATA_SENSE) + ata_eh_request_sense(qc, qc->scsicmd); if (err & ATA_ICRC) qc->err_mask |= AC_ERR_ATA_BUS; if (err & (ATA_UNC | ATA_AMNF)) @@ -2581,14 +2640,15 @@ static void ata_eh_link_report(struct ata_link *link) #ifdef CONFIG_ATA_VERBOSE_ERROR if (res->command & (ATA_BUSY | ATA_DRDY | ATA_DF | ATA_DRQ | - ATA_ERR)) { + ATA_SENSE | ATA_ERR)) { if (res->command & ATA_BUSY) ata_dev_err(qc->dev, "status: { Busy }\n"); else - ata_dev_err(qc->dev, "status: { %s%s%s%s}\n", + ata_dev_err(qc->dev, "status: { %s%s%s%s%s}\n", res->command & ATA_DRDY ? "DRDY " : "", res->command & ATA_DF ? "DF " : "", res->command & ATA_DRQ ? "DRQ " : "", + res->command & ATA_SENSE ? "SENSE " : "", res->command & ATA_ERR ? 
"ERR " : ""); } diff --git a/include/linux/ata.h b/include/linux/ata.h index e797e1b53006..00aebc4c83ad 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -385,6 +385,8 @@ enum { SATA_SSP = 0x06, /* Software Settings Preservation */ SATA_DEVSLP = 0x09, /* Device Sleep */ + SETFEATURE_SENSE_DATA = 0xC3, /* Sense Data Reporting feature */ + /* feature values for SET_MAX */ ATA_SET_MAX_ADDR = 0x00, ATA_SET_MAX_PASSWD = 0x01, @@ -718,6 +720,20 @@ static inline bool ata_id_has_read_log_dma_ext(const u16 *id) return false; } +static inline bool ata_id_has_sense_reporting(const u16 *id) +{ + if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15))) + return false; + return id[ATA_ID_COMMAND_SET_3] & (1 << 6); +} + +static inline bool ata_id_sense_reporting_enabled(const u16 *id) +{ + if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15))) + return false; + return id[ATA_ID_COMMAND_SET_4] & (1 << 6); +} + /** * ata_id_major_version - get ATA level of drive * @id: Identify data -- cgit v1.2.3 From 06dbde5f3a44248fc02e24d662ac4849202abb48 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 4 Apr 2016 11:44:03 +0200 Subject: libata: Implement control mode page to select sense format Implement MODE SELECT for the control mode page to allow the OS to switch to descriptor sense. tj: Dropped s/sb/cmd->sense_buffer/ in ata_gen_ata_sense(). Added @dev description to ata_msense_ctl_mode(). Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-eh.c | 4 +- drivers/ata/libata-scsi.c | 116 ++++++++++++++++++++++++++++++++++------------ drivers/ata/libata.h | 3 +- include/linux/libata.h | 1 + 4 files changed, 91 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index e37258b78e01..5b340ce4eeac 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -1679,7 +1679,7 @@ static void ata_eh_request_sense(struct ata_queued_cmd *qc, err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); /* Ignore err_mask; ATA_ERR might be set */ if (tf.command & ATA_SENSE) { - ata_scsi_set_sense(cmd, tf.lbah, tf.lbam, tf.lbal); + ata_scsi_set_sense(dev, cmd, tf.lbah, tf.lbam, tf.lbal); qc->flags |= ATA_QCFLAG_SENSE_VALID; } else { ata_dev_warn(dev, "request sense failed stat %02x emask %x\n", @@ -1855,7 +1855,7 @@ void ata_eh_analyze_ncq_error(struct ata_link *link) sense_key = (qc->result_tf.auxiliary >> 16) & 0xff; asc = (qc->result_tf.auxiliary >> 8) & 0xff; ascq = qc->result_tf.auxiliary & 0xff; - ata_scsi_set_sense(qc->scsicmd, sense_key, asc, ascq); + ata_scsi_set_sense(dev, qc->scsicmd, sense_key, asc, ascq); ata_scsi_set_sense_information(dev, qc->scsicmd, &qc->result_tf); qc->flags |= ATA_QCFLAG_SENSE_VALID; diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 0da03c019f27..2389247bdf6f 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -270,14 +270,17 @@ DEVICE_ATTR(unload_heads, S_IRUGO | S_IWUSR, ata_scsi_park_show, ata_scsi_park_store); EXPORT_SYMBOL_GPL(dev_attr_unload_heads); -void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq) +void ata_scsi_set_sense(struct ata_device *dev, struct scsi_cmnd *cmd, + u8 sk, u8 asc, u8 ascq) { + bool d_sense = (dev->flags & ATA_DFLAG_D_SENSE); + if (!cmd) return; cmd->result = (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION; - scsi_build_sense_buffer(0, cmd->sense_buffer, sk, asc, ascq); + scsi_build_sense_buffer(d_sense, cmd->sense_buffer, sk, asc, ascq); } void ata_scsi_set_sense_information(struct ata_device 
*dev, @@ -384,9 +387,10 @@ struct device_attribute *ata_common_sdev_attrs[] = { }; EXPORT_SYMBOL_GPL(ata_common_sdev_attrs); -static void ata_scsi_invalid_field(struct scsi_cmnd *cmd) +static void ata_scsi_invalid_field(struct ata_device *dev, + struct scsi_cmnd *cmd) { - ata_scsi_set_sense(cmd, ILLEGAL_REQUEST, 0x24, 0x0); + ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST, 0x24, 0x0); /* "Invalid field in cbd" */ cmd->scsi_done(cmd); } @@ -1014,7 +1018,7 @@ static void ata_gen_passthru_sense(struct ata_queued_cmd *qc) tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ)) { ata_to_sense_error(qc->ap->print_id, tf->command, tf->feature, &sense_key, &asc, &ascq, verbose); - ata_scsi_set_sense(cmd, sense_key, asc, ascq); + ata_scsi_set_sense(qc->dev, cmd, sense_key, asc, ascq); } else { /* * ATA PASS-THROUGH INFORMATION AVAILABLE @@ -1112,12 +1116,12 @@ static void ata_gen_ata_sense(struct ata_queued_cmd *qc) tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ)) { ata_to_sense_error(qc->ap->print_id, tf->command, tf->feature, &sense_key, &asc, &ascq, verbose); - ata_scsi_set_sense(cmd, sense_key, asc, ascq); + ata_scsi_set_sense(dev, cmd, sense_key, asc, ascq); } else { /* Could not decode error */ ata_dev_warn(dev, "could not decode error status 0x%x err_mask 0x%x\n", tf->command, qc->err_mask); - ata_scsi_set_sense(cmd, ABORTED_COMMAND, 0, 0); + ata_scsi_set_sense(dev, cmd, ABORTED_COMMAND, 0, 0); return; } @@ -1440,7 +1444,7 @@ static unsigned int ata_scsi_start_stop_xlat(struct ata_queued_cmd *qc) return 0; invalid_fld: - ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x0); + ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x24, 0x0); /* "Invalid field in cbd" */ return 1; skip: @@ -1679,12 +1683,12 @@ static unsigned int ata_scsi_verify_xlat(struct ata_queued_cmd *qc) return 0; invalid_fld: - ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x0); + ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x24, 0x0); /* "Invalid field in cbd" */ return 1; out_of_range: - ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x21, 0x0); + ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x21, 0x0); /* "Logical Block Address out of range" */ return 1; @@ -1781,12 +1785,12 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc) goto out_of_range; /* treat all other errors as -EINVAL, fall through */ invalid_fld: - ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x0); + ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x24, 0x0); /* "Invalid field in cbd" */ return 1; out_of_range: - ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x21, 0x0); + ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x21, 0x0); /* "Logical Block Address out of range" */ return 1; @@ -2358,6 +2362,7 @@ static unsigned int ata_msense_caching(u16 *id, u8 *buf, bool changeable) /** * ata_msense_ctl_mode - Simulate MODE SENSE control mode page + * @dev: ATA device of interest * @buf: output buffer * @changeable: whether changeable parameters are requested * @@ -2366,9 +2371,12 @@ static unsigned int ata_msense_caching(u16 *id, u8 *buf, bool changeable) * LOCKING: * None. 
*/ -static unsigned int ata_msense_ctl_mode(u8 *buf, bool changeable) +static unsigned int ata_msense_ctl_mode(struct ata_device *dev, u8 *buf, + bool changeable) { modecpy(buf, def_control_mpage, sizeof(def_control_mpage), changeable); + if (changeable && (dev->flags & ATA_DFLAG_D_SENSE)) + buf[2] |= (1 << 2); /* Descriptor sense requested */ return sizeof(def_control_mpage); } @@ -2482,13 +2490,13 @@ static unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf) break; case CONTROL_MPAGE: - p += ata_msense_ctl_mode(p, page_control == 1); + p += ata_msense_ctl_mode(args->dev, p, page_control == 1); break; case ALL_MPAGES: p += ata_msense_rw_recovery(p, page_control == 1); p += ata_msense_caching(args->id, p, page_control == 1); - p += ata_msense_ctl_mode(p, page_control == 1); + p += ata_msense_ctl_mode(args->dev, p, page_control == 1); break; default: /* invalid page code */ @@ -2521,12 +2529,12 @@ static unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf) return 0; invalid_fld: - ata_scsi_set_sense(args->cmd, ILLEGAL_REQUEST, 0x24, 0x0); + ata_scsi_set_sense(dev, args->cmd, ILLEGAL_REQUEST, 0x24, 0x0); /* "Invalid field in cbd" */ return 1; saving_not_supp: - ata_scsi_set_sense(args->cmd, ILLEGAL_REQUEST, 0x39, 0x0); + ata_scsi_set_sense(dev, args->cmd, ILLEGAL_REQUEST, 0x39, 0x0); /* "Saving parameters not supported" */ return 1; } @@ -3163,7 +3171,7 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc) return 0; invalid_fld: - ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x00); + ata_scsi_set_sense(dev, scmd, ILLEGAL_REQUEST, 0x24, 0x00); /* "Invalid field in cdb" */ return 1; } @@ -3228,7 +3236,7 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc) return 0; invalid_fld: - ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x00); + ata_scsi_set_sense(dev, scmd, ILLEGAL_REQUEST, 0x24, 0x00); /* "Invalid field in cdb" */ return 1; } @@ -3279,6 +3287,51 @@ static int ata_mselect_caching(struct ata_queued_cmd *qc, return 0; } +/** + * ata_mselect_control - Simulate MODE SELECT for control page + * @qc: Storage for translated ATA taskfile + * @buf: input buffer + * @len: number of valid bytes in the input buffer + * + * Prepare a taskfile to modify caching information for the device. + * + * LOCKING: + * None. + */ +static int ata_mselect_control(struct ata_queued_cmd *qc, + const u8 *buf, int len) +{ + struct ata_device *dev = qc->dev; + char mpage[CONTROL_MPAGE_LEN]; + u8 d_sense; + + /* + * The first two bytes of def_control_mpage are a header, so offsets + * in mpage are off by 2 compared to buf. Same for len. + */ + + if (len != CONTROL_MPAGE_LEN - 2) + return -EINVAL; + + d_sense = buf[0] & (1 << 2); + + /* + * Check that read-only bits are not modified. 
+ */ + ata_msense_ctl_mode(dev, mpage, false); + mpage[2] &= ~(1 << 2); + mpage[2] |= d_sense; + if (memcmp(mpage + 2, buf, CONTROL_MPAGE_LEN - 2) != 0) + return -EINVAL; + if (d_sense & (1 << 2)) + dev->flags |= ATA_DFLAG_D_SENSE; + else + dev->flags &= ~ATA_DFLAG_D_SENSE; + qc->scsicmd->result = SAM_STAT_GOOD; + qc->scsicmd->scsi_done(qc->scsicmd); + return 0; +} + /** * ata_scsiop_mode_select - Simulate MODE SELECT 6, 10 commands * @qc: Storage for translated ATA taskfile @@ -3381,7 +3434,10 @@ static unsigned int ata_scsi_mode_select_xlat(struct ata_queued_cmd *qc) if (ata_mselect_caching(qc, p, pg_len) < 0) goto invalid_param; break; - + case CONTROL_MPAGE: + if (ata_mselect_control(qc, p, pg_len) < 0) + goto invalid_param; + break; default: /* invalid page code */ goto invalid_param; } @@ -3397,17 +3453,17 @@ static unsigned int ata_scsi_mode_select_xlat(struct ata_queued_cmd *qc) invalid_fld: /* "Invalid field in CDB" */ - ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x0); + ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x24, 0x0); return 1; invalid_param: /* "Invalid field in parameter list" */ - ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x26, 0x0); + ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x26, 0x0); return 1; invalid_param_len: /* "Parameter list length error" */ - ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x1a, 0x0); + ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0); return 1; skip: @@ -3611,12 +3667,12 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) switch(scsicmd[0]) { /* TODO: worth improving? */ case FORMAT_UNIT: - ata_scsi_invalid_field(cmd); + ata_scsi_invalid_field(dev, cmd); break; case INQUIRY: if (scsicmd[1] & 2) /* is CmdDt set? */ - ata_scsi_invalid_field(cmd); + ata_scsi_invalid_field(dev, cmd); else if ((scsicmd[1] & 1) == 0) /* is EVPD clear? 
*/ ata_scsi_rbuf_fill(&args, ata_scsiop_inq_std); else switch (scsicmd[2]) { @@ -3642,7 +3698,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b2); break; default: - ata_scsi_invalid_field(cmd); + ata_scsi_invalid_field(dev, cmd); break; } break; @@ -3660,7 +3716,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) if ((scsicmd[1] & 0x1f) == SAI_READ_CAPACITY_16) ata_scsi_rbuf_fill(&args, ata_scsiop_read_cap); else - ata_scsi_invalid_field(cmd); + ata_scsi_invalid_field(dev, cmd); break; case REPORT_LUNS: @@ -3668,7 +3724,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) break; case REQUEST_SENSE: - ata_scsi_set_sense(cmd, 0, 0, 0); + ata_scsi_set_sense(dev, cmd, 0, 0, 0); cmd->result = (DRIVER_SENSE << 24); cmd->scsi_done(cmd); break; @@ -3692,12 +3748,12 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) if ((tmp8 == 0x4) && (!scsicmd[3]) && (!scsicmd[4])) ata_scsi_rbuf_fill(&args, ata_scsiop_noop); else - ata_scsi_invalid_field(cmd); + ata_scsi_invalid_field(dev, cmd); break; /* all other commands */ default: - ata_scsi_set_sense(cmd, ILLEGAL_REQUEST, 0x20, 0x0); + ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST, 0x20, 0x0); /* "Invalid command operation code" */ cmd->scsi_done(cmd); break; diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index dbc67604b3c5..3b301a48007c 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -138,7 +138,8 @@ extern int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht); extern void ata_scsi_scan_host(struct ata_port *ap, int sync); extern int ata_scsi_offline_dev(struct ata_device *dev); -extern void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq); +extern void ata_scsi_set_sense(struct ata_device *dev, + struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq); extern void ata_scsi_set_sense_information(struct ata_device *dev, struct scsi_cmnd *cmd, const struct ata_taskfile *tf); diff --git a/include/linux/libata.h b/include/linux/libata.h index 2c4ebef79d0c..a418bca0df0d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -180,6 +180,7 @@ enum { ATA_DFLAG_DA = (1 << 26), /* device supports Device Attention */ ATA_DFLAG_DEVSLP = (1 << 27), /* device supports Device Sleep */ ATA_DFLAG_ACPI_DISABLED = (1 << 28), /* ACPI for the device is disabled */ + ATA_DFLAG_D_SENSE = (1 << 29), /* Descriptor sense requested */ ATA_DEV_UNKNOWN = 0, /* unknown device */ ATA_DEV_ATA = 1, /* ATA device */ -- cgit v1.2.3 From 77ed2c5745d93416992857d124f35834b62b3e70 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 8 Mar 2016 20:01:32 +0900 Subject: android,lowmemorykiller: Don't abuse TIF_MEMDIE. Currently, lowmemorykiller (LMK) is using TIF_MEMDIE for two purposes. One is to remember processes killed by LMK, and the other is to accelerate termination of processes killed by LMK. But since LMK is invoked as a memory shrinker function, there still should be some memory available. It is very likely that memory allocations by processes killed by LMK will succeed without using ALLOC_NO_WATERMARKS via TIF_MEMDIE. Even if their allocations cannot escape from the memory allocation loop unless they use ALLOC_NO_WATERMARKS, lowmem_deathpending_timeout can guarantee forward progress by choosing the next victim process. On the other hand, mark_oom_victim() assumes that it must be called with oom_lock held and it must not be called after oom_killer_disable() was called. But LMK is calling it without holding oom_lock or checking oom_killer_disabled. 
But LMK is calling it without holding oom_lock and checking oom_killer_disabled. It is possible that LMK calls mark_oom_victim() due to allocation requests by kernel threads after current thread returned from oom_killer_disabled(). This will break synchronization for PM/suspend. This patch introduces per a task_struct flag for remembering processes killed by LMK, and replaces TIF_MEMDIE with that flag. By applying this patch, assumption by mark_oom_victim() becomes true. Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Arve Hjonnevag Cc: Riley Andrews Signed-off-by: Greg Kroah-Hartman --- drivers/staging/android/lowmemorykiller.c | 9 ++------- include/linux/sched.h | 4 ++++ 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 2509e5df7244..c79f22425fa8 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -131,7 +131,7 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) if (!p) continue; - if (test_tsk_thread_flag(p, TIF_MEMDIE) && + if (task_lmk_waiting(p) && p->mm && time_before_eq(jiffies, lowmem_deathpending_timeout)) { task_unlock(p); rcu_read_unlock(); @@ -162,13 +162,8 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) if (selected) { task_lock(selected); send_sig(SIGKILL, selected, 0); - /* - * FIXME: lowmemorykiller shouldn't abuse global OOM killer - * infrastructure. There is no real reason why the selected - * task should have access to the memory reserves. - */ if (selected->mm) - mark_oom_victim(selected); + task_set_lmk_waiting(selected); task_unlock(selected); lowmem_print(1, "Killing '%s' (%d), adj %hd,\n" " to free %ldkB on behalf of '%s' (%d) because\n" diff --git a/include/linux/sched.h b/include/linux/sched.h index 60bba7e032dc..9dff190e6a0a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2184,6 +2184,7 @@ static inline void memalloc_noio_restore(unsigned int flags) #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ +#define PFA_LMK_WAITING 3 /* Lowmemorykiller is waiting */ #define TASK_PFA_TEST(name, func) \ @@ -2207,6 +2208,9 @@ TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) +TASK_PFA_TEST(LMK_WAITING, lmk_waiting) +TASK_PFA_SET(LMK_WAITING, lmk_waiting) + /* * task->jobctl flags */ -- cgit v1.2.3 From 4da56b99d99e5a7df2b7f11e87bfea935f909732 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 4 Apr 2016 14:46:42 +0100 Subject: mm/vmap: Add a notifier for when we run out of vmap address space vmaps are temporary kernel mappings that may be of long duration. Reusing a vmap on an object is preferrable for a driver as the cost of setting up the vmap can otherwise dominate the operation on the object. However, the vmap address space is rather limited on 32bit systems and so we add a notification for vmap pressure in order for the driver to release any cached vmappings. The interface is styled after the oom-notifier where the callees are passed a pointer to an unsigned long counter for them to indicate if they have freed any space. 
v2: Guard the blocking notifier call with gfpflags_allow_blocking() v3: Correct typo in forward declaration and move to head of file Signed-off-by: Chris Wilson Cc: Andrew Morton Cc: David Rientjes Cc: Roman Peniaev Cc: Mel Gorman Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Acked-by: Andrew Morton # for inclusion via DRM Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Link: http://patchwork.freedesktop.org/patch/msgid/1459777603-23618-3-git-send-email-chris@chris-wilson.co.uk Reviewed-by: Joonas Lahtinen --- include/linux/vmalloc.h | 4 ++++ mm/vmalloc.c | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index d1f1d338af20..8b51df3ab334 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -8,6 +8,7 @@ #include struct vm_area_struct; /* vma defining user mapping in mm_types.h */ +struct notifier_block; /* in notifier.h */ /* bits in flags of vmalloc's vm_struct below */ #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ @@ -187,4 +188,7 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #define VMALLOC_TOTAL 0UL #endif +int register_vmap_purge_notifier(struct notifier_block *nb); +int unregister_vmap_purge_notifier(struct notifier_block *nb); + #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fb42a5bffe47..12d27ac303ae 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -344,6 +345,8 @@ static void __insert_vmap_area(struct vmap_area *va) static void purge_vmap_area_lazy(void); +static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); + /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. @@ -363,6 +366,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); + va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!va)) @@ -468,6 +473,16 @@ overflow: purged = 1; goto retry; } + + if (gfpflags_allow_blocking(gfp_mask)) { + unsigned long freed = 0; + blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); + if (freed > 0) { + purged = 0; + goto retry; + } + } + if (printk_ratelimit()) pr_warn("vmap allocation for size %lu failed: " "use vmalloc= to increase size.\n", size); @@ -475,6 +490,18 @@ overflow: return ERR_PTR(-EBUSY); } +int register_vmap_purge_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&vmap_notify_list, nb); +} +EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); + +int unregister_vmap_purge_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&vmap_notify_list, nb); +} +EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); + static void __free_vmap_area(struct vmap_area *va) { BUG_ON(RB_EMPTY_NODE(&va->rb_node)); -- cgit v1.2.3 From f9cd476123ced488e628339becedb2cf3243a58a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 4 Apr 2016 22:44:59 +0200 Subject: dmaengine: pl08x: allocate OF slave channel data at probe time The current OF translation of channels can never work with any DMA client using the DMA channels directly: the only way to get the channels initialized properly is in the dma_async_device_register() call, where chan->dev etc is allocated and initialized. 
Allocate and initialize all possible DMA channels and only augment a target channel with the periph_buses at of_xlate(). Remove some const settings to make things work. Cc: Maxime Ripard Tested-by: Joachim Eastwood Tested-by: Johannes Stezenbach Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- drivers/dma/amba-pl08x.c | 86 +++++++++++++++++++++++++++++++--------------- include/linux/amba/pl08x.h | 2 +- 2 files changed, 59 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 9b42c0588550..81db1c4811ce 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -107,16 +107,20 @@ struct pl08x_driver_data; /** * struct vendor_data - vendor-specific config parameters for PL08x derivatives * @channels: the number of channels available in this variant + * @signals: the number of request signals available from the hardware * @dualmaster: whether this version supports dual AHB masters or not. * @nomadik: whether the channels have Nomadik security extension bits * that need to be checked for permission before use and some registers are * missing * @pl080s: whether this version is a PL080S, which has separate register and * LLI word for transfer size. + * @max_transfer_size: the maximum single element transfer size for this + * PL08x variant. */ struct vendor_data { u8 config_offset; u8 channels; + u8 signals; bool dualmaster; bool nomadik; bool pl080s; @@ -235,7 +239,7 @@ struct pl08x_dma_chan { struct virt_dma_chan vc; struct pl08x_phy_chan *phychan; const char *name; - const struct pl08x_channel_data *cd; + struct pl08x_channel_data *cd; struct dma_slave_config cfg; struct pl08x_txd *at; struct pl08x_driver_data *host; @@ -1909,6 +1913,12 @@ static int pl08x_dma_init_virtual_channels(struct pl08x_driver_data *pl08x, if (slave) { chan->cd = &pl08x->pd->slave_channels[i]; + /* + * Some implementations have muxed signals, whereas some + * use a mux in front of the signals and need dynamic + * assignment of signals. 
+ */ + chan->signal = i; pl08x_dma_slave_init(chan); } else { chan->cd = &pl08x->pd->memcpy_channel; @@ -2050,40 +2060,33 @@ static struct dma_chan *pl08x_of_xlate(struct of_phandle_args *dma_spec, struct of_dma *ofdma) { struct pl08x_driver_data *pl08x = ofdma->of_dma_data; - struct pl08x_channel_data *data; - struct pl08x_dma_chan *chan; struct dma_chan *dma_chan; + struct pl08x_dma_chan *plchan; if (!pl08x) return NULL; - if (dma_spec->args_count != 2) + if (dma_spec->args_count != 2) { + dev_err(&pl08x->adev->dev, + "DMA channel translation requires two cells\n"); return NULL; + } dma_chan = pl08x_find_chan_id(pl08x, dma_spec->args[0]); - if (dma_chan) - return dma_get_slave_channel(dma_chan); - - chan = devm_kzalloc(pl08x->slave.dev, sizeof(*chan) + sizeof(*data), - GFP_KERNEL); - if (!chan) + if (!dma_chan) { + dev_err(&pl08x->adev->dev, + "DMA slave channel not found\n"); return NULL; + } - data = (void *)&chan[1]; - data->bus_id = "(none)"; - data->periph_buses = dma_spec->args[1]; - - chan->cd = data; - chan->host = pl08x; - chan->slave = true; - chan->name = data->bus_id; - chan->state = PL08X_CHAN_IDLE; - chan->signal = dma_spec->args[0]; - chan->vc.desc_free = pl08x_desc_free; - - vchan_init(&chan->vc, &pl08x->slave); + plchan = to_pl08x_chan(dma_chan); + dev_dbg(&pl08x->adev->dev, + "translated channel for signal %d\n", + dma_spec->args[0]); - return dma_get_slave_channel(&chan->vc.chan); + /* Augment channel data for applicable AHB buses */ + plchan->cd->periph_buses = dma_spec->args[1]; + return dma_get_slave_channel(dma_chan); } static int pl08x_of_probe(struct amba_device *adev, @@ -2091,9 +2094,11 @@ static int pl08x_of_probe(struct amba_device *adev, struct device_node *np) { struct pl08x_platform_data *pd; + struct pl08x_channel_data *chanp = NULL; u32 cctl_memcpy = 0; u32 val; int ret; + int i; pd = devm_kzalloc(&adev->dev, sizeof(*pd), GFP_KERNEL); if (!pd) @@ -2195,6 +2200,27 @@ static int pl08x_of_probe(struct amba_device *adev, /* Use the buses that can access memory, obviously */ pd->memcpy_channel.periph_buses = pd->mem_buses; + /* + * Allocate channel data for all possible slave channels (one + * for each possible signal), channels will then be allocated + * for a device and have it's AHB interfaces set up at + * translation time. + */ + chanp = devm_kcalloc(&adev->dev, + pl08x->vd->signals, + sizeof(struct pl08x_channel_data), + GFP_KERNEL); + if (!chanp) + return -ENOMEM; + + pd->slave_channels = chanp; + for (i = 0; i < pl08x->vd->signals; i++) { + /* chanp->periph_buses will be assigned at translation */ + chanp->bus_id = kasprintf(GFP_KERNEL, "slave%d", i); + chanp++; + } + pd->num_slave_channels = pl08x->vd->signals; + pl08x->pd = pd; return of_dma_controller_register(adev->dev.of_node, pl08x_of_xlate, @@ -2234,6 +2260,10 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) goto out_no_pl08x; } + /* Assign useful pointers to the driver state */ + pl08x->adev = adev; + pl08x->vd = vd; + /* Initialize memcpy engine */ dma_cap_set(DMA_MEMCPY, pl08x->memcpy.cap_mask); pl08x->memcpy.dev = &adev->dev; @@ -2284,10 +2314,6 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) } } - /* Assign useful pointers to the driver state */ - pl08x->adev = adev; - pl08x->vd = vd; - /* By default, AHB1 only. 
If dualmaster, from platform */
	pl08x->lli_buses = PL08X_AHB1;
	pl08x->mem_buses = PL08X_AHB1;
@@ -2438,6 +2464,7 @@ out_no_pl08x:
 static struct vendor_data vendor_pl080 = {
	.config_offset = PL080_CH_CONFIG,
	.channels = 8,
+	.signals = 16,
	.dualmaster = true,
	.max_transfer_size = PL080_CONTROL_TRANSFER_SIZE_MASK,
 };
@@ -2445,6 +2472,7 @@ static struct vendor_data vendor_pl080 = {
 static struct vendor_data vendor_nomadik = {
	.config_offset = PL080_CH_CONFIG,
	.channels = 8,
+	.signals = 32,
	.dualmaster = true,
	.nomadik = true,
	.max_transfer_size = PL080_CONTROL_TRANSFER_SIZE_MASK,
 };
@@ -2453,6 +2481,7 @@ static struct vendor_data vendor_nomadik = {
 static struct vendor_data vendor_pl080s = {
	.config_offset = PL080S_CH_CONFIG,
	.channels = 8,
+	.signals = 32,
	.pl080s = true,
	.max_transfer_size = PL080S_CONTROL_TRANSFER_SIZE_MASK,
 };
@@ -2460,6 +2489,7 @@ static struct vendor_data vendor_pl080s = {
 static struct vendor_data vendor_pl081 = {
	.config_offset = PL080_CH_CONFIG,
	.channels = 2,
+	.signals = 16,
	.dualmaster = false,
	.max_transfer_size = PL080_CONTROL_TRANSFER_SIZE_MASK,
 };
diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h
index 10fe2a211c2e..27e9ec8778eb 100644
--- a/include/linux/amba/pl08x.h
+++ b/include/linux/amba/pl08x.h
@@ -86,7 +86,7 @@ struct pl08x_channel_data {
 * @mem_buses: buses which memory can be accessed from: PL08X_AHB1 | PL08X_AHB2
 */
 struct pl08x_platform_data {
-	const struct pl08x_channel_data *slave_channels;
+	struct pl08x_channel_data *slave_channels;
	unsigned int num_slave_channels;
	struct pl08x_channel_data memcpy_channel;
	int (*get_xfer_signal)(const struct pl08x_channel_data *);
--
cgit v1.2.3

From fa59507f720077a856c9952a31cfd45cd97ef6f9 Mon Sep 17 00:00:00 2001
From: Roger Quadros
Date: Wed, 30 Mar 2016 12:56:28 +0300
Subject: usb: otg-fsm: Add documentation for struct otg_fsm

struct otg_fsm is the interface to the OTG state machine. Document the
input, output and internal state variables. Definitions are taken from
Table 7-2 and Table 7-4 of the USB OTG & EH Specification Rev. 2.0.

Re-arrange some of the members by use case for more clarity.

Signed-off-by: Roger Quadros
Acked-by: Peter Chen
Signed-off-by: Peter Chen
---
 include/linux/usb/otg-fsm.h | 90 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 83 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h
index 24198e16f849..8eec0c261be5 100644
--- a/include/linux/usb/otg-fsm.h
+++ b/include/linux/usb/otg-fsm.h
@@ -72,37 +72,113 @@ enum otg_fsm_timer {
	NUM_OTG_FSM_TIMERS,
 };

-/* OTG state machine according to the OTG spec */
+/**
+ * struct otg_fsm - OTG state machine according to the OTG spec
+ *
+ * OTG hardware Inputs
+ *
+ *	Common inputs for A and B device
+ * @id:		TRUE for B-device, FALSE for A-device.
+ * @adp_change: TRUE when current ADP measurement (n) value, compared to the + * ADP measurement taken at n-2, differs by more than CADP_THR + * @power_up: TRUE when the OTG device first powers up its USB system and + * ADP measurement taken if ADP capable + * + * A-Device state inputs + * @a_srp_det: TRUE if the A-device detects SRP + * @a_vbus_vld: TRUE when VBUS voltage is in regulation + * @b_conn: TRUE if the A-device detects connection from the B-device + * @a_bus_resume: TRUE when the B-device detects that the A-device is signaling + * a resume (K state) + * B-Device state inputs + * @a_bus_suspend: TRUE when the B-device detects that the A-device has put the + * bus into suspend + * @a_conn: TRUE if the B-device detects a connection from the A-device + * @b_se0_srp: TRUE when the line has been at SE0 for more than the minimum + * time before generating SRP + * @b_ssend_srp: TRUE when the VBUS has been below VOTG_SESS_VLD for more than + * the minimum time before generating SRP + * @b_sess_vld: TRUE when the B-device detects that the voltage on VBUS is + * above VOTG_SESS_VLD + * @test_device: TRUE when the B-device switches to B-Host and detects an OTG + * test device. This must be set by host/hub driver + * + * Application inputs (A-Device) + * @a_bus_drop: TRUE when A-device application needs to power down the bus + * @a_bus_req: TRUE when A-device application wants to use the bus. + * FALSE to suspend the bus + * + * Application inputs (B-Device) + * @b_bus_req: TRUE during the time that the Application running on the + * B-device wants to use the bus + * + * Auxilary inputs (OTG v1.3 only. Obsolete now.) + * @a_sess_vld: TRUE if the A-device detects that VBUS is above VA_SESS_VLD + * @b_bus_suspend: TRUE when the A-device detects that the B-device has put + * the bus into suspend + * @b_bus_resume: TRUE when the A-device detects that the B-device is signaling + * resume on the bus + * + * OTG Output status. Read only for users. Updated by OTG FSM helpers defined + * in this file + * + * Outputs for Both A and B device + * @drv_vbus: TRUE when A-device is driving VBUS + * @loc_conn: TRUE when the local device has signaled that it is connected + * to the bus + * @loc_sof: TRUE when the local device is generating activity on the bus + * @adp_prb: TRUE when the local device is in the process of doing + * ADP probing + * + * Outputs for B-device state + * @adp_sns: TRUE when the B-device is in the process of carrying out + * ADP sensing + * @data_pulse: TRUE when the B-device is performing data line pulsing + * + * Internal Variables + * + * a_set_b_hnp_en: TRUE when the A-device has successfully set the + * b_hnp_enable bit in the B-device. + * Unused as OTG fsm uses otg->host->b_hnp_enable instead + * b_srp_done: TRUE when the B-device has completed initiating SRP + * b_hnp_enable: TRUE when the B-device has accepted the + * SetFeature(b_hnp_enable) B-device. + * Unused as OTG fsm uses otg->gadget->b_hnp_enable instead + * a_clr_err: Asserted (by application ?) 
to clear a_vbus_err due to an + * overcurrent condition and causes the A-device to transition + * to a_wait_vfall + */ struct otg_fsm { /* Input */ int id; int adp_change; int power_up; - int test_device; - int a_bus_drop; - int a_bus_req; int a_srp_det; int a_vbus_vld; int b_conn; int a_bus_resume; int a_bus_suspend; int a_conn; - int b_bus_req; int b_se0_srp; int b_ssend_srp; int b_sess_vld; + int test_device; + int a_bus_drop; + int a_bus_req; + int b_bus_req; + /* Auxilary inputs */ int a_sess_vld; int b_bus_resume; int b_bus_suspend; /* Output */ - int data_pulse; int drv_vbus; int loc_conn; int loc_sof; int adp_prb; int adp_sns; + int data_pulse; /* Internal variables */ int a_set_b_hnp_en; @@ -110,7 +186,7 @@ struct otg_fsm { int b_hnp_enable; int a_clr_err; - /* Informative variables */ + /* Informative variables. All unused as of now */ int a_bus_drop_inf; int a_bus_req_inf; int a_clr_err_inf; -- cgit v1.2.3 From 4e332df63487418ec512c3c376c07df9ab3ae035 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Wed, 30 Mar 2016 12:56:29 +0300 Subject: usb: otg-fsm: support multiple instances Move the state_changed variable into struct otg_fsm so that we can support multiple instances. Signed-off-by: Roger Quadros Acked-by: Peter Chen Signed-off-by: Peter Chen --- drivers/usb/common/usb-otg-fsm.c | 10 ++++------ include/linux/usb/otg-fsm.h | 1 + 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/common/usb-otg-fsm.c b/drivers/usb/common/usb-otg-fsm.c index 504708f59b93..9059b7dc185e 100644 --- a/drivers/usb/common/usb-otg-fsm.c +++ b/drivers/usb/common/usb-otg-fsm.c @@ -61,8 +61,6 @@ static int otg_set_protocol(struct otg_fsm *fsm, int protocol) return 0; } -static int state_changed; - /* Called when leaving a state. 
Do state clean up jobs here */ static void otg_leave_state(struct otg_fsm *fsm, enum usb_otg_state old_state) { @@ -208,7 +206,6 @@ static void otg_start_hnp_polling(struct otg_fsm *fsm) /* Called when entering a state */ static int otg_set_state(struct otg_fsm *fsm, enum usb_otg_state new_state) { - state_changed = 1; if (fsm->otg->state == new_state) return 0; VDBG("Set state: %s\n", usb_otg_state_string(new_state)); @@ -324,6 +321,7 @@ static int otg_set_state(struct otg_fsm *fsm, enum usb_otg_state new_state) } fsm->otg->state = new_state; + fsm->state_changed = 1; return 0; } @@ -335,7 +333,7 @@ int otg_statemachine(struct otg_fsm *fsm) mutex_lock(&fsm->lock); state = fsm->otg->state; - state_changed = 0; + fsm->state_changed = 0; /* State machine state change judgement */ switch (state) { @@ -448,7 +446,7 @@ int otg_statemachine(struct otg_fsm *fsm) } mutex_unlock(&fsm->lock); - VDBG("quit statemachine, changed = %d\n", state_changed); - return state_changed; + VDBG("quit statemachine, changed = %d\n", fsm->state_changed); + return fsm->state_changed; } EXPORT_SYMBOL_GPL(otg_statemachine); diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h index 8eec0c261be5..7a0350535cb1 100644 --- a/include/linux/usb/otg-fsm.h +++ b/include/linux/usb/otg-fsm.h @@ -210,6 +210,7 @@ struct otg_fsm { struct mutex lock; u8 *host_req_flag; struct delayed_work hnp_polling_work; + bool state_changed; }; struct otg_fsm_ops { -- cgit v1.2.3 From e68503bd6836ba765dc8e0ee77ea675fedc07e41 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Apr 2016 16:14:24 +0100 Subject: KEYS: Generalise system_verify_data() to provide access to internal content Generalise system_verify_data() to provide access to internal content through a callback. This allows all the PKCS#7 stuff to be hidden inside this function and removed from the PE file parser and the PKCS#7 test key. If external content is not required, NULL should be passed as data to the function. If the callback is not required, that can be set to NULL. The function is now called verify_pkcs7_signature() to contrast with verify_pefile_signature() and the definitions of both have been moved into linux/verification.h along with the key_being_used_for enum. 
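
As a hedged sketch of a detached-data caller of the new interface (the
wrapper name is hypothetical; a NULL keyring selects the system trusted
keyring, and a NULL callback skips content extraction):

#include <linux/errno.h>
#include <linux/verification.h>

static int my_verify_blob(const void *blob, size_t blob_len,
			  const void *pkcs7, size_t pkcs7_len)
{
	return verify_pkcs7_signature(blob, blob_len, pkcs7, pkcs7_len,
				      NULL,	/* system_trusted_keyring */
				      -ENOKEY,	/* error if chain untrusted */
				      VERIFYING_UNSPECIFIED_SIGNATURE,
				      NULL, NULL); /* no content callback */
}
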
Signed-off-by: David Howells --- arch/x86/kernel/kexec-bzimage64.c | 18 ++------- certs/system_keyring.c | 45 ++++++++++++++++----- crypto/asymmetric_keys/Kconfig | 4 +- crypto/asymmetric_keys/mscode_parser.c | 21 ++++------ crypto/asymmetric_keys/pkcs7_key_type.c | 72 +++++++++++++-------------------- crypto/asymmetric_keys/pkcs7_parser.c | 21 +++++----- crypto/asymmetric_keys/verify_pefile.c | 40 +++++------------- crypto/asymmetric_keys/verify_pefile.h | 5 +-- include/crypto/pkcs7.h | 3 +- include/crypto/public_key.h | 14 ------- include/keys/asymmetric-type.h | 1 + include/keys/system_keyring.h | 7 +--- include/linux/verification.h | 50 +++++++++++++++++++++++ include/linux/verify_pefile.h | 22 ---------- kernel/module_signing.c | 5 ++- 15 files changed, 155 insertions(+), 173 deletions(-) create mode 100644 include/linux/verification.h delete mode 100644 include/linux/verify_pefile.h (limited to 'include/linux') diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 2af478e3fd4e..f2356bda2b05 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -19,8 +19,7 @@ #include #include #include -#include -#include +#include #include #include @@ -529,18 +528,9 @@ static int bzImage64_cleanup(void *loader_data) #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) { - bool trusted; - int ret; - - ret = verify_pefile_signature(kernel, kernel_len, - system_trusted_keyring, - VERIFYING_KEXEC_PE_SIGNATURE, - &trusted); - if (ret < 0) - return ret; - if (!trusted) - return -EKEYREJECTED; - return 0; + return verify_pefile_signature(kernel, kernel_len, + NULL, + VERIFYING_KEXEC_PE_SIGNATURE); } #endif diff --git a/certs/system_keyring.c b/certs/system_keyring.c index f4180326c2e1..a83bffedc0aa 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -108,16 +108,25 @@ late_initcall(load_system_certificate_list); #ifdef CONFIG_SYSTEM_DATA_VERIFICATION /** - * Verify a PKCS#7-based signature on system data. - * @data: The data to be verified. + * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data. + * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. * @raw_pkcs7: The PKCS#7 message that is the signature. * @pkcs7_len: The size of @raw_pkcs7. + * @trusted_keys: Trusted keys to use (NULL for system_trusted_keyring). * @usage: The use to which the key is being put. + * @view_content: Callback to gain access to content. + * @ctx: Context for callback. */ -int system_verify_data(const void *data, unsigned long len, - const void *raw_pkcs7, size_t pkcs7_len, - enum key_being_used_for usage) +int verify_pkcs7_signature(const void *data, size_t len, + const void *raw_pkcs7, size_t pkcs7_len, + struct key *trusted_keys, + int untrusted_error, + enum key_being_used_for usage, + int (*view_content)(void *ctx, + const void *data, size_t len, + size_t asn1hdrlen), + void *ctx) { struct pkcs7_message *pkcs7; bool trusted; @@ -128,7 +137,7 @@ int system_verify_data(const void *data, unsigned long len, return PTR_ERR(pkcs7); /* The data should be detached - so we need to supply it. 
*/ - if (pkcs7_supply_detached_data(pkcs7, data, len) < 0) { + if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) { pr_err("PKCS#7 signature with non-detached data\n"); ret = -EBADMSG; goto error; @@ -138,13 +147,29 @@ int system_verify_data(const void *data, unsigned long len, if (ret < 0) goto error; - ret = pkcs7_validate_trust(pkcs7, system_trusted_keyring, &trusted); + if (!trusted_keys) + trusted_keys = system_trusted_keyring; + ret = pkcs7_validate_trust(pkcs7, trusted_keys, &trusted); if (ret < 0) goto error; - if (!trusted) { + if (!trusted && untrusted_error) { pr_err("PKCS#7 signature not signed with a trusted key\n"); - ret = -ENOKEY; + ret = untrusted_error; + goto error; + } + + if (view_content) { + size_t asn1hdrlen; + + ret = pkcs7_get_content_data(pkcs7, &data, &len, &asn1hdrlen); + if (ret < 0) { + if (ret == -ENODATA) + pr_devel("PKCS#7 message does not contain data\n"); + goto error; + } + + ret = view_content(ctx, data, len, asn1hdrlen); } error: @@ -152,6 +177,6 @@ error: pr_devel("<==%s() = %d\n", __func__, ret); return ret; } -EXPORT_SYMBOL_GPL(system_verify_data); +EXPORT_SYMBOL_GPL(verify_pkcs7_signature); #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ diff --git a/crypto/asymmetric_keys/Kconfig b/crypto/asymmetric_keys/Kconfig index 91a7e047a765..f7d2ef9789d8 100644 --- a/crypto/asymmetric_keys/Kconfig +++ b/crypto/asymmetric_keys/Kconfig @@ -40,8 +40,7 @@ config PKCS7_MESSAGE_PARSER config PKCS7_TEST_KEY tristate "PKCS#7 testing key type" - depends on PKCS7_MESSAGE_PARSER - select SYSTEM_TRUSTED_KEYRING + depends on SYSTEM_DATA_VERIFICATION help This option provides a type of key that can be loaded up from a PKCS#7 message - provided the message is signed by a trusted key. If @@ -54,6 +53,7 @@ config PKCS7_TEST_KEY config SIGNED_PE_FILE_VERIFICATION bool "Support for PE file signature verification" depends on PKCS7_MESSAGE_PARSER=y + depends on SYSTEM_DATA_VERIFICATION select ASN1 select OID_REGISTRY help diff --git a/crypto/asymmetric_keys/mscode_parser.c b/crypto/asymmetric_keys/mscode_parser.c index 3242cbfaeaa2..6a76d5c70ef6 100644 --- a/crypto/asymmetric_keys/mscode_parser.c +++ b/crypto/asymmetric_keys/mscode_parser.c @@ -21,19 +21,13 @@ /* * Parse a Microsoft Individual Code Signing blob */ -int mscode_parse(struct pefile_context *ctx) +int mscode_parse(void *_ctx, const void *content_data, size_t data_len, + size_t asn1hdrlen) { - const void *content_data; - size_t data_len; - int ret; - - ret = pkcs7_get_content_data(ctx->pkcs7, &content_data, &data_len, 1); - - if (ret) { - pr_debug("PKCS#7 message does not contain data\n"); - return ret; - } + struct pefile_context *ctx = _ctx; + content_data -= asn1hdrlen; + data_len += asn1hdrlen; pr_devel("Data: %zu [%*ph]\n", data_len, (unsigned)(data_len), content_data); @@ -129,7 +123,6 @@ int mscode_note_digest(void *context, size_t hdrlen, { struct pefile_context *ctx = context; - ctx->digest = value; - ctx->digest_len = vlen; - return 0; + ctx->digest = kmemdup(value, vlen, GFP_KERNEL); + return ctx->digest ? 
0 : -ENOMEM; } diff --git a/crypto/asymmetric_keys/pkcs7_key_type.c b/crypto/asymmetric_keys/pkcs7_key_type.c index e2d0edbbc71a..ab9bf5363ecd 100644 --- a/crypto/asymmetric_keys/pkcs7_key_type.c +++ b/crypto/asymmetric_keys/pkcs7_key_type.c @@ -13,12 +13,9 @@ #include #include #include +#include #include -#include -#include #include -#include -#include "pkcs7_parser.h" MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("PKCS#7 testing key type"); @@ -29,59 +26,46 @@ MODULE_PARM_DESC(pkcs7_usage, "Usage to specify when verifying the PKCS#7 message"); /* - * Preparse a PKCS#7 wrapped and validated data blob. + * Retrieve the PKCS#7 message content. */ -static int pkcs7_preparse(struct key_preparsed_payload *prep) +static int pkcs7_view_content(void *ctx, const void *data, size_t len, + size_t asn1hdrlen) { - enum key_being_used_for usage = pkcs7_usage; - struct pkcs7_message *pkcs7; - const void *data, *saved_prep_data; - size_t datalen, saved_prep_datalen; - bool trusted; + struct key_preparsed_payload *prep = ctx; + const void *saved_prep_data; + size_t saved_prep_datalen; int ret; - kenter(""); - - if (usage >= NR__KEY_BEING_USED_FOR) { - pr_err("Invalid usage type %d\n", usage); - return -EINVAL; - } - saved_prep_data = prep->data; saved_prep_datalen = prep->datalen; - pkcs7 = pkcs7_parse_message(saved_prep_data, saved_prep_datalen); - if (IS_ERR(pkcs7)) { - ret = PTR_ERR(pkcs7); - goto error; - } - - ret = pkcs7_verify(pkcs7, usage); - if (ret < 0) - goto error_free; - - ret = pkcs7_validate_trust(pkcs7, system_trusted_keyring, &trusted); - if (ret < 0) - goto error_free; - if (!trusted) - pr_warn("PKCS#7 message doesn't chain back to a trusted key\n"); - - ret = pkcs7_get_content_data(pkcs7, &data, &datalen, false); - if (ret < 0) - goto error_free; - prep->data = data; - prep->datalen = datalen; + prep->datalen = len; + ret = user_preparse(prep); + prep->data = saved_prep_data; prep->datalen = saved_prep_datalen; - -error_free: - pkcs7_free_message(pkcs7); -error: - kleave(" = %d", ret); return ret; } +/* + * Preparse a PKCS#7 wrapped and validated data blob. + */ +static int pkcs7_preparse(struct key_preparsed_payload *prep) +{ + enum key_being_used_for usage = pkcs7_usage; + + if (usage >= NR__KEY_BEING_USED_FOR) { + pr_err("Invalid usage type %d\n", usage); + return -EINVAL; + } + + return verify_pkcs7_signature(NULL, 0, + prep->data, prep->datalen, + NULL, -ENOKEY, usage, + pkcs7_view_content, prep); +} + /* * user defined keys take an arbitrary string as the description and an * arbitrary blob of data as the payload diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c index 835701613125..af4cd8649117 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.c +++ b/crypto/asymmetric_keys/pkcs7_parser.c @@ -168,24 +168,25 @@ EXPORT_SYMBOL_GPL(pkcs7_parse_message); * @pkcs7: The preparsed PKCS#7 message to access * @_data: Place to return a pointer to the data * @_data_len: Place to return the data length - * @want_wrapper: True if the ASN.1 object header should be included in the data + * @_headerlen: Size of ASN.1 header not included in _data * - * Get access to the data content of the PKCS#7 message, including, optionally, - * the header of the ASN.1 object that contains it. Returns -ENODATA if the - * data object was missing from the message. + * Get access to the data content of the PKCS#7 message. The size of the + * header of the ASN.1 object that contains it is also provided and can be used + * to adjust *_data and *_data_len to get the entire object. 
+ * + * Returns -ENODATA if the data object was missing from the message. */ int pkcs7_get_content_data(const struct pkcs7_message *pkcs7, const void **_data, size_t *_data_len, - bool want_wrapper) + size_t *_headerlen) { - size_t wrapper; - if (!pkcs7->data) return -ENODATA; - wrapper = want_wrapper ? pkcs7->data_hdrlen : 0; - *_data = pkcs7->data - wrapper; - *_data_len = pkcs7->data_len + wrapper; + *_data = pkcs7->data; + *_data_len = pkcs7->data_len; + if (_headerlen) + *_headerlen = pkcs7->data_hdrlen; return 0; } EXPORT_SYMBOL_GPL(pkcs7_get_content_data); diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c index 7e8c2338ae25..265351075b0e 100644 --- a/crypto/asymmetric_keys/verify_pefile.c +++ b/crypto/asymmetric_keys/verify_pefile.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include "verify_pefile.h" @@ -392,9 +392,8 @@ error_no_desc: * verify_pefile_signature - Verify the signature on a PE binary image * @pebuf: Buffer containing the PE binary image * @pelen: Length of the binary image - * @trust_keyring: Signing certificates to use as starting points + * @trust_keys: Signing certificate(s) to use as starting points * @usage: The use to which the key is being put. - * @_trusted: Set to true if trustworth, false otherwise * * Validate that the certificate chain inside the PKCS#7 message inside the PE * binary image intersects keys we already know and trust. @@ -418,14 +417,10 @@ error_no_desc: * May also return -ENOMEM. */ int verify_pefile_signature(const void *pebuf, unsigned pelen, - struct key *trusted_keyring, - enum key_being_used_for usage, - bool *_trusted) + struct key *trusted_keys, + enum key_being_used_for usage) { - struct pkcs7_message *pkcs7; struct pefile_context ctx; - const void *data; - size_t datalen; int ret; kenter(""); @@ -439,19 +434,10 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen, if (ret < 0) return ret; - pkcs7 = pkcs7_parse_message(pebuf + ctx.sig_offset, ctx.sig_len); - if (IS_ERR(pkcs7)) - return PTR_ERR(pkcs7); - ctx.pkcs7 = pkcs7; - - ret = pkcs7_get_content_data(ctx.pkcs7, &data, &datalen, false); - if (ret < 0 || datalen == 0) { - pr_devel("PKCS#7 message does not contain data\n"); - ret = -EBADMSG; - goto error; - } - - ret = mscode_parse(&ctx); + ret = verify_pkcs7_signature(NULL, 0, + pebuf + ctx.sig_offset, ctx.sig_len, + trusted_keys, -EKEYREJECTED, usage, + mscode_parse, &ctx); if (ret < 0) goto error; @@ -462,16 +448,8 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen, * contents. */ ret = pefile_digest_pe(pebuf, pelen, &ctx); - if (ret < 0) - goto error; - - ret = pkcs7_verify(pkcs7, usage); - if (ret < 0) - goto error; - - ret = pkcs7_validate_trust(pkcs7, trusted_keyring, _trusted); error: - pkcs7_free_message(ctx.pkcs7); + kfree(ctx.digest); return ret; } diff --git a/crypto/asymmetric_keys/verify_pefile.h b/crypto/asymmetric_keys/verify_pefile.h index a133eb81a492..cd4d20930728 100644 --- a/crypto/asymmetric_keys/verify_pefile.h +++ b/crypto/asymmetric_keys/verify_pefile.h @@ -9,7 +9,6 @@ * 2 of the Licence, or (at your option) any later version. 
*/ -#include #include #include @@ -23,7 +22,6 @@ struct pefile_context { unsigned sig_offset; unsigned sig_len; const struct section_header *secs; - struct pkcs7_message *pkcs7; /* PKCS#7 MS Individual Code Signing content */ const void *digest; /* Digest */ @@ -39,4 +37,5 @@ struct pefile_context { /* * mscode_parser.c */ -extern int mscode_parse(struct pefile_context *ctx); +extern int mscode_parse(void *_ctx, const void *content_data, size_t data_len, + size_t asn1hdrlen); diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h index 441aff9b5aa7..8323e3e57131 100644 --- a/include/crypto/pkcs7.h +++ b/include/crypto/pkcs7.h @@ -12,6 +12,7 @@ #ifndef _CRYPTO_PKCS7_H #define _CRYPTO_PKCS7_H +#include #include struct key; @@ -26,7 +27,7 @@ extern void pkcs7_free_message(struct pkcs7_message *pkcs7); extern int pkcs7_get_content_data(const struct pkcs7_message *pkcs7, const void **_data, size_t *_datalen, - bool want_wrapper); + size_t *_headerlen); /* * pkcs7_trust.c diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index 2f5de5c1a3a0..b3928e801b8c 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -14,20 +14,6 @@ #ifndef _LINUX_PUBLIC_KEY_H #define _LINUX_PUBLIC_KEY_H -/* - * The use to which an asymmetric key is being put. - */ -enum key_being_used_for { - VERIFYING_MODULE_SIGNATURE, - VERIFYING_FIRMWARE_SIGNATURE, - VERIFYING_KEXEC_PE_SIGNATURE, - VERIFYING_KEY_SIGNATURE, - VERIFYING_KEY_SELF_SIGNATURE, - VERIFYING_UNSPECIFIED_SIGNATURE, - NR__KEY_BEING_USED_FOR -}; -extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR]; - /* * Cryptographic data for the public-key subtype of the asymmetric key type. * diff --git a/include/keys/asymmetric-type.h b/include/keys/asymmetric-type.h index 70a8775bb444..d1e23dda4363 100644 --- a/include/keys/asymmetric-type.h +++ b/include/keys/asymmetric-type.h @@ -15,6 +15,7 @@ #define _KEYS_ASYMMETRIC_TYPE_H #include +#include extern struct key_type key_type_asymmetric; diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h index 39fd38cfa8c9..b2d645ac35a0 100644 --- a/include/keys/system_keyring.h +++ b/include/keys/system_keyring.h @@ -15,6 +15,7 @@ #ifdef CONFIG_SYSTEM_TRUSTED_KEYRING #include +#include #include extern struct key *system_trusted_keyring; @@ -29,12 +30,6 @@ static inline struct key *get_system_trusted_keyring(void) } #endif -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -extern int system_verify_data(const void *data, unsigned long len, - const void *raw_pkcs7, size_t pkcs7_len, - enum key_being_used_for usage); -#endif - #ifdef CONFIG_IMA_MOK_KEYRING extern struct key *ima_mok_keyring; extern struct key *ima_blacklist_keyring; diff --git a/include/linux/verification.h b/include/linux/verification.h new file mode 100644 index 000000000000..bb0fcf941cb7 --- /dev/null +++ b/include/linux/verification.h @@ -0,0 +1,50 @@ +/* Signature verification + * + * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _LINUX_VERIFICATION_H +#define _LINUX_VERIFICATION_H + +/* + * The use to which an asymmetric key is being put. 
+ */ +enum key_being_used_for { + VERIFYING_MODULE_SIGNATURE, + VERIFYING_FIRMWARE_SIGNATURE, + VERIFYING_KEXEC_PE_SIGNATURE, + VERIFYING_KEY_SIGNATURE, + VERIFYING_KEY_SELF_SIGNATURE, + VERIFYING_UNSPECIFIED_SIGNATURE, + NR__KEY_BEING_USED_FOR +}; +extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR]; + +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION + +struct key; + +extern int verify_pkcs7_signature(const void *data, size_t len, + const void *raw_pkcs7, size_t pkcs7_len, + struct key *trusted_keys, + int untrusted_error, + enum key_being_used_for usage, + int (*view_content)(void *ctx, + const void *data, size_t len, + size_t asn1hdrlen), + void *ctx); + +#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION +extern int verify_pefile_signature(const void *pebuf, unsigned pelen, + struct key *trusted_keys, + enum key_being_used_for usage); +#endif + +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ +#endif /* _LINUX_VERIFY_PEFILE_H */ diff --git a/include/linux/verify_pefile.h b/include/linux/verify_pefile.h deleted file mode 100644 index da2049b5161c..000000000000 --- a/include/linux/verify_pefile.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Signed PE file verification - * - * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#ifndef _LINUX_VERIFY_PEFILE_H -#define _LINUX_VERIFY_PEFILE_H - -#include - -extern int verify_pefile_signature(const void *pebuf, unsigned pelen, - struct key *trusted_keyring, - enum key_being_used_for usage, - bool *_trusted); - -#endif /* _LINUX_VERIFY_PEFILE_H */ diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 64b9dead4a07..593aace88a02 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -80,6 +80,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) return -EBADMSG; } - return system_verify_data(mod, modlen, mod + modlen, sig_len, - VERIFYING_MODULE_SIGNATURE); + return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE, + NULL, NULL); } -- cgit v1.2.3 From bda850cd214e90b1be0cc25bc48c4f6ac53eb543 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Apr 2016 16:14:24 +0100 Subject: PKCS#7: Make trust determination dependent on contents of trust keyring Make the determination of the trustworthiness of a key dependent on whether a key that can verify it is present in the supplied ring of trusted keys rather than whether or not the verifying key has KEY_FLAG_TRUSTED set. verify_pkcs7_signature() will return -ENOKEY if the PKCS#7 message trust chain cannot be verified. 
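
In other words, callers now branch on the error code rather than on a
separate trusted flag; a minimal hedged sketch (the wrapper and the
message string are illustrative):

#include <linux/errno.h>
#include <linux/printk.h>
#include <crypto/pkcs7.h>

static int my_check_trust(struct pkcs7_message *pkcs7, struct key *keyring)
{
	int ret = pkcs7_validate_trust(pkcs7, keyring);

	if (ret == -ENOKEY)	/* chain does not intersect the keyring */
		pr_err("PKCS#7: message not signed with a trusted key\n");
	return ret;
}
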
Signed-off-by: David Howells --- certs/system_keyring.c | 13 ++++--------- crypto/asymmetric_keys/pkcs7_key_type.c | 2 +- crypto/asymmetric_keys/pkcs7_parser.h | 1 - crypto/asymmetric_keys/pkcs7_trust.c | 18 +++--------------- crypto/asymmetric_keys/verify_pefile.c | 2 +- crypto/asymmetric_keys/x509_parser.h | 1 - include/crypto/pkcs7.h | 3 +-- include/linux/verification.h | 1 - kernel/module_signing.c | 2 +- 9 files changed, 11 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/certs/system_keyring.c b/certs/system_keyring.c index a83bffedc0aa..dc18869ff680 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -121,7 +121,6 @@ late_initcall(load_system_certificate_list); int verify_pkcs7_signature(const void *data, size_t len, const void *raw_pkcs7, size_t pkcs7_len, struct key *trusted_keys, - int untrusted_error, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, @@ -129,7 +128,6 @@ int verify_pkcs7_signature(const void *data, size_t len, void *ctx) { struct pkcs7_message *pkcs7; - bool trusted; int ret; pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len); @@ -149,13 +147,10 @@ int verify_pkcs7_signature(const void *data, size_t len, if (!trusted_keys) trusted_keys = system_trusted_keyring; - ret = pkcs7_validate_trust(pkcs7, trusted_keys, &trusted); - if (ret < 0) - goto error; - - if (!trusted && untrusted_error) { - pr_err("PKCS#7 signature not signed with a trusted key\n"); - ret = untrusted_error; + ret = pkcs7_validate_trust(pkcs7, trusted_keys); + if (ret < 0) { + if (ret == -ENOKEY) + pr_err("PKCS#7 signature not signed with a trusted key\n"); goto error; } diff --git a/crypto/asymmetric_keys/pkcs7_key_type.c b/crypto/asymmetric_keys/pkcs7_key_type.c index ab9bf5363ecd..3b92523882e5 100644 --- a/crypto/asymmetric_keys/pkcs7_key_type.c +++ b/crypto/asymmetric_keys/pkcs7_key_type.c @@ -62,7 +62,7 @@ static int pkcs7_preparse(struct key_preparsed_payload *prep) return verify_pkcs7_signature(NULL, 0, prep->data, prep->datalen, - NULL, -ENOKEY, usage, + NULL, usage, pkcs7_view_content, prep); } diff --git a/crypto/asymmetric_keys/pkcs7_parser.h b/crypto/asymmetric_keys/pkcs7_parser.h index d5eec31e95b6..f4e81074f5e0 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.h +++ b/crypto/asymmetric_keys/pkcs7_parser.h @@ -22,7 +22,6 @@ struct pkcs7_signed_info { struct pkcs7_signed_info *next; struct x509_certificate *signer; /* Signing certificate (in msg->certs) */ unsigned index; - bool trusted; bool unsupported_crypto; /* T if not usable due to missing crypto */ /* Message digest - the digest of the Content Data (or NULL) */ diff --git a/crypto/asymmetric_keys/pkcs7_trust.c b/crypto/asymmetric_keys/pkcs7_trust.c index b9a5487cd82d..36e77cb07bd0 100644 --- a/crypto/asymmetric_keys/pkcs7_trust.c +++ b/crypto/asymmetric_keys/pkcs7_trust.c @@ -30,7 +30,6 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7, struct public_key_signature *sig = sinfo->sig; struct x509_certificate *x509, *last = NULL, *p; struct key *key; - bool trusted; int ret; kenter(",%u,", sinfo->index); @@ -42,10 +41,8 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7, for (x509 = sinfo->signer; x509; x509 = x509->signer) { if (x509->seen) { - if (x509->verified) { - trusted = x509->trusted; + if (x509->verified) goto verified; - } kleave(" = -ENOKEY [cached]"); return -ENOKEY; } @@ -122,7 +119,6 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7, matched: ret = verify_signature(key, sig); - trusted = 
test_bit(KEY_FLAG_TRUSTED, &key->flags); key_put(key); if (ret < 0) { if (ret == -ENOMEM) @@ -134,12 +130,9 @@ matched: verified: if (x509) { x509->verified = true; - for (p = sinfo->signer; p != x509; p = p->signer) { + for (p = sinfo->signer; p != x509; p = p->signer) p->verified = true; - p->trusted = trusted; - } } - sinfo->trusted = trusted; kleave(" = 0"); return 0; } @@ -148,7 +141,6 @@ verified: * pkcs7_validate_trust - Validate PKCS#7 trust chain * @pkcs7: The PKCS#7 certificate to validate * @trust_keyring: Signing certificates to use as starting points - * @_trusted: Set to true if trustworth, false otherwise * * Validate that the certificate chain inside the PKCS#7 message intersects * keys we already know and trust. @@ -170,16 +162,13 @@ verified: * May also return -ENOMEM. */ int pkcs7_validate_trust(struct pkcs7_message *pkcs7, - struct key *trust_keyring, - bool *_trusted) + struct key *trust_keyring) { struct pkcs7_signed_info *sinfo; struct x509_certificate *p; int cached_ret = -ENOKEY; int ret; - *_trusted = false; - for (p = pkcs7->certs; p; p = p->next) p->seen = false; @@ -193,7 +182,6 @@ int pkcs7_validate_trust(struct pkcs7_message *pkcs7, cached_ret = -ENOPKG; continue; case 0: - *_trusted |= sinfo->trusted; cached_ret = 0; continue; default: diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c index 265351075b0e..672a94c2c3ff 100644 --- a/crypto/asymmetric_keys/verify_pefile.c +++ b/crypto/asymmetric_keys/verify_pefile.c @@ -436,7 +436,7 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen, ret = verify_pkcs7_signature(NULL, 0, pebuf + ctx.sig_offset, ctx.sig_len, - trusted_keys, -EKEYREJECTED, usage, + trusted_keys, usage, mscode_parse, &ctx); if (ret < 0) goto error; diff --git a/crypto/asymmetric_keys/x509_parser.h b/crypto/asymmetric_keys/x509_parser.h index f24f4d808e7f..05eef1c68881 100644 --- a/crypto/asymmetric_keys/x509_parser.h +++ b/crypto/asymmetric_keys/x509_parser.h @@ -39,7 +39,6 @@ struct x509_certificate { unsigned index; bool seen; /* Infinite recursion prevention */ bool verified; - bool trusted; bool self_signed; /* T if self-signed (check unsupported_sig too) */ bool unsupported_key; /* T if key uses unsupported crypto */ bool unsupported_sig; /* T if signature uses unsupported crypto */ diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h index 8323e3e57131..583f199400a3 100644 --- a/include/crypto/pkcs7.h +++ b/include/crypto/pkcs7.h @@ -33,8 +33,7 @@ extern int pkcs7_get_content_data(const struct pkcs7_message *pkcs7, * pkcs7_trust.c */ extern int pkcs7_validate_trust(struct pkcs7_message *pkcs7, - struct key *trust_keyring, - bool *_trusted); + struct key *trust_keyring); /* * pkcs7_verify.c diff --git a/include/linux/verification.h b/include/linux/verification.h index bb0fcf941cb7..a10549a6c7cd 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -33,7 +33,6 @@ struct key; extern int verify_pkcs7_signature(const void *data, size_t len, const void *raw_pkcs7, size_t pkcs7_len, struct key *trusted_keys, - int untrusted_error, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 593aace88a02..6a64e03b9f44 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -81,6 +81,6 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) } return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, - NULL, -ENOKEY, 
VERIFYING_MODULE_SIGNATURE, + NULL, VERIFYING_MODULE_SIGNATURE, NULL, NULL); } -- cgit v1.2.3 From 31e6850e0fdb3a586363cc4d2f9801cdf9374310 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 5 Apr 2016 12:39:30 +0100 Subject: iommu: Add MMIO mapping type On some platforms, MMIO regions might need slightly different treatment compared to mapping regular memory; add the notion of MMIO mappings to the IOMMU API's memory type flags, so that callers can let the IOMMU drivers know to do the right thing. Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a5c539fa5d2b..34b643227df1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -30,6 +30,7 @@ #define IOMMU_WRITE (1 << 1) #define IOMMU_CACHE (1 << 2) /* DMA cache coherency */ #define IOMMU_NOEXEC (1 << 3) +#define IOMMU_MMIO (1 << 4) /* e.g. things like MSI doorbells */ struct iommu_ops; struct iommu_group; -- cgit v1.2.3 From 1d111406c6d91f4d7f6cc69a43e59546e8010aae Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 20 Mar 2016 13:57:20 +0100 Subject: PCI: Add Intel Thunderbolt device IDs Intel Gen 1 and 2 chips use the same ID for NHI, bridges and switch. Gen 3 chips and onward use a distinct ID for the NHI. No functional change intended. Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Acked-by: Andreas Noever --- drivers/pci/quirks.c | 16 ++++++++++------ drivers/thunderbolt/nhi.c | 8 +++++--- drivers/thunderbolt/switch.c | 9 +++++---- include/linux/pci_ids.h | 18 ++++++++++++++++++ 4 files changed, 38 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 8e678027b900..b584ddf83555 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3232,7 +3232,8 @@ static void quirk_apple_poweroff_thunderbolt(struct pci_dev *dev) acpi_execute_simple_method(SXIO, NULL, 0); acpi_execute_simple_method(SXLV, NULL, 0); } -DECLARE_PCI_FIXUP_SUSPEND_LATE(PCI_VENDOR_ID_INTEL, 0x1547, +DECLARE_PCI_FIXUP_SUSPEND_LATE(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C, quirk_apple_poweroff_thunderbolt); /* @@ -3266,9 +3267,10 @@ static void quirk_apple_wait_for_thunderbolt(struct pci_dev *dev) if (!nhi) goto out; if (nhi->vendor != PCI_VENDOR_ID_INTEL - || (nhi->device != 0x1547 && nhi->device != 0x156c) - || nhi->subsystem_vendor != 0x2222 - || nhi->subsystem_device != 0x1111) + || (nhi->device != PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C && + nhi->device != PCI_DEVICE_ID_INTEL_FALCON_RIDGE_4C_NHI) + || nhi->subsystem_vendor != 0x2222 + || nhi->subsystem_device != 0x1111) goto out; dev_info(&dev->dev, "quirk: waiting for thunderbolt to reestablish PCI tunnels...\n"); device_pm_wait_for_dev(&dev->dev, &nhi->dev); @@ -3276,9 +3278,11 @@ out: pci_dev_put(nhi); pci_dev_put(sibling); } -DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_INTEL, 0x1547, +DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C, quirk_apple_wait_for_thunderbolt); -DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_INTEL, 0x156d, +DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_FALCON_RIDGE_4C_BRIDGE, quirk_apple_wait_for_thunderbolt); #endif diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 20a41f7de76f..36be23babb89 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -633,16 +633,18 @@ static const struct dev_pm_ops nhi_pm_ops 
= { static struct pci_device_id nhi_ids[] = { /* * We have to specify class, the TB bridges use the same device and - * vendor (sub)id. + * vendor (sub)id on gen 1 and gen 2 controllers. */ { .class = PCI_CLASS_SYSTEM_OTHER << 8, .class_mask = ~0, - .vendor = PCI_VENDOR_ID_INTEL, .device = 0x1547, + .vendor = PCI_VENDOR_ID_INTEL, + .device = PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C, .subvendor = 0x2222, .subdevice = 0x1111, }, { .class = PCI_CLASS_SYSTEM_OTHER << 8, .class_mask = ~0, - .vendor = PCI_VENDOR_ID_INTEL, .device = 0x156c, + .vendor = PCI_VENDOR_ID_INTEL, + .device = PCI_DEVICE_ID_INTEL_FALCON_RIDGE_4C_NHI, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, }, { 0,} diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index aeb982969629..db73ffed68a9 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -293,9 +293,9 @@ static int tb_plug_events_active(struct tb_switch *sw, bool active) if (active) { data = data & 0xFFFFFF83; switch (sw->config.device_id) { - case 0x1513: - case 0x151a: - case 0x1549: + case PCI_DEVICE_ID_INTEL_LIGHT_RIDGE: + case PCI_DEVICE_ID_INTEL_EAGLE_RIDGE: + case PCI_DEVICE_ID_INTEL_PORT_RIDGE: break; default: data |= 4; @@ -370,7 +370,8 @@ struct tb_switch *tb_switch_alloc(struct tb *tb, u64 route) tb_sw_warn(sw, "unknown switch vendor id %#x\n", sw->config.vendor_id); - if (sw->config.device_id != 0x1547 && sw->config.device_id != 0x1549) + if (sw->config.device_id != PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C && + sw->config.device_id != PCI_DEVICE_ID_INTEL_PORT_RIDGE) tb_sw_warn(sw, "unsupported switch device id %#x\n", sw->config.device_id); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 247da8c95860..c58752fe16c4 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2604,6 +2604,24 @@ #define PCI_DEVICE_ID_INTEL_82441 0x1237 #define PCI_DEVICE_ID_INTEL_82380FB 0x124b #define PCI_DEVICE_ID_INTEL_82439 0x1250 +#define PCI_DEVICE_ID_INTEL_LIGHT_RIDGE 0x1513 /* Tbt 1 Gen 1 */ +#define PCI_DEVICE_ID_INTEL_EAGLE_RIDGE 0x151a +#define PCI_DEVICE_ID_INTEL_LIGHT_PEAK 0x151b +#define PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C 0x1547 /* Tbt 1 Gen 2 */ +#define PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_2C 0x1548 +#define PCI_DEVICE_ID_INTEL_PORT_RIDGE 0x1549 +#define PCI_DEVICE_ID_INTEL_REDWOOD_RIDGE_2C_NHI 0x1566 /* Tbt 1 Gen 3 */ +#define PCI_DEVICE_ID_INTEL_REDWOOD_RIDGE_2C_BRIDGE 0x1567 +#define PCI_DEVICE_ID_INTEL_REDWOOD_RIDGE_4C_NHI 0x1568 +#define PCI_DEVICE_ID_INTEL_REDWOOD_RIDGE_4C_BRIDGE 0x1569 +#define PCI_DEVICE_ID_INTEL_FALCON_RIDGE_2C_NHI 0x156a /* Thunderbolt 2 */ +#define PCI_DEVICE_ID_INTEL_FALCON_RIDGE_2C_BRIDGE 0x156b +#define PCI_DEVICE_ID_INTEL_FALCON_RIDGE_4C_NHI 0x156c +#define PCI_DEVICE_ID_INTEL_FALCON_RIDGE_4C_BRIDGE 0x156d +#define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_2C_NHI 0x1575 /* Thunderbolt 3 */ +#define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_2C_BRIDGE 0x1576 +#define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_4C_NHI 0x1577 +#define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_4C_BRIDGE 0x1578 #define PCI_DEVICE_ID_INTEL_80960_RP 0x1960 #define PCI_DEVICE_ID_INTEL_82840_HB 0x1a21 #define PCI_DEVICE_ID_INTEL_82845_HB 0x1a30 -- cgit v1.2.3 From 32b9b10961860860268961d9aad0c56a73018c37 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 11 Feb 2016 13:19:09 -0800 Subject: clk: Allow clocks to be marked as CRITICAL Critical clocks are those which must not be gated, else undefined or catastrophic failure would occur. 
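
(A hedged sketch of how a provider might mark such a clock; everything
here except the CLK_IS_CRITICAL flag itself is hypothetical:)

#include <linux/clk-provider.h>

static const struct clk_ops my_gate_ops;	/* hypothetical ops */

static const struct clk_init_data my_bus_clk_init = {
	.name	= "my_bus_clk",
	.ops	= &my_gate_ops,
	.flags	= CLK_IS_CRITICAL,	/* never gate this clock */
};
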
Here we have chosen to ensure the prepare/enable counts are correctly incremented, so as not to confuse users with enabled clocks with no visible users. Signed-off-by: Lee Jones Signed-off-by: Michael Turquette Link: lkml.kernel.org/r/1455225554-13267-2-git-send-email-mturquette@baylibre.com --- drivers/clk/clk.c | 5 +++++ include/linux/clk-provider.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index fb74dc1f7520..275201fd7b01 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -2397,6 +2397,11 @@ static int __clk_core_init(struct clk_core *core) if (core->ops->init) core->ops->init(core->hw); + if (core->flags & CLK_IS_CRITICAL) { + clk_core_prepare(core); + clk_core_enable(core); + } + kref_init(&core->ref); out: clk_prepare_unlock(); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index da95258127aa..0638b4154502 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -32,6 +32,7 @@ #define CLK_GET_ACCURACY_NOCACHE BIT(8) /* do not use the cached clk accuracy */ #define CLK_RECALC_NEW_RATES BIT(9) /* recalc rates after notifications */ #define CLK_SET_RATE_UNGATE BIT(10) /* clock needs to run to set rate */ +#define CLK_IS_CRITICAL BIT(11) /* do not gate, ever */ struct clk; struct clk_hw; -- cgit v1.2.3 From d56f8994b6fb928f59481fabc25bcd1c2f9bd06d Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 11 Feb 2016 13:19:11 -0800 Subject: clk: Provide OF helper to mark clocks as CRITICAL This call matches clocks which have been marked as critical in DT and sets the appropriate flag. These flags can then be used to mark the clock core flags appropriately prior to registration. Legacy bindings requiring this feature must add the clock-critical property to their binding descriptions, as it is not a part of common-clock binding. Cc: devicetree@vger.kernel.org Signed-off-by: Lee Jones Reviewed-by: Stephen Boyd Signed-off-by: Michael Turquette Link: lkml.kernel.org/r/1455225554-13267-4-git-send-email-mturquette@baylibre.com --- drivers/clk/clk.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/clk-provider.h | 8 +++++++- 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index dede0ce679e4..9f77cc67cdc3 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -3137,6 +3137,41 @@ static int parent_ready(struct device_node *np) } } +/** + * of_clk_detect_critical() - set CLK_IS_CRITICAL flag from Device Tree + * @np: Device node pointer associated with clock provider + * @index: clock index + * @flags: pointer to clk_core->flags + * + * Detects if the clock-critical property exists and, if so, sets the + * corresponding CLK_IS_CRITICAL flag. + * + * Do not use this function. It exists only for legacy Device Tree + * bindings, such as the one-clock-per-node style that are outdated. + * Those bindings typically put all clock data into .dts and the Linux + * driver has no clock data, thus making it impossible to set this flag + * correctly from the driver. Only those drivers may call + * of_clk_detect_critical from their setup functions. 
+ *
+ * Return: error code or zero on success
+ */
+int of_clk_detect_critical(struct device_node *np,
+			   int index, unsigned long *flags)
+{
+	struct property *prop;
+	const __be32 *cur;
+	uint32_t idx;
+
+	if (!np || !flags)
+		return -EINVAL;
+
+	of_property_for_each_u32(np, "clock-critical", prop, cur, idx)
+		if (index == idx)
+			*flags |= CLK_IS_CRITICAL;
+
+	return 0;
+}
+
 /**
 * of_clk_init() - Scan and init clock providers from the DT
 * @matches: array of compatible values and init functions for providers.
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 0638b4154502..156286445a25 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -721,7 +721,8 @@ unsigned int of_clk_get_parent_count(struct device_node *np);
 int of_clk_parent_fill(struct device_node *np, const char **parents,
		       unsigned int size);
 const char *of_clk_get_parent_name(struct device_node *np, int index);
-
+int of_clk_detect_critical(struct device_node *np, int index,
+			   unsigned long *flags);
 void of_clk_init(const struct of_device_id *matches);
 #else /* !CONFIG_OF */
@@ -758,6 +759,11 @@ static inline const char *of_clk_get_parent_name(struct device_node *np,
 {
	return NULL;
 }
+static inline int of_clk_detect_critical(struct device_node *np, int index,
+					 unsigned long *flags)
+{
+	return 0;
+}
 static inline void of_clk_init(const struct of_device_id *matches) {}
 #endif /* CONFIG_OF */
--
cgit v1.2.3

From 3c9d6296b7aee536a96ea2b53a15d23511738c1c Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Fri, 8 Apr 2016 12:20:30 +0200
Subject: security: drop the unused hook skb_owned_by

The skb_owned_by hook was added with commit ca10b9e9a8ca ("selinux:
add a skb_owned_by() hook") and later removed when said commit was
reverted.

Later on, when switching to lists of hooks, a field named
'skb_owned_by' was included in the security_hook_heads struct, but with
no users and no caller.

This commit removes that left-over field.

Fixes: b1d9e6b0646d ("LSM: Switch to lists of hooks")
Signed-off-by: Paolo Abeni
Acked-by: Casey Schaufler
Acked-by: Paul Moore
Signed-off-by: James Morris
---
 include/linux/lsm_hooks.h | 1 -
 security/security.c       | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index cdee11cbcdf1..ae2537886177 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1804,7 +1804,6 @@ struct security_hook_heads {
	struct list_head tun_dev_attach_queue;
	struct list_head tun_dev_attach;
	struct list_head tun_dev_open;
-	struct list_head skb_owned_by;
 #endif	/* CONFIG_SECURITY_NETWORK */
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
	struct list_head xfrm_policy_alloc_security;
diff --git a/security/security.c b/security/security.c
index 3644b0344d29..554c3fb7d4a5 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1848,7 +1848,6 @@ struct security_hook_heads security_hook_heads = {
	.tun_dev_attach =
		LIST_HEAD_INIT(security_hook_heads.tun_dev_attach),
	.tun_dev_open = LIST_HEAD_INIT(security_hook_heads.tun_dev_open),
-	.skb_owned_by = LIST_HEAD_INIT(security_hook_heads.skb_owned_by),
 #endif	/* CONFIG_SECURITY_NETWORK */
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
	.xfrm_policy_alloc_security =
--
cgit v1.2.3

From adae28c59a6a71a971ffed713550911546df0e20 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Wed, 23 Mar 2016 11:36:53 +0100
Subject: mfd: syscon: Include errno.h from header

The syscon header uses the ENOTSUPP error constant, but doesn't include
the header that defines it.

This causes a build error after the imx pinctrl driver started using syscon: include/linux/mfd/syscon.h: In function 'syscon_node_to_regmap': include/linux/mfd/syscon.h:32:18: error: 'ENOTSUPP' undeclared (first use in this function) return ERR_PTR(-ENOTSUPP); ^~~~~~~~ This adds the missing #include. Signed-off-by: Arnd Bergmann Fixes: 8626ada871f1 ("pinctrl: imx: attach iomuxc device to gpr syscon") Signed-off-by: Lee Jones --- include/linux/mfd/syscon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/syscon.h b/include/linux/mfd/syscon.h index 1088149be0c9..40a76b97b7ab 100644 --- a/include/linux/mfd/syscon.h +++ b/include/linux/mfd/syscon.h @@ -16,6 +16,7 @@ #define __LINUX_MFD_SYSCON_H__ #include +#include struct device_node; -- cgit v1.2.3 From a16d6ebca6efb73f6402f36e5aebf84f61721856 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 3 Apr 2016 20:44:45 +0200 Subject: i2c: introduce helper function to get 8 bit address from a message Drivers do this in various ways, let's use one standard way of doing it. Note: I2C_M_RD is bit 0, so the code could be simplified. To be extremely robust and to advertise good coding practices, I still use the ternary operator and let the compilers do the optimizing job. Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 200cf13b00f6..c30833b7b073 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -654,6 +654,11 @@ static inline int i2c_adapter_id(struct i2c_adapter *adap) return adap->nr; } +static inline u8 i2c_8bit_addr_from_msg(const struct i2c_msg *msg) +{ + return (msg->addr << 1) | (msg->flags & I2C_M_RD ? 1 : 0); +} + /** * module_i2c_driver() - Helper macro for registering a modular I2C driver * @__i2c_driver: i2c_driver struct -- cgit v1.2.3 From f0af9593372abfde34460aa1250e670cc535a7d8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 24 Feb 2016 13:43:45 -0600 Subject: PCI: Add pci_add_dma_alias() to abstract implementation Add a pci_add_dma_alias() interface to encapsulate the details of adding an alias. No functional change intended. Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson --- drivers/pci/pci.c | 14 ++++++++++++++ drivers/pci/quirks.c | 19 +++++++------------ include/linux/pci.h | 1 + 3 files changed, 22 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 25e0327d4429..1162118d1093 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4578,6 +4578,20 @@ int pci_set_vga_state(struct pci_dev *dev, bool decode, return 0; } +/** + * pci_add_dma_alias - Add a DMA devfn alias for a device + * @dev: the PCI device for which alias is added + * @devfn: alias slot and function + * + * This helper encodes 8-bit devfn as bit number in dma_alias_mask. + * It should be called early, preferably as PCI fixup header quirk.
+ */ +void pci_add_dma_alias(struct pci_dev *dev, u8 devfn) +{ + dev->dma_alias_devfn = devfn; + dev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN; +} + bool pci_device_is_present(struct pci_dev *pdev) { u32 v; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 8e678027b900..e45a7a8338bb 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3610,10 +3610,8 @@ int pci_dev_specific_reset(struct pci_dev *dev, int probe) static void quirk_dma_func0_alias(struct pci_dev *dev) { - if (PCI_FUNC(dev->devfn) != 0) { - dev->dma_alias_devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); - dev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN; - } + if (PCI_FUNC(dev->devfn) != 0) + pci_add_dma_alias(dev, PCI_DEVFN(PCI_SLOT(dev->devfn), 0)); } /* @@ -3626,10 +3624,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_RICOH, 0xe476, quirk_dma_func0_alias); static void quirk_dma_func1_alias(struct pci_dev *dev) { - if (PCI_FUNC(dev->devfn) != 1) { - dev->dma_alias_devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 1); - dev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN; - } + if (PCI_FUNC(dev->devfn) != 1) + pci_add_dma_alias(dev, PCI_DEVFN(PCI_SLOT(dev->devfn), 1)); } /* @@ -3696,11 +3692,10 @@ static void quirk_fixed_dma_alias(struct pci_dev *dev) id = pci_match_id(fixed_dma_alias_tbl, dev); if (id) { - dev->dma_alias_devfn = id->driver_data; - dev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN; + pci_add_dma_alias(dev, id->driver_data); dev_info(&dev->dev, "Enabling fixed DMA alias to %02x.%d\n", - PCI_SLOT(dev->dma_alias_devfn), - PCI_FUNC(dev->dma_alias_devfn), + PCI_SLOT(id->driver_data), + PCI_FUNC(id->driver_data)); } } diff --git a/include/linux/pci.h b/include/linux/pci.h index 004b8133417d..7e7019064437 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1988,6 +1988,7 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev) } #endif +void pci_add_dma_alias(struct pci_dev *dev, u8 devfn); int pci_for_each_dma_alias(struct pci_dev *pdev, int (*fn)(struct pci_dev *pdev, u16 alias, void *data), void *data); -- cgit v1.2.3 From 338c3149a221527e202ee26b1e35f76c965bb6c0 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Thu, 3 Mar 2016 15:38:02 +0100 Subject: PCI: Add support for multiple DMA aliases Solve IOMMU support issues with PCIe non-transparent bridges that use Requester ID look-up tables (RID-LUT), e.g., the PEX8733. The NTB connects devices in two independent PCI domains. Devices separated by the NTB are not able to discover each other. A PCI packet being forwarded from one domain to another has to have its RID modified so it appears on the correct bus and completions are forwarded back to the original domain through the NTB. The RID is translated using a preprogrammed table (LUT) and the PCI packet propagates upstream away from the NTB. If the destination system has IOMMU enabled, the packet will be discarded because the new RID is unknown to the IOMMU. Adding a DMA alias for the new RID allows the IOMMU to properly recognize the packet. Each device behind the NTB has a unique RID assigned in the RID-LUT. The current DMA alias implementation supports only a single alias, so it's not possible to support multiple devices behind the NTB when IOMMU is enabled. Enable all possible aliases on a given bus (256) that are stored in a bitset. Alias devfn is directly translated to a bit number. The bitset is not allocated for devices that have no need for DMA aliases.
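To illustrate how the interface is meant to be used, a header fixup for such an NTB could register one alias per RID-LUT entry. The quirk below is a hypothetical sketch (the devfn range and the PEX8733 device ID are invented for illustration); pci_add_dma_alias() and DECLARE_PCI_FIXUP_HEADER() are the real interfaces used by the quirks in the diff above:

/* Hypothetical quirk, in the style of drivers/pci/quirks.c */
static void quirk_ntb_lut_dma_alias(struct pci_dev *pdev)
{
	u8 devfn;

	/* Suppose the NTB's RID-LUT translates peer requesters to
	 * devfns 01.0 through 01.7 on this side of the bridge; each
	 * call sets one bit in pdev->dma_alias_mask so the IOMMU
	 * accepts DMA tagged with that requester ID. */
	for (devfn = PCI_DEVFN(1, 0); devfn <= PCI_DEVFN(1, 7); devfn++)
		pci_add_dma_alias(pdev, devfn);
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_PLX, 0x8733, quirk_ntb_lut_dma_alias);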
More details can be found in the following article: http://www.plxtech.com/files/pdf/technical/expresslane/RTC_Enabling%20MulitHostSystemDesigns.pdf Signed-off-by: Jacek Lawrynowicz Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson Acked-by: David Woodhouse Acked-by: Joerg Roedel --- drivers/iommu/iommu.c | 10 +++------- drivers/pci/pci.c | 19 +++++++++++++++++-- drivers/pci/probe.c | 1 + drivers/pci/search.c | 14 +++++++++----- include/linux/pci.h | 5 ++--- 5 files changed, 32 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index bfd4f7c3b1d8..1b49e940a318 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -660,8 +660,8 @@ static struct iommu_group *get_pci_function_alias_group(struct pci_dev *pdev, } /* - * Look for aliases to or from the given device for exisiting groups. The - * dma_alias_devfn only supports aliases on the same bus, therefore the search + * Look for aliases to or from the given device for existing groups. DMA + * aliases are only supported on the same bus, therefore the search * space is quite small (especially since we're really only looking at pcie * device, and therefore only expect multiple slots on the root complex or * downstream switch ports). It's conceivable though that a pair of @@ -686,11 +686,7 @@ static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev, continue; /* We alias them or they alias us */ - if (((pdev->dev_flags & PCI_DEV_FLAGS_DMA_ALIAS_DEVFN) && - pdev->dma_alias_devfn == tmp->devfn) || - ((tmp->dev_flags & PCI_DEV_FLAGS_DMA_ALIAS_DEVFN) && - tmp->dma_alias_devfn == pdev->devfn)) { - + if (pci_devs_are_dma_aliases(pdev, tmp)) { group = get_pci_alias_group(tmp, devfns); if (group) { pci_dev_put(tmp); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c82ebd0f6982..0b90c2186f1c 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4588,12 +4588,27 @@ int pci_set_vga_state(struct pci_dev *dev, bool decode, */ void pci_add_dma_alias(struct pci_dev *dev, u8 devfn) { - dev->dma_alias_devfn = devfn; - dev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN; + if (!dev->dma_alias_mask) + dev->dma_alias_mask = kcalloc(BITS_TO_LONGS(U8_MAX), + sizeof(long), GFP_KERNEL); + if (!dev->dma_alias_mask) { + dev_warn(&dev->dev, "Unable to allocate DMA alias mask\n"); + return; + } + + set_bit(devfn, dev->dma_alias_mask); dev_info(&dev->dev, "Enabling fixed DMA alias to %02x.%d\n", PCI_SLOT(devfn), PCI_FUNC(devfn)); } +bool pci_devs_are_dma_aliases(struct pci_dev *dev1, struct pci_dev *dev2) +{ + return (dev1->dma_alias_mask && + test_bit(dev2->devfn, dev1->dma_alias_mask)) || + (dev2->dma_alias_mask && + test_bit(dev1->devfn, dev2->dma_alias_mask)); +} + bool pci_device_is_present(struct pci_dev *pdev) { u32 v; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8004f67c57ec..ae7daeb83e21 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1537,6 +1537,7 @@ static void pci_release_dev(struct device *dev) pcibios_release_device(pci_dev); pci_bus_put(pci_dev->bus); kfree(pci_dev->driver_override); + kfree(pci_dev->dma_alias_mask); kfree(pci_dev); } diff --git a/drivers/pci/search.c b/drivers/pci/search.c index a20ce7d5e2a7..33e0f033a48e 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -40,11 +40,15 @@ int pci_for_each_dma_alias(struct pci_dev *pdev, * If the device is broken and uses an alias requester ID for * DMA, iterate over that too. 
*/ - if (unlikely(pdev->dev_flags & PCI_DEV_FLAGS_DMA_ALIAS_DEVFN)) { - ret = fn(pdev, PCI_DEVID(pdev->bus->number, - pdev->dma_alias_devfn), data); - if (ret) - return ret; + if (unlikely(pdev->dma_alias_mask)) { + u8 devfn; + + for_each_set_bit(devfn, pdev->dma_alias_mask, U8_MAX) { + ret = fn(pdev, PCI_DEVID(pdev->bus->number, devfn), + data); + if (ret) + return ret; + } } for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) { diff --git a/include/linux/pci.h b/include/linux/pci.h index 7e7019064437..5581d05f7833 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -166,8 +166,6 @@ enum pci_dev_flags { PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) (1 << 2), /* Flag for quirk use to store if quirk-specific ACS is enabled */ PCI_DEV_FLAGS_ACS_ENABLED_QUIRK = (__force pci_dev_flags_t) (1 << 3), - /* Flag to indicate the device uses dma_alias_devfn */ - PCI_DEV_FLAGS_DMA_ALIAS_DEVFN = (__force pci_dev_flags_t) (1 << 4), /* Use a PCIe-to-PCI bridge alias even if !pci_is_pcie */ PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS = (__force pci_dev_flags_t) (1 << 5), /* Do not use bus resets for device */ @@ -273,7 +271,7 @@ struct pci_dev { u8 rom_base_reg; /* which config register controls the ROM */ u8 pin; /* which interrupt pin this device uses */ u16 pcie_flags_reg; /* cached PCIe Capabilities Register */ - u8 dma_alias_devfn;/* devfn of DMA alias, if any */ + unsigned long *dma_alias_mask;/* mask of enabled devfn aliases */ struct pci_driver *driver; /* which driver has allocated this device */ u64 dma_mask; /* Mask of the bits of bus address this @@ -1989,6 +1987,7 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev) #endif void pci_add_dma_alias(struct pci_dev *dev, u8 devfn); +bool pci_devs_are_dma_aliases(struct pci_dev *dev1, struct pci_dev *dev2); int pci_for_each_dma_alias(struct pci_dev *pdev, int (*fn)(struct pci_dev *pdev, u16 alias, void *data), void *data); -- cgit v1.2.3 From 5ac7eace2d00eab5ae0e9fdee63e38aee6001f7c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Apr 2016 16:14:24 +0100 Subject: KEYS: Add a facility to restrict new links into a keyring Add a facility whereby proposed new links to be added to a keyring can be vetted, permitting them to be rejected if necessary. This can be used to block public keys from which the signature cannot be verified or for which the signature verification fails. It could also be used to provide blacklisting. This affects operations like add_key(), KEYCTL_LINK and KEYCTL_INSTANTIATE. To this end: (1) A function pointer is added to the key struct that, if set, points to the vetting function. This is called as: int (*restrict_link)(struct key *keyring, const struct key_type *key_type, unsigned long key_flags, const union key_payload *key_payload), where 'keyring' will be the keyring being added to, key_type and key_payload will describe the key being added and key_flags[*] can be AND'ed with KEY_FLAG_TRUSTED. [*] This parameter will be removed in a later patch when KEY_FLAG_TRUSTED is removed. The function should return 0 to allow the link to take place or an error (typically -ENOKEY, -ENOPKG or -EKEYREJECTED) to reject the link. The pointer should not be set directly, but rather should be set through keyring_alloc(). Note that if called during add_key(), preparse is called before this method, but a key isn't actually allocated until after this function is called. (2) KEY_ALLOC_BYPASS_RESTRICTION is added. 
This can be passed to key_create_or_update() or key_instantiate_and_link() to bypass the restriction check. (3) KEY_FLAG_TRUSTED_ONLY is removed. The entire contents of a keyring with this restriction emplaced can be considered 'trustworthy' by virtue of being in the keyring when that keyring is consulted. (4) key_alloc() and keyring_alloc() take an extra argument that will be used to set restrict_link in the new key. This ensures that the pointer is set before the key is published, thus preventing a window of unrestrictedness. Normally this argument will be NULL. (5) As a temporary affair, keyring_restrict_trusted_only() is added. It should be passed to keyring_alloc() as the extra argument instead of setting KEY_FLAG_TRUSTED_ONLY on a keyring. This will be replaced in a later patch with functions that look in the appropriate places for authoritative keys. Signed-off-by: David Howells Reviewed-by: Mimi Zohar --- Documentation/security/keys.txt | 22 ++++++++++++ certs/system_keyring.c | 8 ++--- fs/cifs/cifsacl.c | 2 +- fs/nfs/nfs4idmap.c | 2 +- include/linux/key.h | 53 +++++++++++++++++++++++------ net/dns_resolver/dns_key.c | 2 +- net/rxrpc/ar-key.c | 4 +-- security/integrity/digsig.c | 7 ++-- security/integrity/ima/ima_mok.c | 8 ++--- security/keys/key.c | 43 +++++++++++++++++++---- security/keys/keyring.c | 73 ++++++++++++++++++++++++++++++++++++---- security/keys/persistent.c | 4 +-- security/keys/process_keys.c | 16 +++++---- security/keys/request_key.c | 4 +-- security/keys/request_key_auth.c | 2 +- 15 files changed, 198 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index 8c183873b2b7..a6a50b359025 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -999,6 +999,10 @@ payload contents" for more information. struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, const struct cred *cred, key_perm_t perm, + int (*restrict_link)(struct key *, + const struct key_type *, + unsigned long, + const union key_payload *), unsigned long flags, struct key *dest); @@ -1010,6 +1014,24 @@ payload contents" for more information. KEY_ALLOC_NOT_IN_QUOTA in flags if the keyring shouldn't be accounted towards the user's quota). Error ENOMEM can also be returned. + If restrict_link not NULL, it should point to a function that will be + called each time an attempt is made to link a key into the new keyring. + This function is called to check whether a key may be added into the keying + or not. Callers of key_create_or_update() within the kernel can pass + KEY_ALLOC_BYPASS_RESTRICTION to suppress the check. An example of using + this is to manage rings of cryptographic keys that are set up when the + kernel boots where userspace is also permitted to add keys - provided they + can be verified by a key the kernel already has. + + When called, the restriction function will be passed the keyring being + added to, the key flags value and the type and payload of the key being + added. Note that when a new key is being created, this is called between + payload preparsing and actual key creation. The function should return 0 + to allow the link or an error to reject it. + + A convenience function, restrict_link_reject, exists to always return + -EPERM to in this case. 
+ (*) To check the validity of a key, this function can be called: diff --git a/certs/system_keyring.c b/certs/system_keyring.c index dc18869ff680..417d65882870 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -36,11 +36,10 @@ static __init int system_trusted_keyring_init(void) KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, + keyring_restrict_trusted_only, NULL); if (IS_ERR(system_trusted_keyring)) panic("Can't allocate system trusted keyring\n"); - - set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); return 0; } @@ -85,7 +84,8 @@ static __init int load_system_certificate_list(void) KEY_USR_VIEW | KEY_USR_READ), KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_TRUSTED | - KEY_ALLOC_BUILT_IN); + KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); if (IS_ERR(key)) { pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", PTR_ERR(key)); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 3f93125916bf..71e8a56e9479 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -360,7 +360,7 @@ init_cifs_idmap(void) GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 5ba22c6b0ffa..c444285bb1b1 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -201,7 +201,7 @@ int nfs_idmap_init(void) GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; diff --git a/include/linux/key.h b/include/linux/key.h index 5f5b1129dc92..83b603639d2e 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -174,10 +174,9 @@ struct key { #define KEY_FLAG_ROOT_CAN_CLEAR 6 /* set if key can be cleared by root without permission */ #define KEY_FLAG_INVALIDATED 7 /* set if key has been invalidated */ #define KEY_FLAG_TRUSTED 8 /* set if key is trusted */ -#define KEY_FLAG_TRUSTED_ONLY 9 /* set if keyring only accepts links to trusted keys */ -#define KEY_FLAG_BUILTIN 10 /* set if key is builtin */ -#define KEY_FLAG_ROOT_CAN_INVAL 11 /* set if key can be invalidated by root without permission */ -#define KEY_FLAG_KEEP 12 /* set if key should not be removed */ +#define KEY_FLAG_BUILTIN 9 /* set if key is built in to the kernel */ +#define KEY_FLAG_ROOT_CAN_INVAL 10 /* set if key can be invalidated by root without permission */ +#define KEY_FLAG_KEEP 11 /* set if key should not be removed */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -205,6 +204,21 @@ struct key { }; int reject_error; }; + + /* This is set on a keyring to restrict the addition of a link to a key + * to it. If this method isn't provided then it is assumed that the + * keyring is open to any addition. It is ignored for non-keyring + * keys. + * + * This is intended for use with rings of trusted keys whereby addition + * to the keyring needs to be controlled. KEY_ALLOC_BYPASS_RESTRICTION + * overrides this, allowing the kernel to add extra keys without + * restriction. 
+ */ + int (*restrict_link)(struct key *keyring, + const struct key_type *type, + unsigned long flags, + const union key_payload *payload); }; extern struct key *key_alloc(struct key_type *type, @@ -212,14 +226,19 @@ extern struct key *key_alloc(struct key_type *type, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, - unsigned long flags); + unsigned long flags, + int (*restrict_link)(struct key *, + const struct key_type *, + unsigned long, + const union key_payload *)); -#define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ -#define KEY_ALLOC_QUOTA_OVERRUN 0x0001 /* add to quota, permit even if overrun */ -#define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ -#define KEY_ALLOC_TRUSTED 0x0004 /* Key should be flagged as trusted */ -#define KEY_ALLOC_BUILT_IN 0x0008 /* Key is built into kernel */ +#define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ +#define KEY_ALLOC_QUOTA_OVERRUN 0x0001 /* add to quota, permit even if overrun */ +#define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ +#define KEY_ALLOC_TRUSTED 0x0004 /* Key should be flagged as trusted */ +#define KEY_ALLOC_BUILT_IN 0x0008 /* Key is built into kernel */ +#define KEY_ALLOC_BYPASS_RESTRICTION 0x0010 /* Override the check on restricted keyrings */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); @@ -288,8 +307,22 @@ extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid const struct cred *cred, key_perm_t perm, unsigned long flags, + int (*restrict_link)(struct key *, + const struct key_type *, + unsigned long, + const union key_payload *), struct key *dest); +extern int keyring_restrict_trusted_only(struct key *keyring, + const struct key_type *type, + unsigned long, + const union key_payload *payload); + +extern int restrict_link_reject(struct key *keyring, + const struct key_type *type, + unsigned long flags, + const union key_payload *payload); + extern int keyring_clear(struct key *keyring); extern key_ref_t keyring_search(key_ref_t keyring, diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index c79b85eb4d4c..8737412c7b27 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -281,7 +281,7 @@ static int __init init_dns_resolver(void) GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 3fb492eedeb9..1021b4c0bdd2 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -965,7 +965,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, key = key_alloc(&key_type_rxrpc, "x", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0, - KEY_ALLOC_NOT_IN_QUOTA); + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) { _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); return -ENOMEM; @@ -1012,7 +1012,7 @@ struct key *rxrpc_get_null_key(const char *keyname) key = key_alloc(&key_type_rxrpc, keyname, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA); + KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) return key; diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index 8ef15118cc78..659566c2200b 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -83,10 +83,9 @@ int __init integrity_init_keyring(const unsigned int id) 
((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH), - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (!IS_ERR(keyring[id])) - set_bit(KEY_FLAG_TRUSTED_ONLY, &keyring[id]->flags); - else { + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(keyring[id])) { err = PTR_ERR(keyring[id]); pr_info("Can't allocate %s keyring (%d)\n", keyring_name[id], err); diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c index 676885e4320e..ef91248cb934 100644 --- a/security/integrity/ima/ima_mok.c +++ b/security/integrity/ima/ima_mok.c @@ -35,20 +35,20 @@ __init int ima_mok_init(void) (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, + keyring_restrict_trusted_only, NULL); ima_blacklist_keyring = keyring_alloc(".ima_blacklist", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, + keyring_restrict_trusted_only, NULL); if (IS_ERR(ima_mok_keyring) || IS_ERR(ima_blacklist_keyring)) panic("Can't allocate IMA MOK or blacklist keyrings."); - set_bit(KEY_FLAG_TRUSTED_ONLY, &ima_mok_keyring->flags); - set_bit(KEY_FLAG_TRUSTED_ONLY, &ima_blacklist_keyring->flags); set_bit(KEY_FLAG_KEEP, &ima_blacklist_keyring->flags); return 0; } diff --git a/security/keys/key.c b/security/keys/key.c index b28755131687..deb881754e03 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -201,6 +201,7 @@ serial_exists: * @cred: The credentials specifying UID namespace. * @perm: The permissions mask of the new key. * @flags: Flags specifying quota properties. + * @restrict_link: Optional link restriction method for new keyrings. * * Allocate a key of the specified type with the attributes given. 
The key is * returned in an uninstantiated state and the caller needs to instantiate the @@ -223,7 +224,11 @@ serial_exists: */ struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, - key_perm_t perm, unsigned long flags) + key_perm_t perm, unsigned long flags, + int (*restrict_link)(struct key *, + const struct key_type *, + unsigned long, + const union key_payload *)) { struct key_user *user = NULL; struct key *key; @@ -291,6 +296,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, key->uid = uid; key->gid = gid; key->perm = perm; + key->restrict_link = restrict_link; if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) key->flags |= 1 << KEY_FLAG_IN_QUOTA; @@ -496,6 +502,12 @@ int key_instantiate_and_link(struct key *key, } if (keyring) { + if (keyring->restrict_link) { + ret = keyring->restrict_link(keyring, key->type, + key->flags, &prep.payload); + if (ret < 0) + goto error; + } ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error; @@ -551,8 +563,12 @@ int key_reject_and_link(struct key *key, awaken = 0; ret = -EBUSY; - if (keyring) + if (keyring) { + if (keyring->restrict_link) + return -EPERM; + link_ret = __key_link_begin(keyring, &key->index_key, &edit); + } mutex_lock(&key_construction_mutex); @@ -793,6 +809,10 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, struct key *keyring, *key = NULL; key_ref_t key_ref; int ret; + int (*restrict_link)(struct key *, + const struct key_type *, + unsigned long, + const union key_payload *) = NULL; /* look up the key type to see if it's one of the registered kernel * types */ @@ -811,6 +831,10 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, key_check(keyring); + key_ref = ERR_PTR(-EPERM); + if (!(flags & KEY_ALLOC_BYPASS_RESTRICTION)) + restrict_link = keyring->restrict_link; + key_ref = ERR_PTR(-ENOTDIR); if (keyring->type != &key_type_keyring) goto error_put_type; @@ -835,10 +859,15 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, } index_key.desc_len = strlen(index_key.description); - key_ref = ERR_PTR(-EPERM); - if (!prep.trusted && test_bit(KEY_FLAG_TRUSTED_ONLY, &keyring->flags)) - goto error_free_prep; - flags |= prep.trusted ? KEY_ALLOC_TRUSTED : 0; + if (restrict_link) { + unsigned long kflags = prep.trusted ? 
KEY_FLAG_TRUSTED : 0; + ret = restrict_link(keyring, + index_key.type, kflags, &prep.payload); + if (ret < 0) { + key_ref = ERR_PTR(ret); + goto error_free_prep; + } + } ret = __key_link_begin(keyring, &index_key, &edit); if (ret < 0) { @@ -879,7 +908,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, /* allocate a new key */ key = key_alloc(index_key.type, index_key.description, - cred->fsuid, cred->fsgid, cred, perm, flags); + cred->fsuid, cred->fsgid, cred, perm, flags, NULL); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error_link_end; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index f931ccfeefb0..d2d1f3378008 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -491,13 +491,18 @@ static long keyring_read(const struct key *keyring, */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, - unsigned long flags, struct key *dest) + unsigned long flags, + int (*restrict_link)(struct key *, + const struct key_type *, + unsigned long, + const union key_payload *), + struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, - uid, gid, cred, perm, flags); + uid, gid, cred, perm, flags, restrict_link); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { @@ -510,6 +515,51 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, } EXPORT_SYMBOL(keyring_alloc); +/** + * keyring_restrict_trusted_only - Restrict additions to a keyring to trusted keys only + * @keyring: The keyring being added to. + * @type: The type of key being added. + * @flags: The key flags. + * @payload: The payload of the key intended to be added. + * + * Reject the addition of any links to a keyring that point to keys that aren't + * marked as being trusted. It can be overridden by passing + * KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when adding a key + * to a keyring. + * + * This is meant to be passed as the restrict_link parameter to + * keyring_alloc(). + */ +int keyring_restrict_trusted_only(struct key *keyring, + const struct key_type *type, + unsigned long flags, + const union key_payload *payload) +{ + return flags & KEY_FLAG_TRUSTED ? 0 : -EPERM; +} + +/** + * restrict_link_reject - Give -EPERM to restrict link + * @keyring: The keyring being added to. + * @type: The type of key being added. + * @flags: The key flags. + * @payload: The payload of the key intended to be added. + * + * Reject the addition of any links to a keyring. It can be overridden by + * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when + * adding a key to a keyring. + * + * This is meant to be passed as the restrict_link parameter to + * keyring_alloc(). + */ +int restrict_link_reject(struct key *keyring, + const struct key_type *type, + unsigned long flags, + const union key_payload *payload) +{ + return -EPERM; +} + /* * By default, we keys found by getting an exact match on their descriptions. */ @@ -1191,6 +1241,17 @@ void __key_link_end(struct key *keyring, up_write(&keyring->sem); } +/* + * Check addition of keys to restricted keyrings. + */ +static int __key_link_check_restriction(struct key *keyring, struct key *key) +{ + if (!keyring->restrict_link) + return 0; + return keyring->restrict_link(keyring, + key->type, key->flags, &key->payload); +} + /** * key_link - Link a key to a keyring * @keyring: The keyring to make the link in. 
@@ -1221,14 +1282,12 @@ int key_link(struct key *keyring, struct key *key) key_check(keyring); key_check(key); - if (test_bit(KEY_FLAG_TRUSTED_ONLY, &keyring->flags) && - !test_bit(KEY_FLAG_TRUSTED, &key->flags)) - return -EPERM; - ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret == 0) { kdebug("begun {%d,%d}", keyring->serial, atomic_read(&keyring->usage)); - ret = __key_link_check_live_key(keyring, key); + ret = __key_link_check_restriction(keyring, key); + if (ret == 0) + ret = __key_link_check_live_key(keyring, key); if (ret == 0) __key_link(key, &edit); __key_link_end(keyring, &key->index_key, edit); diff --git a/security/keys/persistent.c b/security/keys/persistent.c index c9fae5ea89fe..2ef45b319dd9 100644 --- a/security/keys/persistent.c +++ b/security/keys/persistent.c @@ -26,7 +26,7 @@ static int key_create_persistent_register(struct user_namespace *ns) current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(reg)) return PTR_ERR(reg); @@ -60,7 +60,7 @@ static key_ref_t key_create_persistent(struct user_namespace *ns, kuid_t uid, uid, INVALID_GID, current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA, + KEY_ALLOC_NOT_IN_QUOTA, NULL, ns->persistent_keyring_register); if (IS_ERR(persistent)) return ERR_CAST(persistent); diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index e6d50172872f..40a885239782 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -76,7 +76,8 @@ int install_user_keyrings(void) if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, NULL); + KEY_ALLOC_IN_QUOTA, + NULL, NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; @@ -92,7 +93,8 @@ int install_user_keyrings(void) session_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, NULL); + KEY_ALLOC_IN_QUOTA, + NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; @@ -134,7 +136,8 @@ int install_thread_keyring_to_cred(struct cred *new) keyring = keyring_alloc("_tid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, - KEY_ALLOC_QUOTA_OVERRUN, NULL); + KEY_ALLOC_QUOTA_OVERRUN, + NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); @@ -180,7 +183,8 @@ int install_process_keyring_to_cred(struct cred *new) keyring = keyring_alloc("_pid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, - KEY_ALLOC_QUOTA_OVERRUN, NULL); + KEY_ALLOC_QUOTA_OVERRUN, + NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); @@ -231,7 +235,7 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, - flags, NULL); + flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { @@ -785,7 +789,7 @@ long join_session_keyring(const char *name) keyring = keyring_alloc( name, old->uid, old->gid, old, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, - KEY_ALLOC_IN_QUOTA, NULL); + KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; diff --git a/security/keys/request_key.c b/security/keys/request_key.c index c7a117c9a8f3..a29e3554751e 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -116,7 +116,7 @@ static int 
call_sbin_request_key(struct key_construction *cons, cred = get_current_cred(); keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_QUOTA_OVERRUN, NULL); + KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); put_cred(cred); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); @@ -355,7 +355,7 @@ static int construct_alloc_key(struct keyring_search_context *ctx, key = key_alloc(ctx->index_key.type, ctx->index_key.description, ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred, - perm, flags); + perm, flags, NULL); if (IS_ERR(key)) goto alloc_failed; diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 4f0f112fe276..9db8b4a82787 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -202,7 +202,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, authkey = key_alloc(&key_type_request_key_auth, desc, cred->fsuid, cred->fsgid, cred, KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | - KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA); + KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(authkey)) { ret = PTR_ERR(authkey); goto error_alloc; -- cgit v1.2.3 From 77f68bac9481ad440f4f34dda3d28c2dce6eb87b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Apr 2016 16:14:26 +0100 Subject: KEYS: Remove KEY_FLAG_TRUSTED and KEY_ALLOC_TRUSTED Remove KEY_FLAG_TRUSTED and KEY_ALLOC_TRUSTED as they're no longer meaningful. Also we can drop the trusted flag from the preparse structure. Given this, we no longer need to pass the key flags through to restrict_link(). Further, we can now get rid of keyring_restrict_trusted_only() also. Signed-off-by: David Howells --- certs/system_keyring.c | 2 -- include/keys/system_keyring.h | 1 - include/linux/key-type.h | 1 - include/linux/key.h | 21 +++++---------------- security/integrity/digsig.c | 3 +-- security/keys/key.c | 11 ++--------- security/keys/keyring.c | 29 +---------------------------- 7 files changed, 9 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/certs/system_keyring.c b/certs/system_keyring.c index 4e2fa8ab01d6..e460d00a7781 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -31,7 +31,6 @@ extern __initconst const unsigned long system_certificate_list_size; */ int restrict_link_by_builtin_trusted(struct key *keyring, const struct key_type *type, - unsigned long flags, const union key_payload *payload) { return restrict_link_by_signature(system_trusted_keyring, @@ -97,7 +96,6 @@ static __init int load_system_certificate_list(void) ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ), KEY_ALLOC_NOT_IN_QUOTA | - KEY_ALLOC_TRUSTED | KEY_ALLOC_BUILT_IN | KEY_ALLOC_BYPASS_RESTRICTION); if (IS_ERR(key)) { diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h index 93715913a0b1..c72330ae76df 100644 --- a/include/keys/system_keyring.h +++ b/include/keys/system_keyring.h @@ -18,7 +18,6 @@ extern int restrict_link_by_builtin_trusted(struct key *keyring, const struct key_type *type, - unsigned long flags, const union key_payload *payload); #else diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 7463355a198b..eaee981c5558 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -45,7 +45,6 @@ struct key_preparsed_payload { size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ time_t expiry; /* Expiry time of key */ - bool trusted; /* True if key is trusted */ }; typedef int 
(*request_key_actor_t)(struct key_construction *key, diff --git a/include/linux/key.h b/include/linux/key.h index 83b603639d2e..722914798f37 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -173,10 +173,9 @@ struct key { #define KEY_FLAG_NEGATIVE 5 /* set if key is negative */ #define KEY_FLAG_ROOT_CAN_CLEAR 6 /* set if key can be cleared by root without permission */ #define KEY_FLAG_INVALIDATED 7 /* set if key has been invalidated */ -#define KEY_FLAG_TRUSTED 8 /* set if key is trusted */ -#define KEY_FLAG_BUILTIN 9 /* set if key is built in to the kernel */ -#define KEY_FLAG_ROOT_CAN_INVAL 10 /* set if key can be invalidated by root without permission */ -#define KEY_FLAG_KEEP 11 /* set if key should not be removed */ +#define KEY_FLAG_BUILTIN 8 /* set if key is built in to the kernel */ +#define KEY_FLAG_ROOT_CAN_INVAL 9 /* set if key can be invalidated by root without permission */ +#define KEY_FLAG_KEEP 10 /* set if key should not be removed */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -217,7 +216,6 @@ struct key { */ int (*restrict_link)(struct key *keyring, const struct key_type *type, - unsigned long flags, const union key_payload *payload); }; @@ -229,16 +227,14 @@ extern struct key *key_alloc(struct key_type *type, unsigned long flags, int (*restrict_link)(struct key *, const struct key_type *, - unsigned long, const union key_payload *)); #define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ #define KEY_ALLOC_QUOTA_OVERRUN 0x0001 /* add to quota, permit even if overrun */ #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ -#define KEY_ALLOC_TRUSTED 0x0004 /* Key should be flagged as trusted */ -#define KEY_ALLOC_BUILT_IN 0x0008 /* Key is built into kernel */ -#define KEY_ALLOC_BYPASS_RESTRICTION 0x0010 /* Override the check on restricted keyrings */ +#define KEY_ALLOC_BUILT_IN 0x0004 /* Key is built into kernel */ +#define KEY_ALLOC_BYPASS_RESTRICTION 0x0008 /* Override the check on restricted keyrings */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); @@ -309,18 +305,11 @@ extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid unsigned long flags, int (*restrict_link)(struct key *, const struct key_type *, - unsigned long, const union key_payload *), struct key *dest); -extern int keyring_restrict_trusted_only(struct key *keyring, - const struct key_type *type, - unsigned long, - const union key_payload *payload); - extern int restrict_link_reject(struct key *keyring, const struct key_type *type, - unsigned long flags, const union key_payload *payload); extern int keyring_clear(struct key *keyring); diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index d647178c6bbd..98ee4c752cf5 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -51,12 +51,11 @@ static bool init_keyring __initdata; */ static int restrict_link_by_ima_mok(struct key *keyring, const struct key_type *type, - unsigned long flags, const union key_payload *payload) { int ret; - ret = restrict_link_by_builtin_trusted(keyring, type, flags, payload); + ret = restrict_link_by_builtin_trusted(keyring, type, payload); if (ret != -ENOKEY) return ret; diff --git a/security/keys/key.c b/security/keys/key.c index deb881754e03..bd5a272f28a6 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -227,7 +227,6 @@ struct key *key_alloc(struct key_type *type, const char *desc, key_perm_t perm, unsigned long 
flags, int (*restrict_link)(struct key *, const struct key_type *, - unsigned long, const union key_payload *)) { struct key_user *user = NULL; @@ -300,8 +299,6 @@ struct key *key_alloc(struct key_type *type, const char *desc, if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) key->flags |= 1 << KEY_FLAG_IN_QUOTA; - if (flags & KEY_ALLOC_TRUSTED) - key->flags |= 1 << KEY_FLAG_TRUSTED; if (flags & KEY_ALLOC_BUILT_IN) key->flags |= 1 << KEY_FLAG_BUILTIN; @@ -504,7 +501,7 @@ int key_instantiate_and_link(struct key *key, if (keyring) { if (keyring->restrict_link) { ret = keyring->restrict_link(keyring, key->type, - key->flags, &prep.payload); + &prep.payload); if (ret < 0) goto error; } @@ -811,7 +808,6 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, int ret; int (*restrict_link)(struct key *, const struct key_type *, - unsigned long, const union key_payload *) = NULL; /* look up the key type to see if it's one of the registered kernel @@ -843,7 +839,6 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, prep.data = payload; prep.datalen = plen; prep.quotalen = index_key.type->def_datalen; - prep.trusted = flags & KEY_ALLOC_TRUSTED; prep.expiry = TIME_T_MAX; if (index_key.type->preparse) { ret = index_key.type->preparse(&prep); @@ -860,9 +855,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, index_key.desc_len = strlen(index_key.description); if (restrict_link) { - unsigned long kflags = prep.trusted ? KEY_FLAG_TRUSTED : 0; - ret = restrict_link(keyring, - index_key.type, kflags, &prep.payload); + ret = restrict_link(keyring, index_key.type, &prep.payload); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index d2d1f3378008..c91e4e0cea08 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -494,7 +494,6 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, unsigned long flags, int (*restrict_link)(struct key *, const struct key_type *, - unsigned long, const union key_payload *), struct key *dest) { @@ -515,34 +514,10 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, } EXPORT_SYMBOL(keyring_alloc); -/** - * keyring_restrict_trusted_only - Restrict additions to a keyring to trusted keys only - * @keyring: The keyring being added to. - * @type: The type of key being added. - * @flags: The key flags. - * @payload: The payload of the key intended to be added. - * - * Reject the addition of any links to a keyring that point to keys that aren't - * marked as being trusted. It can be overridden by passing - * KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when adding a key - * to a keyring. - * - * This is meant to be passed as the restrict_link parameter to - * keyring_alloc(). - */ -int keyring_restrict_trusted_only(struct key *keyring, - const struct key_type *type, - unsigned long flags, - const union key_payload *payload) -{ - return flags & KEY_FLAG_TRUSTED ? 0 : -EPERM; -} - /** * restrict_link_reject - Give -EPERM to restrict link * @keyring: The keyring being added to. * @type: The type of key being added. - * @flags: The key flags. * @payload: The payload of the key intended to be added. * * Reject the addition of any links to a keyring. 
It can be overridden by @@ -554,7 +529,6 @@ int keyring_restrict_trusted_only(struct key *keyring, */ int restrict_link_reject(struct key *keyring, const struct key_type *type, - unsigned long flags, const union key_payload *payload) { return -EPERM; @@ -1248,8 +1222,7 @@ static int __key_link_check_restriction(struct key *keyring, struct key *key) { if (!keyring->restrict_link) return 0; - return keyring->restrict_link(keyring, - key->type, key->flags, &key->payload); + return keyring->restrict_link(keyring, key->type, &key->payload); } /** -- cgit v1.2.3 From 757d12e5849be549076901b0d33c60d5f360269c Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Tue, 12 Apr 2016 21:07:06 +0530 Subject: dmaengine: ensure dmaengine helpers check valid callback dmaengine has various device callbacks and exposes helper functions to invoke these. These helpers should check that the channel, device and callback are valid before invoking them. Reported-by: Jon Hunter Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 017433712833..30de0197263a 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -804,6 +804,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single( sg_dma_address(&sg) = buf; sg_dma_len(&sg) = len; + if (!chan || !chan->device || !chan->device->device_prep_slave_sg) + return NULL; + return chan->device->device_prep_slave_sg(chan, &sg, 1, dir, flags, NULL); } @@ -812,6 +815,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_sg( struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, enum dma_transfer_direction dir, unsigned long flags) { + if (!chan || !chan->device || !chan->device->device_prep_slave_sg) + return NULL; + return chan->device->device_prep_slave_sg(chan, sgl, sg_len, dir, flags, NULL); } @@ -823,6 +829,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_rio_sg( enum dma_transfer_direction dir, unsigned long flags, struct rio_dma_ext *rio_ext) { + if (!chan || !chan->device || !chan->device->device_prep_slave_sg) + return NULL; + return chan->device->device_prep_slave_sg(chan, sgl, sg_len, dir, flags, rio_ext); } @@ -833,6 +842,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic( size_t period_len, enum dma_transfer_direction dir, unsigned long flags) { + if (!chan || !chan->device || !chan->device->device_prep_dma_cyclic) + return NULL; + return chan->device->device_prep_dma_cyclic(chan, buf_addr, buf_len, period_len, dir, flags); } @@ -841,6 +853,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma( struct dma_chan *chan, struct dma_interleaved_template *xt, unsigned long flags) { + if (!chan || !chan->device || !chan->device->device_prep_interleaved_dma) + return NULL; + return chan->device->device_prep_interleaved_dma(chan, xt, flags); } @@ -848,7 +863,7 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memset( struct dma_chan *chan, dma_addr_t dest, int value, size_t len, unsigned long flags) { - if (!chan || !chan->device) + if (!chan || !chan->device || !chan->device->device_prep_dma_memset) return NULL; return chan->device->device_prep_dma_memset(chan, dest, value, @@ -861,6 +876,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg( struct dma_chan *chan, struct scatterlist *dst_sg, unsigned int dst_nents, struct scatterlist *src_sg, unsigned int src_nents, unsigned long flags) { + if (!chan
|| !chan->device || !chan->device->device_prep_dma_sg) + return NULL; + return chan->device->device_prep_dma_sg(chan, dst_sg, dst_nents, src_sg, src_nents, flags); } -- cgit v1.2.3 From 9fd4dcece43a53e5a9e65a973df5693702ee6401 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 22 Mar 2016 14:11:13 +0100 Subject: debugfs: prevent access to possibly dead file_operations at file open Nothing prevents a dentry found by path lookup before a return of __debugfs_remove() from actually getting opened after that return. Now, after the return of __debugfs_remove(), there are no guarantees whatsoever regarding the memory the corresponding inode's file_operations object had been kept in. Since __debugfs_remove() is seldom invoked, usually from module exit handlers only, the race is hard to trigger and the impact is very low. A discussion of the problem outlined above as well as a suggested solution can be found in the (sub-)thread rooted at http://lkml.kernel.org/g/20130401203445.GA20862@ZenIV.linux.org.uk ("Yet another pipe related oops.") Basically, Greg KH suggests introducing an intermediate fops and Al Viro points out that a pointer to the original ones may be stored in ->d_fsdata. Follow this line of reasoning: - Add SRCU as a reverse dependency of DEBUG_FS. - Introduce a srcu_struct object for the debugfs subsystem. - In debugfs_create_file(), store a pointer to the original file_operations object in ->d_fsdata. - Make debugfs_remove() and debugfs_remove_recursive() wait for a SRCU grace period after the dentry has been delete()'d and before they return to their callers. - Introduce an intermediate file_operations object named "debugfs_open_proxy_file_operations". Its ->open() function checks, under the protection of a SRCU read lock, whether the dentry is still alive, i.e. has not been d_delete()'d and if so, tries to acquire a reference on the owning module. On success, it sets the file object's ->f_op to the original file_operations and forwards the ongoing open() call to the original ->open(). - For clarity, rename the former debugfs_file_operations to debugfs_noop_file_operations -- they are in no way canonical. The choice of SRCU over "normal" RCU is justified by the fact that the former may also be used to protect ->i_private data from going away during the execution of a file's readers and writers which may (and do) sleep. Finally, introduce the fs/debugfs/internal.h header containing some declarations internal to the debugfs implementation.
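To make the race being closed concrete, consider a hypothetical module (all names invented for the sketch) whose debugfs file was exposed to it before this change:

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/debugfs.h>

static struct dentry *example_dentry;

static const struct file_operations example_fops = {
	.owner	= THIS_MODULE,		/* lives in this module's memory */
	.open	= simple_open,
	.llseek	= noop_llseek,
};

static int __init example_init(void)
{
	example_dentry = debugfs_create_file("example", 0444, NULL, NULL,
					     &example_fops);
	return example_dentry ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	debugfs_remove(example_dentry);
	/* Previously, a task that had already looked up the dentry could
	 * still enter ->open() through the inode's i_fop after this module
	 * (and example_fops with it) was gone.  With this patch, the open
	 * is dispatched from debugfs' own proxy, which re-checks the dentry
	 * under SRCU and pins the module via fops_get() before calling
	 * into example_fops. */
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");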
Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/debugfs/inode.c | 13 ++++++- fs/debugfs/internal.h | 24 +++++++++++++ include/linux/debugfs.h | 3 -- lib/Kconfig.debug | 1 + 5 files changed, 127 insertions(+), 5 deletions(-) create mode 100644 fs/debugfs/internal.h (limited to 'include/linux') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index d2ba12e23ed9..736ab3c988f2 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -22,6 +22,9 @@ #include #include #include +#include + +#include "internal.h" static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -35,13 +38,99 @@ static ssize_t default_write_file(struct file *file, const char __user *buf, return count; } -const struct file_operations debugfs_file_operations = { +const struct file_operations debugfs_noop_file_operations = { .read = default_read_file, .write = default_write_file, .open = simple_open, .llseek = noop_llseek, }; +/** + * debugfs_use_file_start - mark the beginning of file data access + * @dentry: the dentry object whose data is being accessed. + * @srcu_idx: a pointer to some memory to store a SRCU index in. + * + * Up to a matching call to debugfs_use_file_finish(), any + * successive call into the file removing functions debugfs_remove() + * and debugfs_remove_recursive() will block. Since associated private + * file data may only get freed after a successful return of any of + * the removal functions, you may safely access it after a successful + * call to debugfs_use_file_start() without worrying about + * lifetime issues. + * + * If -%EIO is returned, the file has already been removed and thus, + * it is not safe to access any of its data. If, on the other hand, + * it is allowed to access the file data, zero is returned. + * + * Regardless of the return code, any call to + * debugfs_use_file_start() must be followed by a matching call + * to debugfs_use_file_finish(). + */ +static int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) + __acquires(&debugfs_srcu) +{ + *srcu_idx = srcu_read_lock(&debugfs_srcu); + barrier(); + if (d_unlinked(dentry)) + return -EIO; + return 0; +} + +/** + * debugfs_use_file_finish - mark the end of file data access + * @srcu_idx: the SRCU index "created" by a former call to + * debugfs_use_file_start(). + * + * Allow any ongoing concurrent call into debugfs_remove() or + * debugfs_remove_recursive() blocked by a former call to + * debugfs_use_file_start() to proceed and return to its caller. + */ +static void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu) +{ + srcu_read_unlock(&debugfs_srcu, srcu_idx); +} + +#define F_DENTRY(filp) ((filp)->f_path.dentry) + +#define REAL_FOPS_DEREF(dentry) \ + ((const struct file_operations *)(dentry)->d_fsdata) + +static int open_proxy_open(struct inode *inode, struct file *filp) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = NULL; + int srcu_idx, r; + + r = debugfs_use_file_start(dentry, &srcu_idx); + if (r) { + r = -ENOENT; + goto out; + } + + real_fops = REAL_FOPS_DEREF(dentry); + real_fops = fops_get(real_fops); + if (!real_fops) { + /* Huh? Module did not clean up after itself at exit? 
*/ + WARN(1, "debugfs file owner did not clean up at exit: %pd", + dentry); + r = -ENXIO; + goto out; + } + replace_fops(filp, real_fops); + + if (real_fops->open) + r = real_fops->open(inode, filp); + +out: + fops_put(real_fops); + debugfs_use_file_finish(srcu_idx); + return r; +} + +const struct file_operations debugfs_open_proxy_file_operations = { + .open = open_proxy_open, +}; + static struct dentry *debugfs_create_mode(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b1e7f35f3cd4..2905dd160575 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -27,9 +27,14 @@ #include #include #include +#include + +#include "internal.h" #define DEBUGFS_DEFAULT_MODE 0700 +DEFINE_SRCU(debugfs_srcu); + static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; @@ -341,8 +346,12 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, return failed_creating(dentry); inode->i_mode = mode; - inode->i_fop = fops ? fops : &debugfs_file_operations; inode->i_private = data; + + inode->i_fop = fops ? &debugfs_open_proxy_file_operations + : &debugfs_noop_file_operations; + dentry->d_fsdata = (void *)fops; + d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); @@ -570,6 +579,7 @@ void debugfs_remove(struct dentry *dentry) inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); + synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove); @@ -647,6 +657,7 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); inode_unlock(d_inode(parent)); + synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h new file mode 100644 index 000000000000..c7aaa5cb6685 --- /dev/null +++ b/fs/debugfs/internal.h @@ -0,0 +1,24 @@ +/* + * internal.h - declarations internal to debugfs + * + * Copyright (C) 2016 Nicolai Stange + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + */ + +#ifndef _DEBUGFS_INTERNAL_H_ +#define _DEBUGFS_INTERNAL_H_ + +struct file_operations; +struct srcu_struct; + +/* declared over in file.c */ +extern const struct file_operations debugfs_noop_file_operations; +extern const struct file_operations debugfs_open_proxy_file_operations; + +extern struct srcu_struct debugfs_srcu; + +#endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 981e53ab84e8..fcafe2d389f9 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -43,9 +43,6 @@ extern struct dentry *arch_debugfs_dir; #if defined(CONFIG_DEBUG_FS) -/* declared over in file.c */ -extern const struct file_operations debugfs_file_operations; - struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1e9a607534ca..ddb0e8337aae 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -257,6 +257,7 @@ config PAGE_OWNER config DEBUG_FS bool "Debug Filesystem" + select SRCU help debugfs is a virtual file system that kernel developers use to put debugging files into. 
Enable this option to be able to read and

-- cgit v1.2.3


From 49d200deaa680501f19a247b1fffb29301e51d2b Mon Sep 17 00:00:00 2001
From: Nicolai Stange
Date: Tue, 22 Mar 2016 14:11:14 +0100
Subject: debugfs: prevent access to removed files' private data

Upon return of debugfs_remove()/debugfs_remove_recursive(), the associated private file data may still be accessed through previously opened struct file objects. If that data has been freed by the caller of debugfs_remove*() in the meantime, the reading/writing process would either encounter a fault or, if the memory address in question has been reassigned again, unrelated data structures could get overwritten.

However, since debugfs files are seldom removed, usually from module exit handlers only, the impact is very low.

Currently, there are ~1000 call sites of debugfs_create_file() spread throughout the whole tree; touching all of those struct file_operations in order to make them file-removal aware by checking the result of debugfs_use_file_start() from within their methods is infeasible.

Instead, wrap each struct file_operations in a lifetime-managing proxy at file open:

- In debugfs_create_file(), the original fops handed in has already been stashed away in ->d_fsdata.

- In debugfs_create_file(), install a proxy file_operations factory, debugfs_full_proxy_file_operations, at ->i_fop.

This proxy factory has an ->open() method only. It carries out some lifetime checks and, if successful, dynamically allocates and sets up a new struct file_operations proxy at ->f_op. Afterwards, it forwards to the ->open() of the original struct file_operations in ->d_fsdata, if any.

The dynamically set up proxy at ->f_op has a lifetime-managing wrapper installed for each of the methods defined in the original struct file_operations in ->d_fsdata. Its ->release() frees the proxy again and forwards to the original ->release(), if any.

In order not to mislead the VFS layer, it is strictly necessary to leave blank those fields of the proxy that are also NULL in the original struct file_operations, i.e. the methods that aren't supported. This is why the proxies must be allocated dynamically.

The choice not to allocate a proxy instance for every dentry at file creation, but for every struct file object instantiated from it, is justified by the expected usage pattern of debugfs, namely that in general very few files get opened more than once at a time.

The wrapper methods implement lifetime management by means of the SRCU protection facilities already in place for debugfs: they enter an SRCU read-side critical section and check whether the dentry is still alive by means of debugfs_use_file_start(). If so, they forward the call to the original struct file_operations stored in ->d_fsdata, still under the protection of the SRCU read-side critical section. That critical section prevents any pending debugfs_remove() and friends from returning to their callers. Since a file's private data may only be freed after debugfs_remove() has returned, the ongoing proxied call is guarded against any file removal race.

If, on the other hand, the initial call to debugfs_use_file_start() detects that the dentry is dead, the wrapper simply returns -EIO and does not forward the call. Note that the ->poll() wrapper is special in that its signature does not allow for the return of arbitrary -EXXX values; thus, POLLHUP is returned there.
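To make the wrapper shape concrete before the patch text: hand-expanding the FULL_PROXY_FUNC() macro from this patch for the ->read() case yields roughly the following (an illustrative sketch, not part of the diff):

static ssize_t full_proxy_read(struct file *filp, char __user *buf,
                               size_t size, loff_t *ppos)
{
        const struct dentry *dentry = filp->f_path.dentry;
        const struct file_operations *real_fops = dentry->d_fsdata;
        int srcu_idx;
        ssize_t r;

        /* Enter the SRCU read side; holds off any pending debugfs_remove(). */
        r = debugfs_use_file_start(dentry, &srcu_idx);
        if (likely(!r))
                /* Dentry still alive: the private data is safe to touch. */
                r = real_fops->read(filp, buf, size, ppos);
        debugfs_use_file_finish(srcu_idx);
        return r;
}

The same pattern covers ->llseek(), ->write() and ->unlocked_ioctl(); only ->poll() and ->release() deviate, as described below.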
In order not to pollute debugfs with wrapper definitions that aren't ever needed, I chose not to define a wrapper for every struct file_operations method possible. Instead, a wrapper is defined only for the subset of methods which are actually set by any debugfs users. Currently, these are: ->llseek() ->read() ->write() ->unlocked_ioctl() ->poll() The ->release() wrapper is special in that it does not protect the original ->release() in any way from dead files in order not to leak resources. Thus, any ->release() handed to debugfs must implement file lifetime management manually, if needed. For only 33 out of a total of 434 releasers handed in to debugfs, it could not be verified immediately whether they access data structures that might have been freed upon a debugfs_remove() return in the meanwhile. Export debugfs_use_file_start() and debugfs_use_file_finish() in order to allow any ->release() to manually implement file lifetime management. For a set of common cases of struct file_operations implemented by the debugfs_core itself, future patches will incorporate file lifetime management directly within those in order to allow for their unproxied operation. Rename the original, non-proxying "debugfs_create_file()" to "debugfs_create_file_unsafe()" and keep it for future internal use by debugfs itself. Factor out code common to both into the new __debugfs_create_file(). Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++- fs/debugfs/inode.c | 70 ++++++++++++++------- fs/debugfs/internal.h | 6 +- include/linux/debugfs.h | 20 ++++++ 4 files changed, 226 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 736ab3c988f2..6eb58a8ed03c 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -23,9 +23,12 @@ #include #include #include +#include #include "internal.h" +struct poll_table_struct; + static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -66,7 +69,7 @@ const struct file_operations debugfs_noop_file_operations = { * debugfs_use_file_start() must be followed by a matching call * to debugfs_use_file_finish(). */ -static int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) +int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) __acquires(&debugfs_srcu) { *srcu_idx = srcu_read_lock(&debugfs_srcu); @@ -75,6 +78,7 @@ static int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) return -EIO; return 0; } +EXPORT_SYMBOL_GPL(debugfs_use_file_start); /** * debugfs_use_file_finish - mark the end of file data access @@ -85,10 +89,11 @@ static int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) * debugfs_remove_recursive() blocked by a former call to * debugfs_use_file_start() to proceed and return to its caller. */ -static void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu) +void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu) { srcu_read_unlock(&debugfs_srcu, srcu_idx); } +EXPORT_SYMBOL_GPL(debugfs_use_file_finish); #define F_DENTRY(filp) ((filp)->f_path.dentry) @@ -131,6 +136,154 @@ const struct file_operations debugfs_open_proxy_file_operations = { .open = open_proxy_open, }; +#define PROTO(args...) args +#define ARGS(args...) 
args + +#define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \ +static ret_type full_proxy_ ## name(proto) \ +{ \ + const struct dentry *dentry = F_DENTRY(filp); \ + const struct file_operations *real_fops = \ + REAL_FOPS_DEREF(dentry); \ + int srcu_idx; \ + ret_type r; \ + \ + r = debugfs_use_file_start(dentry, &srcu_idx); \ + if (likely(!r)) \ + r = real_fops->name(args); \ + debugfs_use_file_finish(srcu_idx); \ + return r; \ +} + +FULL_PROXY_FUNC(llseek, loff_t, filp, + PROTO(struct file *filp, loff_t offset, int whence), + ARGS(filp, offset, whence)); + +FULL_PROXY_FUNC(read, ssize_t, filp, + PROTO(struct file *filp, char __user *buf, size_t size, + loff_t *ppos), + ARGS(filp, buf, size, ppos)); + +FULL_PROXY_FUNC(write, ssize_t, filp, + PROTO(struct file *filp, const char __user *buf, size_t size, + loff_t *ppos), + ARGS(filp, buf, size, ppos)); + +FULL_PROXY_FUNC(unlocked_ioctl, long, filp, + PROTO(struct file *filp, unsigned int cmd, unsigned long arg), + ARGS(filp, cmd, arg)); + +static unsigned int full_proxy_poll(struct file *filp, + struct poll_table_struct *wait) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + int srcu_idx; + unsigned int r = 0; + + if (debugfs_use_file_start(dentry, &srcu_idx)) { + debugfs_use_file_finish(srcu_idx); + return POLLHUP; + } + + r = real_fops->poll(filp, wait); + debugfs_use_file_finish(srcu_idx); + return r; +} + +static int full_proxy_release(struct inode *inode, struct file *filp) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *proxy_fops = filp->f_op; + int r = 0; + + /* + * We must not protect this against removal races here: the + * original releaser should be called unconditionally in order + * not to leak any resources. Releasers must not assume that + * ->i_private is still being meaningful here. + */ + if (real_fops->release) + r = real_fops->release(inode, filp); + + replace_fops(filp, d_inode(dentry)->i_fop); + kfree((void *)proxy_fops); + fops_put(real_fops); + return 0; +} + +static void __full_proxy_fops_init(struct file_operations *proxy_fops, + const struct file_operations *real_fops) +{ + proxy_fops->release = full_proxy_release; + if (real_fops->llseek) + proxy_fops->llseek = full_proxy_llseek; + if (real_fops->read) + proxy_fops->read = full_proxy_read; + if (real_fops->write) + proxy_fops->write = full_proxy_write; + if (real_fops->poll) + proxy_fops->poll = full_proxy_poll; + if (real_fops->unlocked_ioctl) + proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; +} + +static int full_proxy_open(struct inode *inode, struct file *filp) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = NULL; + struct file_operations *proxy_fops = NULL; + int srcu_idx, r; + + r = debugfs_use_file_start(dentry, &srcu_idx); + if (r) { + r = -ENOENT; + goto out; + } + + real_fops = REAL_FOPS_DEREF(dentry); + real_fops = fops_get(real_fops); + if (!real_fops) { + /* Huh? Module did not cleanup after itself at exit? 
*/ + WARN(1, "debugfs file owner did not clean up at exit: %pd", + dentry); + r = -ENXIO; + goto out; + } + + proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL); + if (!proxy_fops) { + r = -ENOMEM; + goto free_proxy; + } + __full_proxy_fops_init(proxy_fops, real_fops); + replace_fops(filp, proxy_fops); + + if (real_fops->open) { + r = real_fops->open(inode, filp); + + if (filp->f_op != proxy_fops) { + /* No protection against file removal anymore. */ + WARN(1, "debugfs file owner replaced proxy fops: %pd", + dentry); + goto free_proxy; + } + } + + goto out; +free_proxy: + kfree(proxy_fops); + fops_put(real_fops); +out: + debugfs_use_file_finish(srcu_idx); + return r; +} + +const struct file_operations debugfs_full_proxy_file_operations = { + .open = full_proxy_open, +}; + static struct dentry *debugfs_create_mode(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 2905dd160575..136f269f01de 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -300,6 +300,37 @@ static struct dentry *end_creating(struct dentry *dentry) return dentry; } +static struct dentry *__debugfs_create_file(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *proxy_fops, + const struct file_operations *real_fops) +{ + struct dentry *dentry; + struct inode *inode; + + if (!(mode & S_IFMT)) + mode |= S_IFREG; + BUG_ON(!S_ISREG(mode)); + dentry = start_creating(name, parent); + + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = mode; + inode->i_private = data; + + inode->i_fop = proxy_fops; + dentry->d_fsdata = (void *)real_fops; + + d_instantiate(dentry, inode); + fsnotify_create(d_inode(dentry->d_parent), dentry); + return end_creating(dentry); +} + /** * debugfs_create_file - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. @@ -330,33 +361,24 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { - struct dentry *dentry; - struct inode *inode; - - if (!(mode & S_IFMT)) - mode |= S_IFREG; - BUG_ON(!S_ISREG(mode)); - dentry = start_creating(name, parent); - - if (IS_ERR(dentry)) - return NULL; - - inode = debugfs_get_inode(dentry->d_sb); - if (unlikely(!inode)) - return failed_creating(dentry); - inode->i_mode = mode; - inode->i_private = data; + return __debugfs_create_file(name, mode, parent, data, + fops ? &debugfs_full_proxy_file_operations : + &debugfs_noop_file_operations, + fops); +} +EXPORT_SYMBOL_GPL(debugfs_create_file); - inode->i_fop = fops ? &debugfs_open_proxy_file_operations - : &debugfs_noop_file_operations; - dentry->d_fsdata = (void *)fops; +struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops) +{ - d_instantiate(dentry, inode); - fsnotify_create(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return __debugfs_create_file(name, mode, parent, data, + fops ? 
&debugfs_open_proxy_file_operations : + &debugfs_noop_file_operations, + fops); } -EXPORT_SYMBOL_GPL(debugfs_create_file); /** * debugfs_create_file_size - create a file in the debugfs filesystem @@ -579,6 +601,7 @@ void debugfs_remove(struct dentry *dentry) inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); + synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove); @@ -657,6 +680,7 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); inode_unlock(d_inode(parent)); + synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index c7aaa5cb6685..bba52634b995 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -13,12 +13,14 @@ #define _DEBUGFS_INTERNAL_H_ struct file_operations; -struct srcu_struct; /* declared over in file.c */ extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; +extern const struct file_operations debugfs_full_proxy_file_operations; -extern struct srcu_struct debugfs_srcu; +struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops); #endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index fcafe2d389f9..a63e6ea3321c 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -19,9 +19,11 @@ #include #include +#include struct device; struct file_operations; +struct srcu_struct; struct debugfs_blob_wrapper { void *data; @@ -41,6 +43,8 @@ struct debugfs_regset32 { extern struct dentry *arch_debugfs_dir; +extern struct srcu_struct debugfs_srcu; + #if defined(CONFIG_DEBUG_FS) struct dentry *debugfs_create_file(const char *name, umode_t mode, @@ -65,6 +69,11 @@ struct dentry *debugfs_create_automount(const char *name, void debugfs_remove(struct dentry *dentry); void debugfs_remove_recursive(struct dentry *dentry); +int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) + __acquires(&debugfs_srcu); + +void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu); + struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, const char *new_name); @@ -173,6 +182,17 @@ static inline void debugfs_remove(struct dentry *dentry) static inline void debugfs_remove_recursive(struct dentry *dentry) { } +static inline int debugfs_use_file_start(const struct dentry *dentry, + int *srcu_idx) + __acquires(&debugfs_srcu) +{ + return 0; +} + +static inline void debugfs_use_file_finish(int srcu_idx) + __releases(&debugfs_srcu) +{ } + static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, char *new_name) { -- cgit v1.2.3 From c64688081490321f2d23a292ef24e60bb321f3f1 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 22 Mar 2016 14:11:15 +0100 Subject: debugfs: add support for self-protecting attribute file fops In order to protect them against file removal issues, debugfs_create_file() creates a lifetime managing proxy around each struct file_operations handed in. In cases where this struct file_operations is able to manage file lifetime by itself already, the proxy created by debugfs is a waste of resources. 
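As a usage sketch of what this patch enables (hypothetical foo_* names; DEFINE_DEBUGFS_ATTRIBUTE() and debugfs_create_file_unsafe() are introduced below):

#include <linux/debugfs.h>

static u32 foo_val; /* hypothetical backing variable */

static int foo_get(void *data, u64 *val)
{
        *val = *(u32 *)data;
        return 0;
}

static int foo_set(void *data, u64 val)
{
        *(u32 *)data = val;
        return 0;
}

/* Removal-aware drop-in for DEFINE_SIMPLE_ATTRIBUTE(): */
DEFINE_DEBUGFS_ATTRIBUTE(fops_foo, foo_get, foo_set, "%llu\n");

static void foo_debugfs_init(struct dentry *parent)
{
        /* No per-open proxy gets allocated for this file. */
        debugfs_create_file_unsafe("foo", 0644, parent, &foo_val, &fops_foo);
}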
The most common class of struct file_operations given to debugfs are those defined by means of the DEFINE_SIMPLE_ATTRIBUTE() macro. Introduce a DEFINE_DEBUGFS_ATTRIBUTE() macro to allow any struct file_operations of this class to be easily made file lifetime aware and thus, to be operated unproxied. Specifically, introduce debugfs_attr_read() and debugfs_attr_write() which wrap simple_attr_read() and simple_attr_write() under the protection of a debugfs_use_file_start()/debugfs_use_file_finish() pair. Make DEFINE_DEBUGFS_ATTRIBUTE() set the defined struct file_operations' ->read() and ->write() members to these wrappers. Export debugfs_create_file_unsafe() in order to allow debugfs users to create their files in non-proxying operation mode. Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 28 ++++++++++++++++++++++++++++ fs/debugfs/inode.c | 28 ++++++++++++++++++++++++++++ include/linux/debugfs.h | 26 ++++++++++++++++++++++++++ 3 files changed, 82 insertions(+) (limited to 'include/linux') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 6eb58a8ed03c..8ef56d9499a4 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -284,6 +284,34 @@ const struct file_operations debugfs_full_proxy_file_operations = { .open = full_proxy_open, }; +ssize_t debugfs_attr_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret; + int srcu_idx; + + ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!ret)) + ret = simple_attr_read(file, buf, len, ppos); + debugfs_use_file_finish(srcu_idx); + return ret; +} +EXPORT_SYMBOL_GPL(debugfs_attr_read); + +ssize_t debugfs_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret; + int srcu_idx; + + ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!ret)) + ret = simple_attr_write(file, buf, len, ppos); + debugfs_use_file_finish(srcu_idx); + return ret; +} +EXPORT_SYMBOL_GPL(debugfs_attr_write); + static struct dentry *debugfs_create_mode(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 136f269f01de..41e079a8da26 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -369,6 +369,33 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_file); +/** + * debugfs_create_file_unsafe - create a file in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * + * debugfs_create_file_unsafe() is completely analogous to + * debugfs_create_file(), the only difference being that the fops + * handed it will not get protected against file removals by the + * debugfs core. + * + * It is your responsibility to protect your struct file_operation + * methods against file removals by means of debugfs_use_file_start() + * and debugfs_use_file_finish(). ->open() is still protected by + * debugfs though. 
+ * + * Any struct file_operations defined by means of + * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and + * thus, may be used here. + */ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) @@ -379,6 +406,7 @@ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, &debugfs_noop_file_operations, fops); } +EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); /** * debugfs_create_file_size - create a file in the debugfs filesystem diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index a63e6ea3321c..1438e2322d5c 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -50,6 +50,9 @@ extern struct srcu_struct debugfs_srcu; struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); +struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops); struct dentry *debugfs_create_file_size(const char *name, umode_t mode, struct dentry *parent, void *data, @@ -74,6 +77,26 @@ int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu); +ssize_t debugfs_attr_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos); +ssize_t debugfs_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos); + +#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \ +static int __fops ## _open(struct inode *inode, struct file *file) \ +{ \ + __simple_attr_check_format(__fmt, 0ull); \ + return simple_attr_open(inode, file, __get, __set, __fmt); \ +} \ +static const struct file_operations __fops = { \ + .owner = THIS_MODULE, \ + .open = __fops ## _open, \ + .release = simple_attr_release, \ + .read = debugfs_attr_read, \ + .write = debugfs_attr_write, \ + .llseek = generic_file_llseek, \ +} + struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, const char *new_name); @@ -193,6 +216,9 @@ static inline void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu) { } +#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \ + static const struct file_operations __fops = { 0 } + static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, char *new_name) { -- cgit v1.2.3 From c422025c185fb2bb28df65b1bbed7953480c7f87 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 18 Mar 2016 16:24:41 +0200 Subject: dmaengine: dw: rename masters to reflect actual topology The source and destination masters reflect the buses, or layers of them, to which the different devices can be connected. The patch changes the master names to make clear which master is related to which bus, independently of the transfer direction. The outcome of the change is that the memory data width is now always limited by the data width of the master dedicated to communicating with memory. The patch will not break anything since all current users have the same data width for all masters, though it would be nice to revisit the avr32 platforms to check what hardware topology is actually in use there. They seem to have one bus with two masters on it, as stated by Table 8-2; that is why everything works independently of the master in use.
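In terms of the renamed fields, a platform now describes a channel's bus topology like this (sketch only; the request line IDs are hypothetical):

#include <linux/platform_data/dma-dw.h>

static struct dw_dma_slave foo_dws = {
        .src_id   = 3,  /* src request line, unchanged */
        .dst_id   = 4,  /* dst request line, unchanged */
        .m_master = 0,  /* master wired towards memory */
        .p_master = 1,  /* master wired towards the peripheral */
};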
The purpose of the follow-up patch is to fix the driver for configurations with more than one bus. The change is made on the assumption that src_master and dst_master reflect a connection to the memory and to the peripheral, respectively, on avr32, and the other way around on the rest. Acked-by: Hans-Christian Egtvedt Acked-by: Mark Brown Signed-off-by: Andy Shevchenko Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/snps-dma.txt | 4 ++-- arch/avr32/mach-at32ap/at32ap700x.c | 16 ++++++++-------- drivers/ata/sata_dwc_460ex.c | 4 ++-- drivers/dma/dw/core.c | 19 +++++++++---------- drivers/dma/dw/platform.c | 12 ++++++------ drivers/dma/dw/regs.h | 4 ++-- drivers/spi/spi-pxa2xx-pci.c | 8 ++++---- drivers/tty/serial/8250/8250_pci.c | 8 ++++---- include/linux/platform_data/dma-dw.h | 8 ++++---- 9 files changed, 41 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/dma/snps-dma.txt b/Documentation/devicetree/bindings/dma/snps-dma.txt index c261598164a7..c99c1ffac199 100644 --- a/Documentation/devicetree/bindings/dma/snps-dma.txt +++ b/Documentation/devicetree/bindings/dma/snps-dma.txt @@ -47,8 +47,8 @@ The four cells in order are: 1. A phandle pointing to the DMA controller 2. The DMA request line number -3. Source master for transfers on allocated channel -4. Destination master for transfers on allocated channel +3. Memory master for transfers on allocated channel +4. Peripheral master for transfers on allocated channel Example: diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c index bf445aa48282..00d6dcc1d9b6 100644 --- a/arch/avr32/mach-at32ap/at32ap700x.c +++ b/arch/avr32/mach-at32ap/at32ap700x.c @@ -1365,8 +1365,8 @@ at32_add_device_mci(unsigned int id, struct mci_platform_data *data) slave->dma_dev = &dw_dmac0_device.dev; slave->src_id = 0; slave->dst_id = 1; - slave->src_master = 1; - slave->dst_master = 0; + slave->m_master = 1; + slave->p_master = 0; data->dma_slave = slave; data->dma_filter = at32_mci_dma_filter; @@ -2061,16 +2061,16 @@ at32_add_device_ac97c(unsigned int id, struct ac97c_platform_data *data, if (flags & AC97C_CAPTURE) { rx_dws->dma_dev = &dw_dmac0_device.dev; rx_dws->src_id = 3; - rx_dws->src_master = 0; - rx_dws->dst_master = 1; + rx_dws->m_master = 0; + rx_dws->p_master = 1; } /* Check if DMA slave interface for playback should be configured.
*/ if (flags & AC97C_PLAYBACK) { tx_dws->dma_dev = &dw_dmac0_device.dev; tx_dws->dst_id = 4; - tx_dws->src_master = 0; - tx_dws->dst_master = 1; + tx_dws->m_master = 0; + tx_dws->p_master = 1; } if (platform_device_add_data(pdev, data, @@ -2141,8 +2141,8 @@ at32_add_device_abdac(unsigned int id, struct atmel_abdac_pdata *data) dws->dma_dev = &dw_dmac0_device.dev; dws->dst_id = 2; - dws->src_master = 0; - dws->dst_master = 1; + dws->m_master = 0; + dws->p_master = 1; if (platform_device_add_data(pdev, data, sizeof(struct atmel_abdac_pdata))) diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c index 902034991517..80bdcabc293f 100644 --- a/drivers/ata/sata_dwc_460ex.c +++ b/drivers/ata/sata_dwc_460ex.c @@ -201,8 +201,8 @@ static struct sata_dwc_host_priv host_pvt; static struct dw_dma_slave sata_dwc_dma_dws = { .src_id = 0, .dst_id = 0, - .src_master = 0, - .dst_master = 1, + .m_master = 1, + .p_master = 0, }; /* diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index 97199b3c25a2..5bd7873a02c6 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -50,8 +50,8 @@ | DWC_CTLL_SRC_MSIZE(_smsize) \ | DWC_CTLL_LLP_D_EN \ | DWC_CTLL_LLP_S_EN \ - | DWC_CTLL_DMS(_dwc->dst_master) \ - | DWC_CTLL_SMS(_dwc->src_master)); \ + | DWC_CTLL_DMS(_dwc->p_master) \ + | DWC_CTLL_SMS(_dwc->m_master)); \ }) /* @@ -709,8 +709,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, dwc->direction = DMA_MEM_TO_MEM; - data_width = min_t(unsigned int, dw->data_width[dwc->src_master], - dw->data_width[dwc->dst_master]); + data_width = dw->data_width[dwc->m_master]; src_width = dst_width = min_t(unsigned int, data_width, dwc_fast_ffs(src | dest | len)); @@ -802,7 +801,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_M2P) : DWC_CTLL_FC(DW_DMA_FC_D_M2P); - data_width = dw->data_width[dwc->src_master]; + data_width = dw->data_width[dwc->m_master]; for_each_sg(sgl, sg, sg_len, i) { struct dw_desc *desc; @@ -859,7 +858,7 @@ slave_sg_todev_fill_desc: ctllo |= sconfig->device_fc ? 
DWC_CTLL_FC(DW_DMA_FC_P_P2M) : DWC_CTLL_FC(DW_DMA_FC_D_P2M); - data_width = dw->data_width[dwc->dst_master]; + data_width = dw->data_width[dwc->m_master]; for_each_sg(sgl, sg, sg_len, i) { struct dw_desc *desc; @@ -937,8 +936,8 @@ bool dw_dma_filter(struct dma_chan *chan, void *param) dwc->src_id = dws->src_id; dwc->dst_id = dws->dst_id; - dwc->src_master = dws->src_master; - dwc->dst_master = dws->dst_master; + dwc->m_master = dws->m_master; + dwc->p_master = dws->p_master; return true; } @@ -1227,8 +1226,8 @@ static void dwc_free_chan_resources(struct dma_chan *chan) dwc->src_id = 0; dwc->dst_id = 0; - dwc->src_master = 0; - dwc->dst_master = 0; + dwc->m_master = 0; + dwc->p_master = 0; dwc->initialized = false; diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c index 26edbe3a27ac..23616c57645c 100644 --- a/drivers/dma/dw/platform.c +++ b/drivers/dma/dw/platform.c @@ -42,13 +42,13 @@ static struct dma_chan *dw_dma_of_xlate(struct of_phandle_args *dma_spec, slave.src_id = dma_spec->args[0]; slave.dst_id = dma_spec->args[0]; - slave.src_master = dma_spec->args[1]; - slave.dst_master = dma_spec->args[2]; + slave.m_master = dma_spec->args[1]; + slave.p_master = dma_spec->args[2]; if (WARN_ON(slave.src_id >= DW_DMA_MAX_NR_REQUESTS || slave.dst_id >= DW_DMA_MAX_NR_REQUESTS || - slave.src_master >= dw->nr_masters || - slave.dst_master >= dw->nr_masters)) + slave.m_master >= dw->nr_masters || + slave.p_master >= dw->nr_masters)) return NULL; dma_cap_zero(cap); @@ -66,8 +66,8 @@ static bool dw_dma_acpi_filter(struct dma_chan *chan, void *param) .dma_dev = dma_spec->dev, .src_id = dma_spec->slave_id, .dst_id = dma_spec->slave_id, - .src_master = 1, - .dst_master = 0, + .m_master = 0, + .p_master = 1, }; return dw_dma_filter(chan, &slave); diff --git a/drivers/dma/dw/regs.h b/drivers/dma/dw/regs.h index 0a50c18d85b8..a63d62bbffe2 100644 --- a/drivers/dma/dw/regs.h +++ b/drivers/dma/dw/regs.h @@ -249,8 +249,8 @@ struct dw_dma_chan { /* custom slave configuration */ u8 src_id; u8 dst_id; - u8 src_master; - u8 dst_master; + u8 m_master; + u8 p_master; /* configuration passed via .device_config */ struct dma_slave_config dma_sconfig; diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c index 520ed1dd5780..4fd7f9802f1b 100644 --- a/drivers/spi/spi-pxa2xx-pci.c +++ b/drivers/spi/spi-pxa2xx-pci.c @@ -144,16 +144,16 @@ static int pxa2xx_spi_pci_probe(struct pci_dev *dev, struct dw_dma_slave *slave = c->tx_param; slave->dma_dev = &dma_dev->dev; - slave->src_master = 1; - slave->dst_master = 0; + slave->m_master = 0; + slave->p_master = 1; } if (c->rx_param) { struct dw_dma_slave *slave = c->rx_param; slave->dma_dev = &dma_dev->dev; - slave->src_master = 1; - slave->dst_master = 0; + slave->m_master = 0; + slave->p_master = 1; } spi_pdata.dma_filter = lpss_dma_filter; diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index 98862aa5bb58..5eea74d7f9f4 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -1454,13 +1454,13 @@ byt_serial_setup(struct serial_private *priv, return -EINVAL; } - rx_param->src_master = 1; - rx_param->dst_master = 0; + rx_param->m_master = 0; + rx_param->p_master = 1; dma->rxconf.src_maxburst = 16; - tx_param->src_master = 1; - tx_param->dst_master = 0; + tx_param->m_master = 0; + tx_param->p_master = 1; dma->txconf.dst_maxburst = 16; diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index 03b6095d3b18..b881b978e486 100644 --- 
a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -21,15 +21,15 @@ * @dma_dev: required DMA master device * @src_id: src request line * @dst_id: dst request line - * @src_master: src master for transfers on allocated channel. - * @dst_master: dest master for transfers on allocated channel. + * @m_master: memory master for transfers on allocated channel + * @p_master: peripheral master for transfers on allocated channel */ struct dw_dma_slave { struct device *dma_dev; u8 src_id; u8 dst_id; - u8 src_master; - u8 dst_master; + u8 m_master; + u8 p_master; }; /** -- cgit v1.2.3 From 58bc67fc32b1c67fb045f4828a67134dc8fee631 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Fri, 10 Jul 2015 15:23:28 +0300 Subject: ARM: OMAP2+: gpmc: Add platform data Add a platform data structure for GPMC. It contains all the necessary platform information that needs to be passed from platform init code to the GPMC driver. Signed-off-by: Roger Quadros Acked-by: Tony Lindgren --- include/linux/omap-gpmc.h | 3 +-- include/linux/platform_data/gpmc-omap.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 include/linux/platform_data/gpmc-omap.h (limited to 'include/linux') diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h index d833eb4dd446..45d9075be1e5 100644 --- a/include/linux/omap-gpmc.h +++ b/include/linux/omap-gpmc.h @@ -7,8 +7,7 @@ * option) any later version. */ -/* Maximum Number of Chip Selects */ -#define GPMC_CS_NUM 8 +#include <linux/platform_data/gpmc-omap.h> #define GPMC_CONFIG_WP 0x00000005 diff --git a/include/linux/platform_data/gpmc-omap.h b/include/linux/platform_data/gpmc-omap.h new file mode 100644 index 000000000000..6804a8b387d7 --- /dev/null +++ b/include/linux/platform_data/gpmc-omap.h @@ -0,0 +1,30 @@ +/* + * OMAP GPMC Platform data + * + * Copyright (C) 2014 Texas Instruments, Inc. - http://www.ti.com + * Roger Quadros + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + */ + +#ifndef _GPMC_OMAP_H_ +#define _GPMC_OMAP_H_ + +/* Maximum Number of Chip Selects */ +#define GPMC_CS_NUM 8 + +/* Data for each chip select */ +struct gpmc_omap_cs_data { + bool valid; /* data is valid */ + bool is_nand; /* device within this CS is NAND */ + struct platform_device *pdev; /* device within this CS region */ + unsigned int pdata_size; +}; + +struct gpmc_omap_platform_data { + struct gpmc_omap_cs_data cs[GPMC_CS_NUM]; +}; + +#endif /* _GPMC_OMAP_H */ -- cgit v1.2.3 From fabe7d7756d17f5da4bd80fa2373c4bd93ed39e5 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Fri, 10 Jul 2015 15:23:29 +0300 Subject: ARM: OMAP2+: gpmc: Add gpmc timings and settings to platform data Add device_timings, gpmc_timings and gpmc_settings to the gpmc platform data.
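For illustration, board code could populate the new per-chip-select platform data along these lines (a sketch; the values are hypothetical, and the settings/timings pointers are the ones added by the diff below):

#include <linux/platform_data/gpmc-omap.h>

static struct gpmc_settings foo_nand_settings = {
        .device_nand  = true,                 /* device on this CS is NAND */
        .device_width = GPMC_DEVWIDTH_8BIT,
};

static struct gpmc_omap_platform_data foo_gpmc_pdata = {
        .cs[0] = {
                .valid    = true,
                .is_nand  = true,
                .settings = &foo_nand_settings,
                /* .device_timings and .gpmc_timings can be set likewise */
        },
};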
Signed-off-by: Roger Quadros Acked-by: Tony Lindgren --- include/linux/omap-gpmc.h | 139 ------------------------------- include/linux/platform_data/gpmc-omap.h | 142 ++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 139 deletions(-) (limited to 'include/linux') diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h index 45d9075be1e5..2dcef1c8c8d4 100644 --- a/include/linux/omap-gpmc.h +++ b/include/linux/omap-gpmc.h @@ -14,145 +14,6 @@ #define GPMC_IRQ_FIFOEVENTENABLE 0x01 #define GPMC_IRQ_COUNT_EVENT 0x02 -#define GPMC_BURST_4 4 /* 4 word burst */ -#define GPMC_BURST_8 8 /* 8 word burst */ -#define GPMC_BURST_16 16 /* 16 word burst */ -#define GPMC_DEVWIDTH_8BIT 1 /* 8-bit device width */ -#define GPMC_DEVWIDTH_16BIT 2 /* 16-bit device width */ -#define GPMC_MUX_AAD 1 /* Addr-Addr-Data multiplex */ -#define GPMC_MUX_AD 2 /* Addr-Data multiplex */ - -/* bool type time settings */ -struct gpmc_bool_timings { - bool cycle2cyclediffcsen; - bool cycle2cyclesamecsen; - bool we_extra_delay; - bool oe_extra_delay; - bool adv_extra_delay; - bool cs_extra_delay; - bool time_para_granularity; -}; - -/* - * Note that all values in this struct are in nanoseconds except sync_clk - * (which is in picoseconds), while the register values are in gpmc_fck cycles. - */ -struct gpmc_timings { - /* Minimum clock period for synchronous mode (in picoseconds) */ - u32 sync_clk; - - /* Chip-select signal timings corresponding to GPMC_CS_CONFIG2 */ - u32 cs_on; /* Assertion time */ - u32 cs_rd_off; /* Read deassertion time */ - u32 cs_wr_off; /* Write deassertion time */ - - /* ADV signal timings corresponding to GPMC_CONFIG3 */ - u32 adv_on; /* Assertion time */ - u32 adv_rd_off; /* Read deassertion time */ - u32 adv_wr_off; /* Write deassertion time */ - u32 adv_aad_mux_on; /* ADV assertion time for AAD */ - u32 adv_aad_mux_rd_off; /* ADV read deassertion time for AAD */ - u32 adv_aad_mux_wr_off; /* ADV write deassertion time for AAD */ - - /* WE signals timings corresponding to GPMC_CONFIG4 */ - u32 we_on; /* WE assertion time */ - u32 we_off; /* WE deassertion time */ - - /* OE signals timings corresponding to GPMC_CONFIG4 */ - u32 oe_on; /* OE assertion time */ - u32 oe_off; /* OE deassertion time */ - u32 oe_aad_mux_on; /* OE assertion time for AAD */ - u32 oe_aad_mux_off; /* OE deassertion time for AAD */ - - /* Access time and cycle time timings corresponding to GPMC_CONFIG5 */ - u32 page_burst_access; /* Multiple access word delay */ - u32 access; /* Start-cycle to first data valid delay */ - u32 rd_cycle; /* Total read cycle time */ - u32 wr_cycle; /* Total write cycle time */ - - u32 bus_turnaround; - u32 cycle2cycle_delay; - - u32 wait_monitoring; - u32 clk_activation; - - /* The following are only on OMAP3430 */ - u32 wr_access; /* WRACCESSTIME */ - u32 wr_data_mux_bus; /* WRDATAONADMUXBUS */ - - struct gpmc_bool_timings bool_timings; -}; - -/* Device timings in picoseconds */ -struct gpmc_device_timings { - u32 t_ceasu; /* address setup to CS valid */ - u32 t_avdasu; /* address setup to ADV valid */ - /* XXX: try to combine t_avdp_r & t_avdp_w. Issue is - * of tusb using these timings even for sync whilst - * ideally for adv_rd/(wr)_off it should have considered - * t_avdh instead. This indirectly necessitates r/w - * variations of t_avdp as it is possible to have one - * sync & other async - */ - u32 t_avdp_r; /* ADV low time (what about t_cer ?) 
*/ - u32 t_avdp_w; - u32 t_aavdh; /* address hold time */ - u32 t_oeasu; /* address setup to OE valid */ - u32 t_aa; /* access time from ADV assertion */ - u32 t_iaa; /* initial access time */ - u32 t_oe; /* access time from OE assertion */ - u32 t_ce; /* access time from CS asertion */ - u32 t_rd_cycle; /* read cycle time */ - u32 t_cez_r; /* read CS deassertion to high Z */ - u32 t_cez_w; /* write CS deassertion to high Z */ - u32 t_oez; /* OE deassertion to high Z */ - u32 t_weasu; /* address setup to WE valid */ - u32 t_wpl; /* write assertion time */ - u32 t_wph; /* write deassertion time */ - u32 t_wr_cycle; /* write cycle time */ - - u32 clk; - u32 t_bacc; /* burst access valid clock to output delay */ - u32 t_ces; /* CS setup time to clk */ - u32 t_avds; /* ADV setup time to clk */ - u32 t_avdh; /* ADV hold time from clk */ - u32 t_ach; /* address hold time from clk */ - u32 t_rdyo; /* clk to ready valid */ - - u32 t_ce_rdyz; /* XXX: description ?, or use t_cez instead */ - u32 t_ce_avd; /* CS on to ADV on delay */ - - /* XXX: check the possibility of combining - * cyc_aavhd_oe & cyc_aavdh_we - */ - u8 cyc_aavdh_oe;/* read address hold time in cycles */ - u8 cyc_aavdh_we;/* write address hold time in cycles */ - u8 cyc_oe; /* access time from OE assertion in cycles */ - u8 cyc_wpl; /* write deassertion time in cycles */ - u32 cyc_iaa; /* initial access time in cycles */ - - /* extra delays */ - bool ce_xdelay; - bool avd_xdelay; - bool oe_xdelay; - bool we_xdelay; -}; - -struct gpmc_settings { - bool burst_wrap; /* enables wrap bursting */ - bool burst_read; /* enables read page/burst mode */ - bool burst_write; /* enables write page/burst mode */ - bool device_nand; /* device is NAND */ - bool sync_read; /* enables synchronous reads */ - bool sync_write; /* enables synchronous writes */ - bool wait_on_read; /* monitor wait on reads */ - bool wait_on_write; /* monitor wait on writes */ - u32 burst_len; /* page/burst length */ - u32 device_width; /* device bus width (8 or 16 bit) */ - u32 mux_add_data; /* multiplex address & data */ - u32 wait_pin; /* wait-pin to be used */ -}; - extern int gpmc_calc_timings(struct gpmc_timings *gpmc_t, struct gpmc_settings *gpmc_s, struct gpmc_device_timings *dev_t); diff --git a/include/linux/platform_data/gpmc-omap.h b/include/linux/platform_data/gpmc-omap.h index 6804a8b387d7..67ccdb0e1606 100644 --- a/include/linux/platform_data/gpmc-omap.h +++ b/include/linux/platform_data/gpmc-omap.h @@ -15,10 +15,152 @@ /* Maximum Number of Chip Selects */ #define GPMC_CS_NUM 8 +/* bool type time settings */ +struct gpmc_bool_timings { + bool cycle2cyclediffcsen; + bool cycle2cyclesamecsen; + bool we_extra_delay; + bool oe_extra_delay; + bool adv_extra_delay; + bool cs_extra_delay; + bool time_para_granularity; +}; + +/* + * Note that all values in this struct are in nanoseconds except sync_clk + * (which is in picoseconds), while the register values are in gpmc_fck cycles. 
+ */ +struct gpmc_timings { + /* Minimum clock period for synchronous mode (in picoseconds) */ + u32 sync_clk; + + /* Chip-select signal timings corresponding to GPMC_CS_CONFIG2 */ + u32 cs_on; /* Assertion time */ + u32 cs_rd_off; /* Read deassertion time */ + u32 cs_wr_off; /* Write deassertion time */ + + /* ADV signal timings corresponding to GPMC_CONFIG3 */ + u32 adv_on; /* Assertion time */ + u32 adv_rd_off; /* Read deassertion time */ + u32 adv_wr_off; /* Write deassertion time */ + u32 adv_aad_mux_on; /* ADV assertion time for AAD */ + u32 adv_aad_mux_rd_off; /* ADV read deassertion time for AAD */ + u32 adv_aad_mux_wr_off; /* ADV write deassertion time for AAD */ + + /* WE signals timings corresponding to GPMC_CONFIG4 */ + u32 we_on; /* WE assertion time */ + u32 we_off; /* WE deassertion time */ + + /* OE signals timings corresponding to GPMC_CONFIG4 */ + u32 oe_on; /* OE assertion time */ + u32 oe_off; /* OE deassertion time */ + u32 oe_aad_mux_on; /* OE assertion time for AAD */ + u32 oe_aad_mux_off; /* OE deassertion time for AAD */ + + /* Access time and cycle time timings corresponding to GPMC_CONFIG5 */ + u32 page_burst_access; /* Multiple access word delay */ + u32 access; /* Start-cycle to first data valid delay */ + u32 rd_cycle; /* Total read cycle time */ + u32 wr_cycle; /* Total write cycle time */ + + u32 bus_turnaround; + u32 cycle2cycle_delay; + + u32 wait_monitoring; + u32 clk_activation; + + /* The following are only on OMAP3430 */ + u32 wr_access; /* WRACCESSTIME */ + u32 wr_data_mux_bus; /* WRDATAONADMUXBUS */ + + struct gpmc_bool_timings bool_timings; +}; + +/* Device timings in picoseconds */ +struct gpmc_device_timings { + u32 t_ceasu; /* address setup to CS valid */ + u32 t_avdasu; /* address setup to ADV valid */ + /* XXX: try to combine t_avdp_r & t_avdp_w. Issue is + * of tusb using these timings even for sync whilst + * ideally for adv_rd/(wr)_off it should have considered + * t_avdh instead. This indirectly necessitates r/w + * variations of t_avdp as it is possible to have one + * sync & other async + */ + u32 t_avdp_r; /* ADV low time (what about t_cer ?) 
*/ + u32 t_avdp_w; + u32 t_aavdh; /* address hold time */ + u32 t_oeasu; /* address setup to OE valid */ + u32 t_aa; /* access time from ADV assertion */ + u32 t_iaa; /* initial access time */ + u32 t_oe; /* access time from OE assertion */ + u32 t_ce; /* access time from CS asertion */ + u32 t_rd_cycle; /* read cycle time */ + u32 t_cez_r; /* read CS deassertion to high Z */ + u32 t_cez_w; /* write CS deassertion to high Z */ + u32 t_oez; /* OE deassertion to high Z */ + u32 t_weasu; /* address setup to WE valid */ + u32 t_wpl; /* write assertion time */ + u32 t_wph; /* write deassertion time */ + u32 t_wr_cycle; /* write cycle time */ + + u32 clk; + u32 t_bacc; /* burst access valid clock to output delay */ + u32 t_ces; /* CS setup time to clk */ + u32 t_avds; /* ADV setup time to clk */ + u32 t_avdh; /* ADV hold time from clk */ + u32 t_ach; /* address hold time from clk */ + u32 t_rdyo; /* clk to ready valid */ + + u32 t_ce_rdyz; /* XXX: description ?, or use t_cez instead */ + u32 t_ce_avd; /* CS on to ADV on delay */ + + /* XXX: check the possibility of combining + * cyc_aavhd_oe & cyc_aavdh_we + */ + u8 cyc_aavdh_oe;/* read address hold time in cycles */ + u8 cyc_aavdh_we;/* write address hold time in cycles */ + u8 cyc_oe; /* access time from OE assertion in cycles */ + u8 cyc_wpl; /* write deassertion time in cycles */ + u32 cyc_iaa; /* initial access time in cycles */ + + /* extra delays */ + bool ce_xdelay; + bool avd_xdelay; + bool oe_xdelay; + bool we_xdelay; +}; + +#define GPMC_BURST_4 4 /* 4 word burst */ +#define GPMC_BURST_8 8 /* 8 word burst */ +#define GPMC_BURST_16 16 /* 16 word burst */ +#define GPMC_DEVWIDTH_8BIT 1 /* 8-bit device width */ +#define GPMC_DEVWIDTH_16BIT 2 /* 16-bit device width */ +#define GPMC_MUX_AAD 1 /* Addr-Addr-Data multiplex */ +#define GPMC_MUX_AD 2 /* Addr-Data multiplex */ + +struct gpmc_settings { + bool burst_wrap; /* enables wrap bursting */ + bool burst_read; /* enables read page/burst mode */ + bool burst_write; /* enables write page/burst mode */ + bool device_nand; /* device is NAND */ + bool sync_read; /* enables synchronous reads */ + bool sync_write; /* enables synchronous writes */ + bool wait_on_read; /* monitor wait on reads */ + bool wait_on_write; /* monitor wait on writes */ + u32 burst_len; /* page/burst length */ + u32 device_width; /* device bus width (8 or 16 bit) */ + u32 mux_add_data; /* multiplex address & data */ + u32 wait_pin; /* wait-pin to be used */ +}; + /* Data for each chip select */ struct gpmc_omap_cs_data { bool valid; /* data is valid */ bool is_nand; /* device within this CS is NAND */ + struct gpmc_settings *settings; + struct gpmc_device_timings *device_timings; + struct gpmc_timings *gpmc_timings; struct platform_device *pdev; /* device within this CS region */ unsigned int pdata_size; }; -- cgit v1.2.3 From f47fcad63f6847ea677c6c7030f30fd6438e0052 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Wed, 5 Aug 2015 13:58:01 +0300 Subject: memory: omap-gpmc: Introduce GPMC to NAND interface The OMAP GPMC module has certain registers dedicated for NAND access and some NAND bits mixed with other GPMC functionality. For the NAND dedicated registers we have the struct gpmc_nand_regs. The NAND driver needs to access NAND specific bits from the following non-dedicated registers - EMPTYWRITEBUFFERSTATUS from GPMC_STATUS For accessing these bits we introduce the struct gpmc_nand_ops. Add gpmc_omap_get_nand_ops() that returns the gpmc_nand_ops along with updating the gpmc_nand_regs. 
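As a usage sketch (hypothetical foo_* names; the ops structure and register map are those introduced by the diff below):

#include <linux/omap-gpmc.h>

static struct gpmc_nand_regs foo_nand_regs;
static struct gpmc_nand_ops *foo_nand_ops;

static int foo_nand_attach(int cs)
{
        /* Fills foo_nand_regs for this CS and hands back the ops. */
        foo_nand_ops = gpmc_omap_get_nand_ops(&foo_nand_regs, cs);
        if (!foo_nand_ops)
                return -ENODEV; /* e.g. invalid chip select */
        return 0;
}

static bool foo_nand_write_drained(void)
{
        return foo_nand_ops->nand_writebuffer_empty();
}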
This API will be called by the OMAP NAND driver to access the necessary bits in GPMC register space. Signed-off-by: Roger Quadros Acked-by: Tony Lindgren --- drivers/memory/omap-gpmc.c | 21 +++++++++++++++++++++ include/linux/omap-gpmc.h | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c index 21825ddce4a3..0b62afd86f7e 100644 --- a/drivers/memory/omap-gpmc.c +++ b/drivers/memory/omap-gpmc.c @@ -1118,6 +1118,27 @@ void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs) } } +static struct gpmc_nand_ops nand_ops; + +/** + * gpmc_omap_get_nand_ops - Get the GPMC NAND interface + * @regs: the GPMC NAND register map exclusive for NAND use. + * @cs: GPMC chip select number on which the NAND sits. The + * register map returned will be specific to this chip select. + * + * Returns NULL on error e.g. invalid cs. + */ +struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *reg, int cs) +{ + if (cs >= gpmc_cs_num) + return NULL; + + gpmc_update_nand_reg(reg, cs); + + return &nand_ops; +} +EXPORT_SYMBOL_GPL(gpmc_omap_get_nand_ops); + int gpmc_get_client_irq(unsigned irq_config) { int i; diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h index 2dcef1c8c8d4..dc2ada6fb9b4 100644 --- a/include/linux/omap-gpmc.h +++ b/include/linux/omap-gpmc.h @@ -14,14 +14,45 @@ #define GPMC_IRQ_FIFOEVENTENABLE 0x01 #define GPMC_IRQ_COUNT_EVENT 0x02 +/** + * gpmc_nand_ops - Interface between NAND and GPMC + * @nand_write_buffer_empty: get the NAND write buffer empty status. + */ +struct gpmc_nand_ops { + bool (*nand_writebuffer_empty)(void); +}; + +struct gpmc_nand_regs; + +#if IS_ENABLED(CONFIG_OMAP_GPMC) +struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *regs, + int cs); +#else +static inline gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *regs, + int cs) +{ + return NULL; +} +#endif /* CONFIG_OMAP_GPMC */ + +/*--------------------------------*/ + +/* deprecated APIs */ +#if IS_ENABLED(CONFIG_OMAP_GPMC) +void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs); +#else +static inline void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs) +{ +} +#endif /* CONFIG_OMAP_GPMC */ +/*--------------------------------*/ + extern int gpmc_calc_timings(struct gpmc_timings *gpmc_t, struct gpmc_settings *gpmc_s, struct gpmc_device_timings *dev_t); -struct gpmc_nand_regs; struct device_node; -extern void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs); extern int gpmc_get_client_irq(unsigned irq_config); extern unsigned int gpmc_ticks_to_ns(unsigned int ticks); -- cgit v1.2.3 From 384258f252727c67772bbd48dad3185a30ba50d3 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 30 Jul 2015 14:49:23 +0300 Subject: memory: omap-gpmc: Implement IRQ domain for NAND IRQs GPMC provides 2 interrupts for NAND use. i.e. fifoevent and termcount. Use IRQ domain for this. NAND device tree node can then get the necessary interrupts by using gpmc as the interrupt parent. Legacy boot uses gpmc_get_client_irq to get the NAND interrupts from the GPMC IRQ domain. Get rid of custom bitmasks and use IRQ domain for that as well. 
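For the legacy-boot path mentioned above, fetching the NAND interrupts now amounts to the following (sketch; hwirq 0 is NAND_fifoevent and hwirq 1 is NAND_termcount, per the binding update below):

static int foo_nand_setup_irqs(void)
{
        int fifo_irq = gpmc_get_client_irq(0); /* NAND_fifoevent */
        int term_irq = gpmc_get_client_irq(1); /* NAND_termcount */

        if (!fifo_irq || !term_irq)
                return -ENODEV; /* mapping failed or domain not ready */

        /* request_irq() on these as usual. */
        return 0;
}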
Signed-off-by: Roger Quadros Acked-by: Rob Herring Acked-by: Tony Lindgren --- Documentation/devicetree/bindings/bus/ti-gpmc.txt | 8 + drivers/memory/omap-gpmc.c | 246 ++++++++++++---------- include/linux/omap-gpmc.h | 5 +- 3 files changed, 144 insertions(+), 115 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/bus/ti-gpmc.txt b/Documentation/devicetree/bindings/bus/ti-gpmc.txt index 01683707060b..13f13786f992 100644 --- a/Documentation/devicetree/bindings/bus/ti-gpmc.txt +++ b/Documentation/devicetree/bindings/bus/ti-gpmc.txt @@ -32,6 +32,12 @@ Required properties: bootloader) are used for the physical address decoding. As this will change in the future, filling correct values here is a requirement. + - interrupt-controller: The GPMC driver implements and interrupt controller for + the NAND events "fifoevent" and "termcount". + The interrupt number mapping is as follows + 0 - NAND_fifoevent + 1 - NAND_termcount + - interrupt-cells: Must be set to 2 Timing properties for child nodes. All are optional and default to 0. @@ -130,6 +136,8 @@ Example for an AM33xx board: #address-cells = <2>; #size-cells = <1>; ranges = <0 0 0x08000000 0x10000000>; /* CS0 @addr 0x8000000, size 0x10000000 */ + interrupt-controller; + #interrupt-cells = <2>; /* child nodes go here */ }; diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c index 90dfba5a8f55..e28d6bc2500a 100644 --- a/drivers/memory/omap-gpmc.c +++ b/drivers/memory/omap-gpmc.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -127,7 +128,6 @@ #define GPMC_CONFIG_RDY_BSY 0x00000001 #define GPMC_CONFIG_DEV_SIZE 0x00000002 #define GPMC_CONFIG_DEV_TYPE 0x00000003 -#define GPMC_SET_IRQ_STATUS 0x00000004 #define GPMC_CONFIG1_WRAPBURST_SUPP (1 << 31) #define GPMC_CONFIG1_READMULTIPLE_SUPP (1 << 30) @@ -176,8 +176,6 @@ #define GPMC_CONFIG_WRITEPROTECT 0x00000010 #define WR_RD_PIN_MONITORING 0x00600000 -#define GPMC_ENABLE_IRQ 0x0000000d - /* ECC commands */ #define GPMC_ECC_READ 0 /* Reset Hardware ECC for read */ #define GPMC_ECC_WRITE 1 /* Reset Hardware ECC for write */ @@ -201,11 +199,6 @@ struct gpmc_cs_data { struct resource mem; }; -struct gpmc_client_irq { - unsigned irq; - u32 bitmask; -}; - /* Structure to save gpmc cs context */ struct gpmc_cs_config { u32 config1; @@ -233,9 +226,13 @@ struct omap3_gpmc_regs { struct gpmc_cs_config cs_context[GPMC_CS_NUM]; }; -static struct gpmc_client_irq gpmc_client_irq[GPMC_NR_IRQ]; -static struct irq_chip gpmc_irq_chip; -static int gpmc_irq_start; +struct gpmc_device { + struct device *dev; + int irq; + struct irq_chip irq_chip; +}; + +static struct irq_domain *gpmc_irq_domain; static struct resource gpmc_mem_root; static struct gpmc_cs_data gpmc_cs[GPMC_CS_NUM]; @@ -243,8 +240,6 @@ static DEFINE_SPINLOCK(gpmc_mem_lock); /* Define chip-selects as reserved by default until probe completes */ static unsigned int gpmc_cs_num = GPMC_CS_NUM; static unsigned int gpmc_nr_waitpins; -static struct device *gpmc_dev; -static int gpmc_irq; static resource_size_t phys_base, mem_size; static unsigned gpmc_capability; static void __iomem *gpmc_base; @@ -1056,14 +1051,6 @@ int gpmc_configure(int cmd, int wval) u32 regval; switch (cmd) { - case GPMC_ENABLE_IRQ: - gpmc_write_reg(GPMC_IRQENABLE, wval); - break; - - case GPMC_SET_IRQ_STATUS: - gpmc_write_reg(GPMC_IRQSTATUS, wval); - break; - case GPMC_CONFIG_WP: regval = gpmc_read_reg(GPMC_CONFIG); if (wval) @@ -1153,85 +1140,97 @@ EXPORT_SYMBOL_GPL(gpmc_omap_get_nand_ops); int 
gpmc_get_client_irq(unsigned irq_config) { - int i; - - if (hweight32(irq_config) > 1) + if (!gpmc_irq_domain) { + pr_warn("%s called before GPMC IRQ domain available\n", + __func__); return 0; + } - for (i = 0; i < GPMC_NR_IRQ; i++) - if (gpmc_client_irq[i].bitmask & irq_config) - return gpmc_client_irq[i].irq; + if (irq_config >= GPMC_NR_IRQ) + return 0; - return 0; + return irq_create_mapping(gpmc_irq_domain, irq_config); } -static int gpmc_irq_endis(unsigned irq, bool endis) +static int gpmc_irq_endis(unsigned long hwirq, bool endis) { - int i; u32 regval; - for (i = 0; i < GPMC_NR_IRQ; i++) - if (irq == gpmc_client_irq[i].irq) { - regval = gpmc_read_reg(GPMC_IRQENABLE); - if (endis) - regval |= gpmc_client_irq[i].bitmask; - else - regval &= ~gpmc_client_irq[i].bitmask; - gpmc_write_reg(GPMC_IRQENABLE, regval); - break; - } + regval = gpmc_read_reg(GPMC_IRQENABLE); + if (endis) + regval |= BIT(hwirq); + else + regval &= ~BIT(hwirq); + gpmc_write_reg(GPMC_IRQENABLE, regval); return 0; } static void gpmc_irq_disable(struct irq_data *p) { - gpmc_irq_endis(p->irq, false); + gpmc_irq_endis(p->hwirq, false); } static void gpmc_irq_enable(struct irq_data *p) { - gpmc_irq_endis(p->irq, true); + gpmc_irq_endis(p->hwirq, true); } static void gpmc_irq_noop(struct irq_data *data) { } static unsigned int gpmc_irq_noop_ret(struct irq_data *data) { return 0; } -static int gpmc_setup_irq(void) +static int gpmc_irq_map(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw) { - int i; + struct gpmc_device *gpmc = d->host_data; + + irq_set_chip_data(virq, gpmc); + irq_set_chip_and_handler(virq, &gpmc->irq_chip, handle_simple_irq); + irq_modify_status(virq, IRQ_NOREQUEST, IRQ_NOAUTOEN); + + return 0; +} + +static const struct irq_domain_ops gpmc_irq_domain_ops = { + .map = gpmc_irq_map, + .xlate = irq_domain_xlate_twocell, +}; + +static irqreturn_t gpmc_handle_irq(int irq, void *data) +{ + int hwirq, virq; u32 regval; + struct gpmc_device *gpmc = data; - if (!gpmc_irq) - return -EINVAL; + regval = gpmc_read_reg(GPMC_IRQSTATUS); - gpmc_irq_start = irq_alloc_descs(-1, 0, GPMC_NR_IRQ, 0); - if (gpmc_irq_start < 0) { - pr_err("irq_alloc_descs failed\n"); - return gpmc_irq_start; - } + if (!regval) + return IRQ_NONE; - gpmc_irq_chip.name = "gpmc"; - gpmc_irq_chip.irq_startup = gpmc_irq_noop_ret; - gpmc_irq_chip.irq_enable = gpmc_irq_enable; - gpmc_irq_chip.irq_disable = gpmc_irq_disable; - gpmc_irq_chip.irq_shutdown = gpmc_irq_noop; - gpmc_irq_chip.irq_ack = gpmc_irq_noop; - gpmc_irq_chip.irq_mask = gpmc_irq_noop; - gpmc_irq_chip.irq_unmask = gpmc_irq_noop; - - gpmc_client_irq[0].bitmask = GPMC_IRQ_FIFOEVENTENABLE; - gpmc_client_irq[1].bitmask = GPMC_IRQ_COUNT_EVENT; - - for (i = 0; i < GPMC_NR_IRQ; i++) { - gpmc_client_irq[i].irq = gpmc_irq_start + i; - irq_set_chip_and_handler(gpmc_client_irq[i].irq, - &gpmc_irq_chip, handle_simple_irq); - irq_modify_status(gpmc_client_irq[i].irq, IRQ_NOREQUEST, - IRQ_NOAUTOEN); + for (hwirq = 0; hwirq < GPMC_NR_IRQ; hwirq++) { + if (regval & BIT(hwirq)) { + virq = irq_find_mapping(gpmc_irq_domain, hwirq); + if (!virq) { + dev_warn(gpmc->dev, + "spurious irq detected hwirq %d, virq %d\n", + hwirq, virq); + } + + generic_handle_irq(virq); + } } + gpmc_write_reg(GPMC_IRQSTATUS, regval); + + return IRQ_HANDLED; +} + +static int gpmc_setup_irq(struct gpmc_device *gpmc) +{ + u32 regval; + int rc; + /* Disable interrupts */ gpmc_write_reg(GPMC_IRQENABLE, 0); @@ -1239,22 +1238,46 @@ static int gpmc_setup_irq(void) regval = gpmc_read_reg(GPMC_IRQSTATUS); 
gpmc_write_reg(GPMC_IRQSTATUS, regval); - return request_irq(gpmc_irq, gpmc_handle_irq, 0, "gpmc", NULL); + gpmc->irq_chip.name = "gpmc"; + gpmc->irq_chip.irq_startup = gpmc_irq_noop_ret; + gpmc->irq_chip.irq_enable = gpmc_irq_enable; + gpmc->irq_chip.irq_disable = gpmc_irq_disable; + gpmc->irq_chip.irq_shutdown = gpmc_irq_noop; + gpmc->irq_chip.irq_ack = gpmc_irq_noop; + gpmc->irq_chip.irq_mask = gpmc_irq_noop; + gpmc->irq_chip.irq_unmask = gpmc_irq_noop; + + gpmc_irq_domain = irq_domain_add_linear(gpmc->dev->of_node, + GPMC_NR_IRQ, + &gpmc_irq_domain_ops, + gpmc); + if (!gpmc_irq_domain) { + dev_err(gpmc->dev, "IRQ domain add failed\n"); + return -ENODEV; + } + + rc = request_irq(gpmc->irq, gpmc_handle_irq, 0, "gpmc", gpmc); + if (rc) { + dev_err(gpmc->dev, "failed to request irq %d: %d\n", + gpmc->irq, rc); + irq_domain_remove(gpmc_irq_domain); + gpmc_irq_domain = NULL; + } + + return rc; } -static int gpmc_free_irq(void) +static int gpmc_free_irq(struct gpmc_device *gpmc) { - int i; + int hwirq; - if (gpmc_irq) - free_irq(gpmc_irq, NULL); + free_irq(gpmc->irq, gpmc); - for (i = 0; i < GPMC_NR_IRQ; i++) { - irq_set_handler(gpmc_client_irq[i].irq, NULL); - irq_set_chip(gpmc_client_irq[i].irq, &no_irq_chip); - } + for (hwirq = 0; hwirq < GPMC_NR_IRQ; hwirq++) + irq_dispose_mapping(irq_find_mapping(gpmc_irq_domain, hwirq)); - irq_free_descs(gpmc_irq_start, GPMC_NR_IRQ); + irq_domain_remove(gpmc_irq_domain); + gpmc_irq_domain = NULL; return 0; } @@ -2154,6 +2177,14 @@ static int gpmc_probe(struct platform_device *pdev) int rc; u32 l; struct resource *res; + struct gpmc_device *gpmc; + + gpmc = devm_kzalloc(&pdev->dev, sizeof(*gpmc), GFP_KERNEL); + if (!gpmc) + return -ENOMEM; + + gpmc->dev = &pdev->dev; + platform_set_drvdata(pdev, gpmc); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (res == NULL) @@ -2167,15 +2198,16 @@ static int gpmc_probe(struct platform_device *pdev) return PTR_ERR(gpmc_base); res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (res == NULL) - dev_warn(&pdev->dev, "Failed to get resource: irq\n"); - else - gpmc_irq = res->start; + if (!res) { + dev_err(&pdev->dev, "Failed to get resource: irq\n"); + return -ENOENT; + } + + gpmc->irq = res->start; gpmc_l3_clk = devm_clk_get(&pdev->dev, "fck"); if (IS_ERR(gpmc_l3_clk)) { dev_err(&pdev->dev, "Failed to get GPMC fck\n"); - gpmc_irq = 0; return PTR_ERR(gpmc_l3_clk); } @@ -2187,8 +2219,6 @@ static int gpmc_probe(struct platform_device *pdev) pm_runtime_enable(&pdev->dev); pm_runtime_get_sync(&pdev->dev); - gpmc_dev = &pdev->dev; - l = gpmc_read_reg(GPMC_REVISION); /* @@ -2207,13 +2237,16 @@ static int gpmc_probe(struct platform_device *pdev) gpmc_capability = GPMC_HAS_WR_ACCESS | GPMC_HAS_WR_DATA_MUX_BUS; if (GPMC_REVISION_MAJOR(l) > 0x5) gpmc_capability |= GPMC_HAS_MUX_AAD; - dev_info(gpmc_dev, "GPMC revision %d.%d\n", GPMC_REVISION_MAJOR(l), + dev_info(gpmc->dev, "GPMC revision %d.%d\n", GPMC_REVISION_MAJOR(l), GPMC_REVISION_MINOR(l)); gpmc_mem_init(); - if (gpmc_setup_irq() < 0) - dev_warn(gpmc_dev, "gpmc_setup_irq failed\n"); + rc = gpmc_setup_irq(gpmc); + if (rc) { + dev_err(gpmc->dev, "gpmc_setup_irq failed\n"); + goto fail; + } if (!pdev->dev.of_node) { gpmc_cs_num = GPMC_CS_NUM; @@ -2222,21 +2255,27 @@ static int gpmc_probe(struct platform_device *pdev) rc = gpmc_probe_dt(pdev); if (rc < 0) { - pm_runtime_put_sync(&pdev->dev); - dev_err(gpmc_dev, "failed to probe DT parameters\n"); - return rc; + dev_err(gpmc->dev, "failed to probe DT parameters\n"); + gpmc_free_irq(gpmc); + goto fail; } return 0; + 
+fail: + pm_runtime_put_sync(&pdev->dev); + return rc; } static int gpmc_remove(struct platform_device *pdev) { - gpmc_free_irq(); + struct gpmc_device *gpmc = platform_get_drvdata(pdev); + + gpmc_free_irq(gpmc); gpmc_mem_exit(); pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); - gpmc_dev = NULL; + return 0; } @@ -2282,25 +2321,6 @@ static __exit void gpmc_exit(void) postcore_initcall(gpmc_init); module_exit(gpmc_exit); -static irqreturn_t gpmc_handle_irq(int irq, void *dev) -{ - int i; - u32 regval; - - regval = gpmc_read_reg(GPMC_IRQSTATUS); - - if (!regval) - return IRQ_NONE; - - for (i = 0; i < GPMC_NR_IRQ; i++) - if (regval & gpmc_client_irq[i].bitmask) - generic_handle_irq(gpmc_client_irq[i].irq); - - gpmc_write_reg(GPMC_IRQSTATUS, regval); - - return IRQ_HANDLED; -} - static struct omap3_gpmc_regs gpmc_context; void omap3_gpmc_save_context(void) diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h index dc2ada6fb9b4..9e9d79e8efa5 100644 --- a/include/linux/omap-gpmc.h +++ b/include/linux/omap-gpmc.h @@ -11,8 +11,9 @@ #define GPMC_CONFIG_WP 0x00000005 -#define GPMC_IRQ_FIFOEVENTENABLE 0x01 -#define GPMC_IRQ_COUNT_EVENT 0x02 +/* IRQ numbers in GPMC IRQ domain for legacy boot use */ +#define GPMC_IRQ_FIFOEVENTENABLE 0 +#define GPMC_IRQ_COUNT_EVENT 1 /** * gpmc_nand_ops - Interface between NAND and GPMC -- cgit v1.2.3 From c509aefd75d026f4ef4aa306131d7a780c2eda7b Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Wed, 5 Aug 2015 14:01:50 +0300 Subject: mtd: nand: omap: Use gpmc_omap_get_nand_ops() to get NAND registers Deprecate nand register passing via platform data and use gpmc_omap_get_nand_ops() instead. Signed-off-by: Roger Quadros Acked-by: Brian Norris Acked-by: Tony Lindgren --- arch/arm/mach-omap2/gpmc-nand.c | 2 -- drivers/mtd/nand/omap2.c | 9 ++++++++- include/linux/platform_data/mtd-nand-omap2.h | 4 +++- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/gpmc-nand.c b/arch/arm/mach-omap2/gpmc-nand.c index 72918c4973ea..04e6998c1529 100644 --- a/arch/arm/mach-omap2/gpmc-nand.c +++ b/arch/arm/mach-omap2/gpmc-nand.c @@ -121,8 +121,6 @@ int gpmc_nand_init(struct omap_nand_platform_data *gpmc_nand_data, if (err < 0) goto out_free_cs; - gpmc_update_nand_reg(&gpmc_nand_data->reg, gpmc_nand_data->cs); - if (!gpmc_hwecc_bch_capable(gpmc_nand_data->ecc_opt)) { pr_err("omap2-nand: Unsupported NAND ECC scheme selected\n"); err = -EINVAL; diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c index 0749ca1a1456..cba9bf0adba1 100644 --- a/drivers/mtd/nand/omap2.c +++ b/drivers/mtd/nand/omap2.c @@ -28,6 +28,7 @@ #include #include +#include #include #define DRIVER_NAME "omap2-nand" @@ -168,7 +169,9 @@ struct omap_nand_info { } iomode; u_char *buf; int buf_len; + /* Interface to GPMC */ struct gpmc_nand_regs reg; + struct gpmc_nand_ops *ops; /* generated at runtime depending on ECC algorithm and layout selected */ struct nand_ecclayout oobinfo; /* fields specific for BCHx_HW ECC scheme */ @@ -1665,9 +1668,13 @@ static int omap_nand_probe(struct platform_device *pdev) platform_set_drvdata(pdev, info); + info->ops = gpmc_omap_get_nand_ops(&info->reg, info->gpmc_cs); + if (!info->ops) { + dev_err(&pdev->dev, "Failed to get GPMC->NAND interface\n"); + return -ENODEV; + } info->pdev = pdev; info->gpmc_cs = pdata->cs; - info->reg = pdata->reg; info->of_node = pdata->of_node; info->ecc_opt = pdata->ecc_opt; nand_chip = &info->nand; diff --git a/include/linux/platform_data/mtd-nand-omap2.h 
b/include/linux/platform_data/mtd-nand-omap2.h index 090bbab0130a..a067f581e938 100644 --- a/include/linux/platform_data/mtd-nand-omap2.h +++ b/include/linux/platform_data/mtd-nand-omap2.h @@ -75,10 +75,12 @@ struct omap_nand_platform_data { enum nand_io xfer_type; int devsize; enum omap_ecc ecc_opt; - struct gpmc_nand_regs reg; /* for passing the partitions */ struct device_node *of_node; struct device_node *elm_of_node; + + /* deprecated */ + struct gpmc_nand_regs reg; }; #endif -- cgit v1.2.3 From c9711ec5250b22fd94e9b34c17c095e001a90e66 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Wed, 21 May 2014 07:29:03 +0300 Subject: mtd: nand: omap: Clean up device tree support Move NAND specific device tree parsing to NAND driver. The NAND controller node must have a compatible id, register space resource and interrupt resource. Signed-off-by: Roger Quadros Acked-by: Brian Norris Acked-by: Tony Lindgren --- arch/arm/mach-omap2/gpmc-nand.c | 5 +- drivers/memory/omap-gpmc.c | 143 +++++++-------------------- drivers/mtd/nand/omap2.c | 134 +++++++++++++++++++++---- include/linux/platform_data/mtd-nand-omap2.h | 3 +- 4 files changed, 153 insertions(+), 132 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/gpmc-nand.c b/arch/arm/mach-omap2/gpmc-nand.c index 04e6998c1529..f6ac027f3c3b 100644 --- a/arch/arm/mach-omap2/gpmc-nand.c +++ b/arch/arm/mach-omap2/gpmc-nand.c @@ -97,10 +97,7 @@ int gpmc_nand_init(struct omap_nand_platform_data *gpmc_nand_data, gpmc_nand_res[2].start = gpmc_get_client_irq(GPMC_IRQ_COUNT_EVENT); memset(&s, 0, sizeof(struct gpmc_settings)); - if (gpmc_nand_data->of_node) - gpmc_read_settings_dt(gpmc_nand_data->of_node, &s); - else - gpmc_set_legacy(gpmc_nand_data, &s); + gpmc_set_legacy(gpmc_nand_data, &s); s.device_nand = true; diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c index e28d6bc2500a..8dc6e3b1c44a 100644 --- a/drivers/memory/omap-gpmc.c +++ b/drivers/memory/omap-gpmc.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -1852,105 +1851,6 @@ static void __maybe_unused gpmc_read_timings_dt(struct device_node *np, of_property_read_bool(np, "gpmc,time-para-granularity"); } -#if IS_ENABLED(CONFIG_MTD_NAND) - -static const char * const nand_xfer_types[] = { - [NAND_OMAP_PREFETCH_POLLED] = "prefetch-polled", - [NAND_OMAP_POLLED] = "polled", - [NAND_OMAP_PREFETCH_DMA] = "prefetch-dma", - [NAND_OMAP_PREFETCH_IRQ] = "prefetch-irq", -}; - -static int gpmc_probe_nand_child(struct platform_device *pdev, - struct device_node *child) -{ - u32 val; - const char *s; - struct gpmc_timings gpmc_t; - struct omap_nand_platform_data *gpmc_nand_data; - - if (of_property_read_u32(child, "reg", &val) < 0) { - dev_err(&pdev->dev, "%s has no 'reg' property\n", - child->full_name); - return -ENODEV; - } - - gpmc_nand_data = devm_kzalloc(&pdev->dev, sizeof(*gpmc_nand_data), - GFP_KERNEL); - if (!gpmc_nand_data) - return -ENOMEM; - - gpmc_nand_data->cs = val; - gpmc_nand_data->of_node = child; - - /* Detect availability of ELM module */ - gpmc_nand_data->elm_of_node = of_parse_phandle(child, "ti,elm-id", 0); - if (gpmc_nand_data->elm_of_node == NULL) - gpmc_nand_data->elm_of_node = - of_parse_phandle(child, "elm_id", 0); - - /* select ecc-scheme for NAND */ - if (of_property_read_string(child, "ti,nand-ecc-opt", &s)) { - pr_err("%s: ti,nand-ecc-opt not found\n", __func__); - return -ENODEV; - } - - if (!strcmp(s, "sw")) - gpmc_nand_data->ecc_opt = OMAP_ECC_HAM1_CODE_SW; - else if (!strcmp(s, "ham1") || - !strcmp(s, "hw") || 
!strcmp(s, "hw-romcode")) - gpmc_nand_data->ecc_opt = - OMAP_ECC_HAM1_CODE_HW; - else if (!strcmp(s, "bch4")) - if (gpmc_nand_data->elm_of_node) - gpmc_nand_data->ecc_opt = - OMAP_ECC_BCH4_CODE_HW; - else - gpmc_nand_data->ecc_opt = - OMAP_ECC_BCH4_CODE_HW_DETECTION_SW; - else if (!strcmp(s, "bch8")) - if (gpmc_nand_data->elm_of_node) - gpmc_nand_data->ecc_opt = - OMAP_ECC_BCH8_CODE_HW; - else - gpmc_nand_data->ecc_opt = - OMAP_ECC_BCH8_CODE_HW_DETECTION_SW; - else if (!strcmp(s, "bch16")) - if (gpmc_nand_data->elm_of_node) - gpmc_nand_data->ecc_opt = - OMAP_ECC_BCH16_CODE_HW; - else - pr_err("%s: BCH16 requires ELM support\n", __func__); - else - pr_err("%s: ti,nand-ecc-opt invalid value\n", __func__); - - /* select data transfer mode for NAND controller */ - if (!of_property_read_string(child, "ti,nand-xfer-type", &s)) - for (val = 0; val < ARRAY_SIZE(nand_xfer_types); val++) - if (!strcasecmp(s, nand_xfer_types[val])) { - gpmc_nand_data->xfer_type = val; - break; - } - - gpmc_nand_data->flash_bbt = of_get_nand_on_flash_bbt(child); - - val = of_get_nand_bus_width(child); - if (val == 16) - gpmc_nand_data->devsize = NAND_BUSWIDTH_16; - - gpmc_read_timings_dt(child, &gpmc_t); - gpmc_nand_init(gpmc_nand_data, &gpmc_t); - - return 0; -} -#else -static int gpmc_probe_nand_child(struct platform_device *pdev, - struct device_node *child) -{ - return 0; -} -#endif - #if IS_ENABLED(CONFIG_MTD_ONENAND) static int gpmc_probe_onenand_child(struct platform_device *pdev, struct device_node *child) @@ -2069,9 +1969,42 @@ static int gpmc_probe_generic_child(struct platform_device *pdev, goto err; } - ret = of_property_read_u32(child, "bank-width", &gpmc_s.device_width); - if (ret < 0) - goto err; + if (of_node_cmp(child->name, "nand") == 0) { + /* Warn about older DT blobs with no compatible property */ + if (!of_property_read_bool(child, "compatible")) { + dev_warn(&pdev->dev, + "Incompatible NAND node: missing compatible"); + ret = -EINVAL; + goto err; + } + } + + if (of_device_is_compatible(child, "ti,omap2-nand")) { + /* NAND specific setup */ + val = of_get_nand_bus_width(child); + switch (val) { + case 8: + gpmc_s.device_width = GPMC_DEVWIDTH_8BIT; + break; + case 16: + gpmc_s.device_width = GPMC_DEVWIDTH_16BIT; + break; + default: + dev_err(&pdev->dev, "%s: invalid 'nand-bus-width'\n", + child->name); + ret = -EINVAL; + goto err; + } + + /* disable write protect */ + gpmc_configure(GPMC_CONFIG_WP, 0); + gpmc_s.device_nand = true; + } else { + ret = of_property_read_u32(child, "bank-width", + &gpmc_s.device_width); + if (ret < 0) + goto err; + } gpmc_cs_show_timings(cs, "before gpmc_cs_program_settings"); ret = gpmc_cs_program_settings(cs, &gpmc_s); @@ -2155,9 +2088,7 @@ static int gpmc_probe_dt(struct platform_device *pdev) if (!child->name) continue; - if (of_node_cmp(child->name, "nand") == 0) - ret = gpmc_probe_nand_child(pdev, child); - else if (of_node_cmp(child->name, "onenand") == 0) + if (of_node_cmp(child->name, "onenand") == 0) ret = gpmc_probe_onenand_child(pdev, child); else ret = gpmc_probe_generic_child(pdev, child); diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c index 7e4e263c7d9c..35b8f3359c17 100644 --- a/drivers/mtd/nand/omap2.c +++ b/drivers/mtd/nand/omap2.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -176,11 +177,11 @@ struct omap_nand_info { /* Interface to GPMC */ struct gpmc_nand_regs reg; struct gpmc_nand_ops *ops; + bool flash_bbt; /* generated at runtime depending on ECC algorithm and layout selected */ struct 
nand_ecclayout oobinfo; /* fields specific for BCHx_HW ECC scheme */ struct device *elm_dev; - struct device_node *of_node; }; static inline struct omap_nand_info *mtd_to_omap(struct mtd_info *mtd) @@ -1643,10 +1644,86 @@ static bool omap2_nand_ecc_check(struct omap_nand_info *info, return true; } +static const char * const nand_xfer_types[] = { + [NAND_OMAP_PREFETCH_POLLED] = "prefetch-polled", + [NAND_OMAP_POLLED] = "polled", + [NAND_OMAP_PREFETCH_DMA] = "prefetch-dma", + [NAND_OMAP_PREFETCH_IRQ] = "prefetch-irq", +}; + +static int omap_get_dt_info(struct device *dev, struct omap_nand_info *info) +{ + struct device_node *child = dev->of_node; + int i; + const char *s; + u32 cs; + + if (of_property_read_u32(child, "reg", &cs) < 0) { + dev_err(dev, "reg not found in DT\n"); + return -EINVAL; + } + + info->gpmc_cs = cs; + + /* detect availability of ELM module. Won't be present pre-OMAP4 */ + info->elm_of_node = of_parse_phandle(child, "ti,elm-id", 0); + if (!info->elm_of_node) + dev_dbg(dev, "ti,elm-id not in DT\n"); + + /* select ecc-scheme for NAND */ + if (of_property_read_string(child, "ti,nand-ecc-opt", &s)) { + dev_err(dev, "ti,nand-ecc-opt not found\n"); + return -EINVAL; + } + + if (!strcmp(s, "sw")) { + info->ecc_opt = OMAP_ECC_HAM1_CODE_SW; + } else if (!strcmp(s, "ham1") || + !strcmp(s, "hw") || !strcmp(s, "hw-romcode")) { + info->ecc_opt = OMAP_ECC_HAM1_CODE_HW; + } else if (!strcmp(s, "bch4")) { + if (info->elm_of_node) + info->ecc_opt = OMAP_ECC_BCH4_CODE_HW; + else + info->ecc_opt = OMAP_ECC_BCH4_CODE_HW_DETECTION_SW; + } else if (!strcmp(s, "bch8")) { + if (info->elm_of_node) + info->ecc_opt = OMAP_ECC_BCH8_CODE_HW; + else + info->ecc_opt = OMAP_ECC_BCH8_CODE_HW_DETECTION_SW; + } else if (!strcmp(s, "bch16")) { + info->ecc_opt = OMAP_ECC_BCH16_CODE_HW; + } else { + dev_err(dev, "unrecognized value for ti,nand-ecc-opt\n"); + return -EINVAL; + } + + /* select data transfer mode */ + if (!of_property_read_string(child, "ti,nand-xfer-type", &s)) { + for (i = 0; i < ARRAY_SIZE(nand_xfer_types); i++) { + if (!strcasecmp(s, nand_xfer_types[i])) { + info->xfer_type = i; + goto next; + } + } + + dev_err(dev, "unrecognized value for ti,nand-xfer-type\n"); + return -EINVAL; + } + +next: + info->flash_bbt = of_get_nand_on_flash_bbt(child); + + if (of_get_nand_bus_width(child) == 16) + info->devsize = NAND_BUSWIDTH_16; + + return 0; +} + static int omap_nand_probe(struct platform_device *pdev) { struct omap_nand_info *info; - struct omap_nand_platform_data *pdata; + struct omap_nand_platform_data *pdata = NULL; struct mtd_info *mtd; struct nand_chip *nand_chip; struct nand_ecclayout *ecclayout; @@ -1656,39 +1733,47 @@ static int omap_nand_probe(struct platform_device *pdev) unsigned sig; unsigned oob_index; struct resource *res; - - pdata = dev_get_platdata(&pdev->dev); - if (pdata == NULL) { - dev_err(&pdev->dev, "platform data missing\n"); - return -ENODEV; - } + struct device *dev = &pdev->dev; info = devm_kzalloc(&pdev->dev, sizeof(struct omap_nand_info), GFP_KERNEL); if (!info) return -ENOMEM; - platform_set_drvdata(pdev, info); + info->pdev = pdev; + if (dev->of_node) { + if (omap_get_dt_info(dev, info)) + return -EINVAL; + } else { + pdata = dev_get_platdata(&pdev->dev); + if (!pdata) { + dev_err(&pdev->dev, "platform data missing\n"); + return -EINVAL; + } + + info->gpmc_cs = pdata->cs; + info->reg = pdata->reg; + info->ecc_opt = pdata->ecc_opt; + info->dev_ready = pdata->dev_ready; + info->xfer_type = pdata->xfer_type; + info->devsize = pdata->devsize; + info->elm_of_node = pdata->elm_of_node; +
info->flash_bbt = pdata->flash_bbt; + } + + platform_set_drvdata(pdev, info); info->ops = gpmc_omap_get_nand_ops(&info->reg, info->gpmc_cs); if (!info->ops) { dev_err(&pdev->dev, "Failed to get GPMC->NAND interface\n"); return -ENODEV; } - info->pdev = pdev; - info->gpmc_cs = pdata->cs; - info->of_node = pdata->of_node; - info->ecc_opt = pdata->ecc_opt; - info->dev_ready = pdata->dev_ready; - info->xfer_type = pdata->xfer_type; - info->devsize = pdata->devsize; - info->elm_of_node = pdata->elm_of_node; nand_chip = &info->nand; mtd = nand_to_mtd(nand_chip); mtd->dev.parent = &pdev->dev; nand_chip->ecc.priv = NULL; - nand_set_flash_node(nand_chip, pdata->of_node); + nand_set_flash_node(nand_chip, dev->of_node); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); nand_chip->IO_ADDR_R = devm_ioremap_resource(&pdev->dev, res); @@ -1717,7 +1802,7 @@ static int omap_nand_probe(struct platform_device *pdev) nand_chip->chip_delay = 50; } - if (pdata->flash_bbt) + if (info->flash_bbt) nand_chip->bbt_options |= NAND_BBT_USE_FLASH | NAND_BBT_NO_OOB; else nand_chip->options |= NAND_SKIP_BBTSCAN; @@ -2035,7 +2120,10 @@ scan_tail: goto return_error; } - mtd_device_register(mtd, pdata->parts, pdata->nr_parts); + if (dev->of_node) + mtd_device_register(mtd, NULL, 0); + else + mtd_device_register(mtd, pdata->parts, pdata->nr_parts); platform_set_drvdata(pdev, mtd); @@ -2066,11 +2154,17 @@ static int omap_nand_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id omap_nand_ids[] = { + { .compatible = "ti,omap2-nand", }, + {}, +}; + static struct platform_driver omap_nand_driver = { .probe = omap_nand_probe, .remove = omap_nand_remove, .driver = { .name = DRIVER_NAME, + .of_match_table = of_match_ptr(omap_nand_ids), }, }; diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h index a067f581e938..ff27e5a77e03 100644 --- a/include/linux/platform_data/mtd-nand-omap2.h +++ b/include/linux/platform_data/mtd-nand-omap2.h @@ -76,11 +76,10 @@ struct omap_nand_platform_data { int devsize; enum omap_ecc ecc_opt; - /* for passing the partitions */ - struct device_node *of_node; struct device_node *elm_of_node; /* deprecated */ struct gpmc_nand_regs reg; + struct device_node *of_node; }; #endif -- cgit v1.2.3 From 9e6946215dbd9803e8b511928c9f61f3a49e2c58 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Fri, 7 Aug 2015 10:38:13 +0300 Subject: memory: omap-gpmc: Prevent GPMC_STATUS from being accessed via gpmc_regs The GPMC_STATUS register is private to the GPMC module and must not be accessed directly by the NAND driver through gpmc_regs. It must use gpmc_omap_get_nand_ops() instead.
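For illustration, a NAND driver is expected to reach controller state through the ops returned by gpmc_omap_get_nand_ops() rather than through regs.gpmc_status (now NULL). A minimal sketch, assuming the nand_writebuffer_empty() callback provided by struct gpmc_nand_ops:

	struct gpmc_nand_regs regs;
	struct gpmc_nand_ops *ops;

	ops = gpmc_omap_get_nand_ops(&regs, cs);
	if (!ops)
		return -ENODEV;

	/* poll controller state via the ops, not readl(regs.gpmc_status) */
	while (!ops->nand_writebuffer_empty())
		cpu_relax();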
Signed-off-by: Roger Quadros Acked-by: Tony Lindgren --- drivers/memory/omap-gpmc.c | 2 +- include/linux/platform_data/mtd-nand-omap2.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c index ea9c89747950..33d69b1e4c31 100644 --- a/drivers/memory/omap-gpmc.c +++ b/drivers/memory/omap-gpmc.c @@ -1081,7 +1081,7 @@ void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs) { int i; - reg->gpmc_status = gpmc_base + GPMC_STATUS; + reg->gpmc_status = NULL; /* deprecated */ reg->gpmc_nand_command = gpmc_base + GPMC_CS0_OFFSET + GPMC_CS_NAND_COMMAND + GPMC_CS_SIZE * cs; reg->gpmc_nand_address = gpmc_base + GPMC_CS0_OFFSET + diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h index ff27e5a77e03..7f6de5377f80 100644 --- a/include/linux/platform_data/mtd-nand-omap2.h +++ b/include/linux/platform_data/mtd-nand-omap2.h @@ -45,7 +45,6 @@ enum omap_ecc { }; struct gpmc_nand_regs { - void __iomem *gpmc_status; void __iomem *gpmc_nand_command; void __iomem *gpmc_nand_address; void __iomem *gpmc_nand_data; @@ -64,6 +63,8 @@ struct gpmc_nand_regs { void __iomem *gpmc_bch_result4[GPMC_BCH_NUM_REMAINDER]; void __iomem *gpmc_bch_result5[GPMC_BCH_NUM_REMAINDER]; void __iomem *gpmc_bch_result6[GPMC_BCH_NUM_REMAINDER]; + /* Deprecated. Do not use */ + void __iomem *gpmc_status; }; struct omap_nand_platform_data { -- cgit v1.2.3 From 10f22ee367c4aff7841da6a83c10445d7d6328d9 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 6 Aug 2015 17:39:35 +0300 Subject: mtd: nand: omap2: Implement NAND ready using gpiolib The GPMC WAIT pin status is now available through gpiolib. Update the omap_dev_ready() function to use a GPIO instead of directly accessing GPMC register space. Signed-off-by: Roger Quadros Acked-by: Brian Norris Acked-by: Boris Brezillon Acked-by: Tony Lindgren --- .../devicetree/bindings/mtd/gpmc-nand.txt | 2 ++ drivers/mtd/nand/omap2.c | 29 ++++++++++++++-------- include/linux/platform_data/mtd-nand-omap2.h | 2 +- 3 files changed, 21 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt index ff3215d20343..3ee7e202657c 100644 --- a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt +++ b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt @@ -48,6 +48,7 @@ Optional properties: locating ECC errors for BCHx algorithms. SoC devices which have ELM hardware engines should specify this device node in .dtsi Using ELM for ECC error correction frees some CPU cycles. + - rb-gpios: GPIO specifier for the ready/busy# pin.
For inline partition table parsing (optional): @@ -78,6 +79,7 @@ Example for an AM33xx board: nand-bus-width = <16>; ti,nand-ecc-opt = "bch8"; ti,nand-xfer-type = "polled"; + rb-gpios = <&gpmc 0 GPIO_ACTIVE_HIGH>; /* gpmc_wait0 */ gpmc,sync-clk-ps = <0>; gpmc,cs-on-ns = <0>; diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c index 35b8f3359c17..e0b2b2f0fbde 100644 --- a/drivers/mtd/nand/omap2.c +++ b/drivers/mtd/nand/omap2.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -182,6 +183,8 @@ struct omap_nand_info { struct nand_ecclayout oobinfo; /* fields specific for BCHx_HW ECC scheme */ struct device *elm_dev; + /* NAND ready gpio */ + struct gpio_desc *ready_gpiod; }; static inline struct omap_nand_info *mtd_to_omap(struct mtd_info *mtd) @@ -1023,21 +1026,16 @@ static int omap_wait(struct mtd_info *mtd, struct nand_chip *chip) } /** - * omap_dev_ready - calls the platform specific dev_ready function + * omap_dev_ready - checks the NAND Ready GPIO line * @mtd: MTD device structure + * + * Returns true if ready and false if busy. */ static int omap_dev_ready(struct mtd_info *mtd) { - unsigned int val = 0; struct omap_nand_info *info = mtd_to_omap(mtd); - val = readl(info->reg.gpmc_status); - - if ((val & 0x100) == 0x100) { - return 1; - } else { - return 0; - } + return gpiod_get_value(info->ready_gpiod); } /** @@ -1755,7 +1753,9 @@ static int omap_nand_probe(struct platform_device *pdev) info->gpmc_cs = pdata->cs; info->reg = pdata->reg; info->ecc_opt = pdata->ecc_opt; - info->dev_ready = pdata->dev_ready; + if (pdata->dev_ready) + dev_info(&pdev->dev, "pdata->dev_ready is deprecated\n"); + info->xfer_type = pdata->xfer_type; info->devsize = pdata->devsize; info->elm_of_node = pdata->elm_of_node; @@ -1787,6 +1787,13 @@ static int omap_nand_probe(struct platform_device *pdev) nand_chip->IO_ADDR_W = nand_chip->IO_ADDR_R; nand_chip->cmd_ctrl = omap_hwcontrol; + info->ready_gpiod = devm_gpiod_get_optional(&pdev->dev, "rb", + GPIOD_IN); + if (IS_ERR(info->ready_gpiod)) { + dev_err(dev, "failed to get ready gpio\n"); + return PTR_ERR(info->ready_gpiod); + } + /* * If RDY/BSY line is connected to OMAP then use the omap ready * function and the generic nand_wait function which reads the status * chip delay which is slightly more than tR (AC Timing) of the NAND * device and read status register until you get a failure or success */ - if (info->dev_ready) { + if (info->ready_gpiod) { nand_chip->dev_ready = omap_dev_ready; nand_chip->chip_delay = 0; } else { diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h index 7f6de5377f80..17d57a18bac5 100644 --- a/include/linux/platform_data/mtd-nand-omap2.h +++ b/include/linux/platform_data/mtd-nand-omap2.h @@ -71,7 +71,6 @@ struct omap_nand_platform_data { int cs; struct mtd_partition *parts; int nr_parts; - bool dev_ready; bool flash_bbt; enum nand_io xfer_type; int devsize; @@ -82,5 +81,6 @@ struct omap_nand_platform_data { /* deprecated */ struct gpmc_nand_regs reg; struct device_node *of_node; + bool dev_ready; }; #endif -- cgit v1.2.3 From 675f10bde6cc3874632a8f684df2a8a2a8ace76e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Feb 2016 18:29:18 +0800 Subject: f2fs: fix to convert inline directory correctly With the following sequence of steps, we will lose some of the dirents: 1) mount f2fs with inline_dentry option 2) echo 1 > /sys/fs/f2fs/sdX/dir_level 3) mkdir dir 4) touch 180 files
named [1-180] in dir 5) touch 181 in dir 6) echo 3 > /proc/sys/vm/drop_caches 7) ll dir ls: cannot access 2: No such file or directory ls: cannot access 4: No such file or directory ls: cannot access 5: No such file or directory ls: cannot access 6: No such file or directory ls: cannot access 8: No such file or directory ls: cannot access 9: No such file or directory ... total 360 drwxr-xr-x 2 root root 4096 Feb 19 15:12 ./ drwxr-xr-x 3 root root 4096 Feb 19 15:11 ../ -rw-r--r-- 1 root root 0 Feb 19 15:12 1 -rw-r--r-- 1 root root 0 Feb 19 15:12 10 -rw-r--r-- 1 root root 0 Feb 19 15:12 100 -????????? ? ? ? ? ? 101 -????????? ? ? ? ? ? 102 -????????? ? ? ? ? ? 103 ... The reason is: when doing the inline dir conversion, we didn't consider that a directory has a hierarchical hash structure which can be configured through the sysfs interface 'dir_level'. By default, the dir_level of a directory inode is 0; it means we have one bucket in the hash table located in the first level, and all dirents will be hashed into this bucket, so there is no problem in simply duplicating between the inline dentry page and the converted normal dentry page. However, if we configured dir_level with a value N (greater than 0), it will expand the number of buckets in the first-level hash table by 2^N - 1, and it hashes dirents into different buckets according to their hash values. If we still move all dirents to the first bucket, inline dirents will be located incorrectly; the result is that, although we can iterate over all dirents through ->readdir, we can't stat some of them in ->lookup, which is based on hash table searching. This patch fixes this issue by rehashing dirents into their correct positions when converting an inline directory. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 87 ++++++++++++++++++++++--------------------- fs/f2fs/f2fs.h | 4 ++- fs/f2fs/inline.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/f2fs_fs.h | 2 ++ 4 files changed, 144 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index af819571bce7..e90380d82214 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_SYMLINK] = DT_LNK, }; -#define S_SHIFT 12 static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, @@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } +unsigned char get_de_type(struct f2fs_dir_entry *de) +{ + if (de->file_type < F2FS_FT_MAX) + return f2fs_filetype_table[de->file_type]; + return DT_UNKNOWN; +} + static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { @@ -509,11 +515,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, } } -/* - * Caller should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op().
- */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; @@ -526,28 +528,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; struct page *page = NULL; - struct fscrypt_name fname; - struct qstr new_name; - int slots, err; - - err = fscrypt_setup_filename(dir, name, 0, &fname); - if (err) - return err; - - new_name.name = fname_name(&fname); - new_name.len = fname_len(&fname); - - if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); - if (!err || err != -EAGAIN) - goto out; - else - err = 0; - } + int slots, err = 0; level = 0; - slots = GET_DENTRY_SLOTS(new_name.len); - dentry_hash = f2fs_dentry_hash(&new_name); + slots = GET_DENTRY_SLOTS(new_name->len); + dentry_hash = f2fs_dentry_hash(new_name); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -556,10 +541,8 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } start: - if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) { - err = -ENOSPC; - goto out; - } + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) @@ -573,10 +556,8 @@ start: for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) { - err = PTR_ERR(dentry_page); - goto out; - } + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, @@ -596,7 +577,7 @@ add_dentry: if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, &new_name, NULL); + page = init_inode_metadata(inode, dir, new_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -606,7 +587,7 @@ add_dentry: } make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); - f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -628,7 +609,34 @@ fail: } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); -out: + + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). 
+ */ +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct fscrypt_name fname; + struct qstr new_name; + int err; + + err = fscrypt_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + new_name.name = fname_name(&fname); + new_name.len = fname_len(&fname); + + err = -EAGAIN; + if (f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, inode, ino, mode); + fscrypt_free_filename(&fname); f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; @@ -792,10 +800,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, continue; } - if (de->file_type < F2FS_FT_MAX) - d_type = f2fs_filetype_table[de->file_type]; - else - d_type = DT_UNKNOWN; + d_type = get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e1c07b60f301..3f1551395244 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1711,7 +1711,7 @@ struct dentry *f2fs_get_parent(struct dentry *child); */ extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); - +unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, @@ -1732,6 +1732,8 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); +int f2fs_add_regular_entry(struct inode *, const struct qstr *, + struct inode *, nid_t, umode_t); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a2fbe6f427d3..772056587eb9 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -355,7 +355,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * NOTE: ipage is grabbed by caller, but if any error occurs, we should * release ipage in this function. 
*/ -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { struct page *page; @@ -416,6 +416,98 @@ out: return err; } +static int f2fs_add_inline_entries(struct inode *dir, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + + while (bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct qstr new_name; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + new_name.name = d.filename[bit_pos]; + new_name.len = de->name_len; + + ino = le32_to_cpu(de->ino); + fake_mode = get_de_type(de) << S_SHIFT; + + err = f2fs_add_regular_entry(dir, &new_name, NULL, + ino, fake_mode); + if (err) + goto punch_dentry_pages; + + if (unlikely(!de->name_len)) + d.max = -1; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + truncate_blocks(dir, 0, false); + remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_inline_dentry *backup_dentry; + int err; + + backup_dentry = kmalloc(sizeof(struct f2fs_inline_dentry), + GFP_F2FS_ZERO); + if (!backup_dentry) + return -ENOMEM; + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + truncate_inline_inode(ipage, 0); + + unlock_page(ipage); + + err = f2fs_add_inline_entries(dir, backup_dentry); + if (err) + goto recover; + + lock_page(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + update_inode(dir, ipage); + kfree(backup_dentry); + return 0; +recover: + lock_page(ipage); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + i_size_write(dir, MAX_INLINE_DATA); + update_inode(dir, ipage); + f2fs_put_page(ipage, 1); + + kfree(backup_dentry); + return err; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); +} + int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index b90e9bdbd1dd..4c02c6521fef 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -508,4 +508,6 @@ enum { F2FS_FT_MAX }; +#define S_SHIFT 12 + #endif /* _LINUX_F2FS_FS_H */ -- cgit v1.2.3 From 522566376a3f8373fbd5ff75bb8a7a2da701c1a7 Mon Sep 17 00:00:00 2001 From: Aviya Erenfeld Date: Thu, 14 Apr 2016 11:59:31 +0200 Subject: devcoredump: add scatterlist support Add scatterlist support (dev_coredumpsg) to allow drivers to avoid vmalloc() like dev_coredumpm(), while also avoiding the module reference that the latter function requires. This internally uses dev_coredumpm() with a function inside the devcoredump module, which requires removing the const (and touches the driver using it).
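For example, a driver that already collects its dump into a (possibly chained) scatterlist can hand it over directly. A minimal sketch, where build_dump_sgtable() is a hypothetical driver helper:

	struct scatterlist *table;
	size_t len;

	/* build_dump_sgtable() is hypothetical: it allocates pages and
	 * chains them into a scatterlist holding the dump data
	 */
	table = build_dump_sgtable(&len);
	if (table)
		dev_coredumpsg(dev, table, len, GFP_KERNEL);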
Signed-off-by: Aviya Erenfeld Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- drivers/base/devcoredump.c | 83 +++++++++++++++++++++--- drivers/net/wireless/intel/iwlwifi/mvm/fw-dbg.c | 4 +- include/linux/devcoredump.h | 86 ++++++++++++++++++++++--- 3 files changed, 154 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c index 1bd120a0b084..240374fd1838 100644 --- a/drivers/base/devcoredump.c +++ b/drivers/base/devcoredump.c @@ -4,6 +4,7 @@ * GPL LICENSE SUMMARY * * Copyright(c) 2014 Intel Mobile Communications GmbH + * Copyright(c) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -41,12 +42,12 @@ static bool devcd_disabled; struct devcd_entry { struct device devcd_dev; - const void *data; + void *data; size_t datalen; struct module *owner; ssize_t (*read)(char *buffer, loff_t offset, size_t count, - const void *data, size_t datalen); - void (*free)(const void *data); + void *data, size_t datalen); + void (*free)(void *data); struct delayed_work del_wk; struct device *failing_dev; }; @@ -174,7 +175,7 @@ static struct class devcd_class = { }; static ssize_t devcd_readv(char *buffer, loff_t offset, size_t count, - const void *data, size_t datalen) + void *data, size_t datalen) { if (offset > datalen) return -EINVAL; @@ -188,6 +189,11 @@ static ssize_t devcd_readv(char *buffer, loff_t offset, size_t count, return count; } +static void devcd_freev(void *data) +{ + vfree(data); +} + /** * dev_coredumpv - create device coredump with vmalloc data * @dev: the struct device for the crashed device @@ -198,10 +204,10 @@ static ssize_t devcd_readv(char *buffer, loff_t offset, size_t count, * This function takes ownership of the vmalloc'ed data and will free * it when it is no longer used. See dev_coredumpm() for more information. */ -void dev_coredumpv(struct device *dev, const void *data, size_t datalen, +void dev_coredumpv(struct device *dev, void *data, size_t datalen, gfp_t gfp) { - dev_coredumpm(dev, NULL, data, datalen, gfp, devcd_readv, vfree); + dev_coredumpm(dev, NULL, data, datalen, gfp, devcd_readv, devcd_freev); } EXPORT_SYMBOL_GPL(dev_coredumpv); @@ -212,6 +218,44 @@ static int devcd_match_failing(struct device *dev, const void *failing) return devcd->failing_dev == failing; } +/** + * devcd_free_sgtable - free all the memory of the given scatterlist table + * (i.e. 
both pages and scatterlist instances) + * NOTE: if two tables allocated with devcd_alloc_sgtable and then chained + * using the sg_chain function then that function should be called only once + * on the chained table + * @table: pointer to sg_table to free + */ +static void devcd_free_sgtable(void *data) +{ + _devcd_free_sgtable(data); +} + +/** + * devcd_read_from_sgtable - copy data from sg_table to a given buffer + * and return the number of bytes read + * @buffer: the buffer to copy the data to + * @buf_len: the length of the buffer + * @data: the scatterlist table to copy from + * @offset: start copy from @offset@ bytes from the head of the data + * in the given scatterlist + * @data_len: the length of the data in the sg_table + */ +static ssize_t devcd_read_from_sgtable(char *buffer, loff_t offset, + size_t buf_len, void *data, + size_t data_len) +{ + struct scatterlist *table = data; + + if (offset > data_len) + return -EINVAL; + + if (offset + buf_len > data_len) + buf_len = data_len - offset; + return sg_pcopy_to_buffer(table, sg_nents(table), buffer, buf_len, + offset); +} + /** * dev_coredumpm - create device coredump with read/free methods * @dev: the struct device for the crashed device @@ -228,10 +272,10 @@ static int devcd_match_failing(struct device *dev, const void *failing) * function will be called to free the data. */ void dev_coredumpm(struct device *dev, struct module *owner, - const void *data, size_t datalen, gfp_t gfp, + void *data, size_t datalen, gfp_t gfp, ssize_t (*read)(char *buffer, loff_t offset, size_t count, - const void *data, size_t datalen), - void (*free)(const void *data)) + void *data, size_t datalen), + void (*free)(void *data)) { static atomic_t devcd_count = ATOMIC_INIT(0); struct devcd_entry *devcd; @@ -291,6 +335,27 @@ void dev_coredumpm(struct device *dev, struct module *owner, } EXPORT_SYMBOL_GPL(dev_coredumpm); +/** + * dev_coredumpsg - create device coredump that uses scatterlist as data + * parameter + * @dev: the struct device for the crashed device + * @table: the dump data + * @datalen: length of the data + * @gfp: allocation flags + * + * Creates a new device coredump for the given device. If a previous one hasn't + * been read yet, the new coredump is discarded. The data lifetime is determined + * by the device coredump framework and when it is no longer needed + * it will free the data.
+ */ +void dev_coredumpsg(struct device *dev, struct scatterlist *table, + size_t datalen, gfp_t gfp) +{ + dev_coredumpm(dev, NULL, table, datalen, gfp, devcd_read_from_sgtable, + devcd_free_sgtable); +} +EXPORT_SYMBOL_GPL(dev_coredumpsg); + static int __init devcoredump_init(void) { return class_register(&devcd_class); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw-dbg.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw-dbg.c index 4856eac120f6..a4b0581d2275 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw-dbg.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw-dbg.c @@ -71,7 +71,7 @@ #include "iwl-csr.h" static ssize_t iwl_mvm_read_coredump(char *buffer, loff_t offset, size_t count, - const void *data, size_t datalen) + void *data, size_t datalen) { const struct iwl_mvm_dump_ptrs *dump_ptrs = data; ssize_t bytes_read; @@ -104,7 +104,7 @@ static ssize_t iwl_mvm_read_coredump(char *buffer, loff_t offset, size_t count, return bytes_read + bytes_read_trans; } -static void iwl_mvm_free_coredump(const void *data) +static void iwl_mvm_free_coredump(void *data) { const struct iwl_mvm_dump_ptrs *fw_error_dump = data; diff --git a/include/linux/devcoredump.h b/include/linux/devcoredump.h index c0a360e99f64..269521f143ac 100644 --- a/include/linux/devcoredump.h +++ b/include/linux/devcoredump.h @@ -1,3 +1,22 @@ +/* + * This file is provided under the GPLv2 license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Deutschland GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * The full GNU General Public License is included in this distribution + * in the file called COPYING. + */ #ifndef __DEVCOREDUMP_H #define __DEVCOREDUMP_H @@ -5,17 +24,62 @@ #include #include +#include +#include + +/* + * _devcd_free_sgtable - free all the memory of the given scatterlist table + * (i.e. 
both pages and scatterlist instances) + * NOTE: if two tables allocated and chained using the sg_chain function then + * this function should be called only once on the first table + * @table: pointer to sg_table to free + */ +static inline void _devcd_free_sgtable(struct scatterlist *table) +{ + int i; + struct page *page; + struct scatterlist *iter; + struct scatterlist *delete_iter; + + /* free pages */ + iter = table; + for_each_sg(table, iter, sg_nents(table), i) { + page = sg_page(iter); + if (page) + __free_page(page); + } + + /* then free all chained tables */ + iter = table; + delete_iter = table; /* always points on a head of a table */ + while (!sg_is_last(iter)) { + iter++; + if (sg_is_chain(iter)) { + iter = sg_chain_ptr(iter); + kfree(delete_iter); + delete_iter = iter; + } + } + + /* free the last table */ + kfree(delete_iter); +} + + #ifdef CONFIG_DEV_COREDUMP -void dev_coredumpv(struct device *dev, const void *data, size_t datalen, +void dev_coredumpv(struct device *dev, void *data, size_t datalen, gfp_t gfp); void dev_coredumpm(struct device *dev, struct module *owner, - const void *data, size_t datalen, gfp_t gfp, + void *data, size_t datalen, gfp_t gfp, ssize_t (*read)(char *buffer, loff_t offset, size_t count, - const void *data, size_t datalen), - void (*free)(const void *data)); + void *data, size_t datalen), + void (*free)(void *data)); + +void dev_coredumpsg(struct device *dev, struct scatterlist *table, + size_t datalen, gfp_t gfp); #else static inline void dev_coredumpv(struct device *dev, void *data, size_t datalen, gfp_t gfp) { vfree(data); @@ -23,13 +87,19 @@ static inline void dev_coredumpv(struct device *dev, void *data, static inline void dev_coredumpm(struct device *dev, struct module *owner, - const void *data, size_t datalen, gfp_t gfp, + void *data, size_t datalen, gfp_t gfp, ssize_t (*read)(char *buffer, loff_t offset, size_t count, - const void *data, size_t datalen), - void (*free)(const void *data)) + void *data, size_t datalen), + void (*free)(void *data)) { free(data); } + +static inline void dev_coredumpsg(struct device *dev, struct scatterlist *table, + size_t datalen, gfp_t gfp) +{ + _devcd_free_sgtable(table); +} #endif /* CONFIG_DEV_COREDUMP */ #endif /* __DEVCOREDUMP_H */ -- cgit v1.2.3 From c5cc2a0bc930f1ae00b198aeb752acc3bdd4d5a7 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Wed, 16 Mar 2016 21:54:55 +0200 Subject: clk: ti: dpll: add support for specifying max rate for DPLLs DPLLs typically have a maximum rate they can support, and this varies from DPLL to DPLL. Add support for the maximum rate value in the DPLL data struct, and also add a check for it in the DPLL round_rate function.
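A clock's DPLL data would then simply carry the cap, which omap2_dpll_round_rate() clamps the target rate to. A minimal sketch (the struct name and the 1 GHz figure are only examples):

	static struct dpll_data dpll_core_dd = {
		/* ... dividers, masks and register pointers as before ... */
		.min_divider	= 1,
		.max_divider	= 256,
		.max_rate	= 1000000000,	/* example per-DPLL cap, in Hz */
	};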
Signed-off-by: Tero Kristo Reviewed-by: Nishanth Menon Cc: Tomi Valkeinen Cc: Lokesh Vutla Signed-off-by: Stephen Boyd --- drivers/clk/ti/clkt_dpll.c | 3 +++ include/linux/clk/ti.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/ti/clkt_dpll.c b/drivers/clk/ti/clkt_dpll.c index 032c658a5f5e..b919fdfe8256 100644 --- a/drivers/clk/ti/clkt_dpll.c +++ b/drivers/clk/ti/clkt_dpll.c @@ -301,6 +301,9 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, dd = clk->dpll_data; + if (dd->max_rate && target_rate > dd->max_rate) + target_rate = dd->max_rate; + ref_rate = clk_hw_get_rate(dd->clk_ref); clk_name = clk_hw_get_name(hw); pr_debug("clock: %s: starting DPLL round_rate, target rate %lu\n", diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index dc5164a6df29..6110fe09ed18 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -37,6 +37,7 @@ * @last_rounded_n: cache of the last N result of omap2_dpll_round_rate() * @min_divider: minimum valid non-bypass divider value (actual) * @max_divider: maximum valid non-bypass divider value (actual) + * @max_rate: maximum clock rate for the DPLL * @modes: possible values of @enable_mask * @autoidle_reg: register containing the DPLL autoidle mode bitfield * @idlest_reg: register containing the DPLL idle status bitfield @@ -81,6 +82,7 @@ struct dpll_data { u8 last_rounded_n; u8 min_divider; u16 max_divider; + unsigned long max_rate; u8 modes; void __iomem *autoidle_reg; void __iomem *idlest_reg; -- cgit v1.2.3 From af8a41271b56f6d79cb4d7c7f3ca688a2d97a801 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Fri, 15 Apr 2016 16:59:38 +0200 Subject: iio:adis: Add support for manual self-test flag clear Some variants of the devices from the ADIS family don't auto-clear the self-test bit after the self-test has completed. Instead we have to clear it manually. Add support for this to the ADIS library. Signed-off-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron --- drivers/iio/imu/adis.c | 7 ++++++- include/linux/iio/imu/adis.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/iio/imu/adis.c b/drivers/iio/imu/adis.c index 911255d41c1a..ad6f91d06185 100644 --- a/drivers/iio/imu/adis.c +++ b/drivers/iio/imu/adis.c @@ -324,7 +324,12 @@ static int adis_self_test(struct adis *adis) msleep(adis->data->startup_delay); - return adis_check_status(adis); + ret = adis_check_status(adis); + + if (adis->data->self_test_no_autoclear) + adis_write_reg_16(adis, adis->data->msc_ctrl_reg, 0x00); + + return ret; } /** diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index fa2d01ef8f55..360da7d18a3d 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -41,6 +41,7 @@ struct adis_data { unsigned int diag_stat_reg; unsigned int self_test_mask; + bool self_test_no_autoclear; unsigned int startup_delay; const char * const *status_error_msgs; -- cgit v1.2.3 From 20147f0d4f50f6f0d1fbe1815fe3d4d0a6444a70 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Tue, 29 Mar 2016 17:22:26 +0800 Subject: mfd: axp20x: Add support for AXP809 PMIC The X-Powers AXP809 is a new PMIC that is paired with Allwinner's A80 SoC, along with a slave AXP806 PMIC. This PMIC is quite similar to the earlier AXP223, though the interrupts and regulators have changed a bit. This patch adds support for the interrupts and power button of the PMIC.
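The power button cell then looks up its interrupts by the resource names declared for it; roughly, in the axp20x-pek child driver's probe (names taken from axp809_pek_resources below):

	int irq_dbr = platform_get_irq_byname(pdev, "PEK_DBR");
	int irq_dbf = platform_get_irq_byname(pdev, "PEK_DBF");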
Signed-off-by: Chen-Yu Tsai Signed-off-by: Lee Jones --- drivers/mfd/axp20x-rsb.c | 1 + drivers/mfd/axp20x.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/axp20x.h | 59 ++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/axp20x-rsb.c b/drivers/mfd/axp20x-rsb.c index 28c20247c112..a407527bcd09 100644 --- a/drivers/mfd/axp20x-rsb.c +++ b/drivers/mfd/axp20x-rsb.c @@ -61,6 +61,7 @@ static int axp20x_rsb_remove(struct sunxi_rsb_device *rdev) static const struct of_device_id axp20x_rsb_of_match[] = { { .compatible = "x-powers,axp223", .data = (void *)AXP223_ID }, + { .compatible = "x-powers,axp809", .data = (void *)AXP809_ID }, { }, }; MODULE_DEVICE_TABLE(of, axp20x_rsb_of_match); diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c index a57d6e940610..1ce923277cc8 100644 --- a/drivers/mfd/axp20x.c +++ b/drivers/mfd/axp20x.c @@ -37,6 +37,7 @@ static const char * const axp20x_model_names[] = { "AXP221", "AXP223", "AXP288", + "AXP809", }; static const struct regmap_range axp152_writeable_ranges[] = { @@ -85,6 +86,7 @@ static const struct regmap_access_table axp20x_volatile_table = { .n_yes_ranges = ARRAY_SIZE(axp20x_volatile_ranges), }; +/* AXP22x ranges are shared with the AXP809, as they cover the same range */ static const struct regmap_range axp22x_writeable_ranges[] = { regmap_reg_range(AXP20X_DATACACHE(0), AXP20X_IRQ5_STATE), regmap_reg_range(AXP20X_DCDC_MODE, AXP22X_BATLOW_THRES1), @@ -211,6 +213,20 @@ static struct resource axp288_fuel_gauge_resources[] = { }, }; +static struct resource axp809_pek_resources[] = { + { + .name = "PEK_DBR", + .start = AXP809_IRQ_PEK_RIS_EDGE, + .end = AXP809_IRQ_PEK_RIS_EDGE, + .flags = IORESOURCE_IRQ, + }, { + .name = "PEK_DBF", + .start = AXP809_IRQ_PEK_FAL_EDGE, + .end = AXP809_IRQ_PEK_FAL_EDGE, + .flags = IORESOURCE_IRQ, + }, +}; + static const struct regmap_config axp152_regmap_config = { .reg_bits = 8, .val_bits = 8, @@ -378,6 +394,41 @@ static const struct regmap_irq axp288_regmap_irqs[] = { INIT_REGMAP_IRQ(AXP288, BC_USB_CHNG, 5, 1), }; +static const struct regmap_irq axp809_regmap_irqs[] = { + INIT_REGMAP_IRQ(AXP809, ACIN_OVER_V, 0, 7), + INIT_REGMAP_IRQ(AXP809, ACIN_PLUGIN, 0, 6), + INIT_REGMAP_IRQ(AXP809, ACIN_REMOVAL, 0, 5), + INIT_REGMAP_IRQ(AXP809, VBUS_OVER_V, 0, 4), + INIT_REGMAP_IRQ(AXP809, VBUS_PLUGIN, 0, 3), + INIT_REGMAP_IRQ(AXP809, VBUS_REMOVAL, 0, 2), + INIT_REGMAP_IRQ(AXP809, VBUS_V_LOW, 0, 1), + INIT_REGMAP_IRQ(AXP809, BATT_PLUGIN, 1, 7), + INIT_REGMAP_IRQ(AXP809, BATT_REMOVAL, 1, 6), + INIT_REGMAP_IRQ(AXP809, BATT_ENT_ACT_MODE, 1, 5), + INIT_REGMAP_IRQ(AXP809, BATT_EXIT_ACT_MODE, 1, 4), + INIT_REGMAP_IRQ(AXP809, CHARG, 1, 3), + INIT_REGMAP_IRQ(AXP809, CHARG_DONE, 1, 2), + INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_HIGH, 2, 7), + INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_HIGH_END, 2, 6), + INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_LOW, 2, 5), + INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_LOW_END, 2, 4), + INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_HIGH, 2, 3), + INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_HIGH_END, 2, 2), + INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_LOW, 2, 1), + INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_LOW_END, 2, 0), + INIT_REGMAP_IRQ(AXP809, DIE_TEMP_HIGH, 3, 7), + INIT_REGMAP_IRQ(AXP809, LOW_PWR_LVL1, 3, 1), + INIT_REGMAP_IRQ(AXP809, LOW_PWR_LVL2, 3, 0), + INIT_REGMAP_IRQ(AXP809, TIMER, 4, 7), + INIT_REGMAP_IRQ(AXP809, PEK_RIS_EDGE, 4, 6), + INIT_REGMAP_IRQ(AXP809, PEK_FAL_EDGE, 4, 5), + INIT_REGMAP_IRQ(AXP809, PEK_SHORT, 4, 4), + INIT_REGMAP_IRQ(AXP809, 
PEK_LONG, 4, 3), + INIT_REGMAP_IRQ(AXP809, PEK_OVER_OFF, 4, 2), + INIT_REGMAP_IRQ(AXP809, GPIO1_INPUT, 4, 1), + INIT_REGMAP_IRQ(AXP809, GPIO0_INPUT, 4, 0), +}; + static const struct regmap_irq_chip axp152_regmap_irq_chip = { .name = "axp152_irq_chip", .status_base = AXP152_IRQ1_STATE, @@ -428,6 +479,18 @@ static const struct regmap_irq_chip axp288_regmap_irq_chip = { }; +static const struct regmap_irq_chip axp809_regmap_irq_chip = { + .name = "axp809", + .status_base = AXP20X_IRQ1_STATE, + .ack_base = AXP20X_IRQ1_STATE, + .mask_base = AXP20X_IRQ1_EN, + .mask_invert = true, + .init_ack_masked = true, + .irqs = axp809_regmap_irqs, + .num_irqs = ARRAY_SIZE(axp809_regmap_irqs), + .num_regs = 5, +}; + static struct mfd_cell axp20x_cells[] = { { .name = "axp20x-pek", @@ -572,6 +635,16 @@ static struct mfd_cell axp288_cells[] = { }, }; +static struct mfd_cell axp809_cells[] = { + { + .name = "axp20x-pek", + .num_resources = ARRAY_SIZE(axp809_pek_resources), + .resources = axp809_pek_resources, + }, { + .name = "axp20x-regulator", + }, +}; + static struct axp20x_dev *axp20x_pm_power_off; static void axp20x_power_off(void) { @@ -631,6 +704,12 @@ int axp20x_match_device(struct axp20x_dev *axp20x) axp20x->regmap_cfg = &axp288_regmap_config; axp20x->regmap_irq_chip = &axp288_regmap_irq_chip; break; + case AXP809_ID: + axp20x->nr_cells = ARRAY_SIZE(axp809_cells); + axp20x->cells = axp809_cells; + axp20x->regmap_cfg = &axp22x_regmap_config; + axp20x->regmap_irq_chip = &axp809_regmap_irq_chip; + break; default: dev_err(dev, "unsupported AXP20X ID %lu\n", axp20x->variant); return -EINVAL; diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index d82e7d51372b..0be4982f08fe 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -20,6 +20,7 @@ enum { AXP221_ID, AXP223_ID, AXP288_ID, + AXP809_ID, NR_AXP20X_VARIANTS, }; @@ -264,6 +265,29 @@ enum { AXP22X_REG_ID_MAX, }; +enum { + AXP809_DCDC1 = 0, + AXP809_DCDC2, + AXP809_DCDC3, + AXP809_DCDC4, + AXP809_DCDC5, + AXP809_DC1SW, + AXP809_DC5LDO, + AXP809_ALDO1, + AXP809_ALDO2, + AXP809_ALDO3, + AXP809_ELDO1, + AXP809_ELDO2, + AXP809_ELDO3, + AXP809_DLDO1, + AXP809_DLDO2, + AXP809_RTC_LDO, + AXP809_LDO_IO0, + AXP809_LDO_IO1, + AXP809_SW, + AXP809_REG_ID_MAX, +}; + /* IRQs */ enum { AXP152_IRQ_LDO0IN_CONNECT = 1, @@ -390,6 +414,41 @@ enum axp288_irqs { AXP288_IRQ_BC_USB_CHNG, }; +enum axp809_irqs { + AXP809_IRQ_ACIN_OVER_V = 1, + AXP809_IRQ_ACIN_PLUGIN, + AXP809_IRQ_ACIN_REMOVAL, + AXP809_IRQ_VBUS_OVER_V, + AXP809_IRQ_VBUS_PLUGIN, + AXP809_IRQ_VBUS_REMOVAL, + AXP809_IRQ_VBUS_V_LOW, + AXP809_IRQ_BATT_PLUGIN, + AXP809_IRQ_BATT_REMOVAL, + AXP809_IRQ_BATT_ENT_ACT_MODE, + AXP809_IRQ_BATT_EXIT_ACT_MODE, + AXP809_IRQ_CHARG, + AXP809_IRQ_CHARG_DONE, + AXP809_IRQ_BATT_CHG_TEMP_HIGH, + AXP809_IRQ_BATT_CHG_TEMP_HIGH_END, + AXP809_IRQ_BATT_CHG_TEMP_LOW, + AXP809_IRQ_BATT_CHG_TEMP_LOW_END, + AXP809_IRQ_BATT_ACT_TEMP_HIGH, + AXP809_IRQ_BATT_ACT_TEMP_HIGH_END, + AXP809_IRQ_BATT_ACT_TEMP_LOW, + AXP809_IRQ_BATT_ACT_TEMP_LOW_END, + AXP809_IRQ_DIE_TEMP_HIGH, + AXP809_IRQ_LOW_PWR_LVL1, + AXP809_IRQ_LOW_PWR_LVL2, + AXP809_IRQ_TIMER, + AXP809_IRQ_PEK_RIS_EDGE, + AXP809_IRQ_PEK_FAL_EDGE, + AXP809_IRQ_PEK_SHORT, + AXP809_IRQ_PEK_LONG, + AXP809_IRQ_PEK_OVER_OFF, + AXP809_IRQ_GPIO1_INPUT, + AXP809_IRQ_GPIO0_INPUT, +}; + #define AXP288_TS_ADC_H 0x58 #define AXP288_TS_ADC_L 0x59 #define AXP288_GP_ADC_H 0x5a -- cgit v1.2.3 From a8f447be8056d9ce17bf7757d6de79426700bb8b Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Fri, 8 Apr 2016 00:12:55 +0530 
Subject: mfd: Add resource managed APIs for mfd_add_devices Add a resource managed API devm_mfd_add_devices() for mfd_add_devices(). This helps reduce code in the error path, as it is no longer necessary to call mfd_remove_devices() explicitly to remove all child devices. In some cases it also removes the need to implement a .remove() callback, which gets called during driver unbind. Signed-off-by: Laxman Dewangan Signed-off-by: Lee Jones --- drivers/mfd/mfd-core.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/mfd/core.h | 4 ++++ 2 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c index 409da01effcd..4b4c1d4f3280 100644 --- a/drivers/mfd/mfd-core.c +++ b/drivers/mfd/mfd-core.c @@ -334,6 +334,44 @@ void mfd_remove_devices(struct device *parent) } EXPORT_SYMBOL(mfd_remove_devices); +static void devm_mfd_dev_release(struct device *dev, void *res) +{ + mfd_remove_devices(dev); +} + +/** + * devm_mfd_add_devices - Resource managed version of mfd_add_devices() + * + * Returns 0 on success or an appropriate negative error number on failure. + * All child-devices of the MFD will automatically be removed when it gets + * unbound. + */ +int devm_mfd_add_devices(struct device *dev, int id, + const struct mfd_cell *cells, int n_devs, + struct resource *mem_base, + int irq_base, struct irq_domain *domain) +{ + struct device **ptr; + int ret; + + ptr = devres_alloc(devm_mfd_dev_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + ret = mfd_add_devices(dev, id, cells, n_devs, mem_base, + irq_base, domain); + if (ret < 0) { + devres_free(ptr); + return ret; + } + + *ptr = dev; + devres_add(dev, ptr); + + return ret; +} +EXPORT_SYMBOL(devm_mfd_add_devices); + int mfd_clone_cell(const char *cell, const char **clones, size_t n_clones) { struct mfd_cell cell_entry; diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h index bc6f7e00fb3d..4a0268afe546 100644 --- a/include/linux/mfd/core.h +++ b/include/linux/mfd/core.h @@ -131,4 +131,8 @@ static inline int mfd_add_hotplug_devices(struct device *parent, extern void mfd_remove_devices(struct device *parent); +extern int devm_mfd_add_devices(struct device *dev, int id, + const struct mfd_cell *cells, int n_devs, + struct resource *mem_base, + int irq_base, struct irq_domain *irq_domain); #endif -- cgit v1.2.3 From 679ca39fc670a5a95c2b40d2cc8cf2cee2486f7a Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 18 Apr 2016 16:53:39 +0900 Subject: usb: gadget: udc: core: add usb_gadget_{un}map_request_by_dev() In the following environment, the first argument of the DMA API should be set to the DMAC's device structure, not the UDC controller's: - A UDC controller needs an external DMAC device (like a DMA Engine). - The external DMAC enables an IOMMU. So, this patch adds the usb_gadget_{un}map_request_by_dev() APIs, which let a UDC controller driver pass the DMAC's device structure, as sketched below.
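A rough usage sketch (not part of the patch; "dmac_dev" is a hypothetical pointer to the external DMAC's struct device, "req" a struct usb_request, and the IN direction is just an example):

	/*
	 * Map the request against the device that actually masters the
	 * bus, so any IOMMU mapping is created for the DMAC rather than
	 * for the UDC's parent device.
	 */
	ret = usb_gadget_map_request_by_dev(dmac_dev, req, 1);
	if (ret)
		return ret;

	/* ... hand the request over to the external DMA engine ... */

	usb_gadget_unmap_request_by_dev(dmac_dev, req, 1);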
Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/udc-core.c | 24 ++++++++++++++++++------ include/linux/usb/gadget.h | 4 ++++ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/gadget/udc/udc-core.c b/drivers/usb/gadget/udc/udc-core.c index c6e76465065a..6e8300d6a737 100644 --- a/drivers/usb/gadget/udc/udc-core.c +++ b/drivers/usb/gadget/udc/udc-core.c @@ -61,11 +61,9 @@ static int udc_bind_to_driver(struct usb_udc *udc, #ifdef CONFIG_HAS_DMA -int usb_gadget_map_request(struct usb_gadget *gadget, +int usb_gadget_map_request_by_dev(struct device *dev, struct usb_request *req, int is_in) { - struct device *dev = gadget->dev.parent; - if (req->length == 0) return 0; @@ -92,24 +90,38 @@ int usb_gadget_map_request(struct usb_gadget *gadget, return 0; } +EXPORT_SYMBOL_GPL(usb_gadget_map_request_by_dev); + +int usb_gadget_map_request(struct usb_gadget *gadget, + struct usb_request *req, int is_in) +{ + return usb_gadget_map_request_by_dev(gadget->dev.parent, req, is_in); +} EXPORT_SYMBOL_GPL(usb_gadget_map_request); -void usb_gadget_unmap_request(struct usb_gadget *gadget, +void usb_gadget_unmap_request_by_dev(struct device *dev, struct usb_request *req, int is_in) { if (req->length == 0) return; if (req->num_mapped_sgs) { - dma_unmap_sg(gadget->dev.parent, req->sg, req->num_mapped_sgs, + dma_unmap_sg(dev, req->sg, req->num_mapped_sgs, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); req->num_mapped_sgs = 0; } else { - dma_unmap_single(gadget->dev.parent, req->dma, req->length, + dma_unmap_single(dev, req->dma, req->length, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); } } +EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev); + +void usb_gadget_unmap_request(struct usb_gadget *gadget, + struct usb_request *req, int is_in) +{ + usb_gadget_unmap_request_by_dev(gadget->dev.parent, req, is_in); +} EXPORT_SYMBOL_GPL(usb_gadget_unmap_request); #endif /* CONFIG_HAS_DMA */ diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 5d4e151c49bf..457651bf45b0 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -1223,9 +1223,13 @@ int usb_otg_descriptor_init(struct usb_gadget *gadget, /* utility to simplify map/unmap of usb_requests to/from DMA */ +extern int usb_gadget_map_request_by_dev(struct device *dev, + struct usb_request *req, int is_in); extern int usb_gadget_map_request(struct usb_gadget *gadget, struct usb_request *req, int is_in); +extern void usb_gadget_unmap_request_by_dev(struct device *dev, + struct usb_request *req, int is_in); extern void usb_gadget_unmap_request(struct usb_gadget *gadget, struct usb_request *req, int is_in); -- cgit v1.2.3 From 7ef224d1d0e3a1ade02d02c01ce1dcffb736d2c3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:42 -0600 Subject: tracing: Add 'hist' event trigger command 'hist' triggers allow users to continually aggregate trace events, which can then be viewed afterwards by simply reading a 'hist' file containing the aggregation in a human-readable format. The basic idea is very simple and boils down to a mechanism whereby trace events, rather than being exhaustively dumped in raw form and viewed directly, are automatically 'compressed' into meaningful tables completely defined by the user. This is done strictly via single-line command-line commands and without the aid of any kind of programming language or interpreter. A surprising number of typical use cases can be accomplished by users via this simple mechanism. 
In fact, a large number of the tasks that users typically do using the more complicated script-based tracing tools, at least during the initial stages of an investigation, can be accomplished by simply specifying a set of keys and values to be used in the creation of a hash table. The Linux kernel trace event subsystem happens to provide an extensive list of keys and values ready-made for such a purpose in the form of the event format files associated with each trace event. By simply consulting the format file for field names of interest and by plugging them into the hist trigger command, users can create an endless number of useful aggregations to help with investigating various properties of the system. See Documentation/trace/events.txt for examples. hist triggers are implemented on top of the existing event trigger infrastructure, and as such are consistent with the existing triggers from a user's perspective as well. The basic syntax follows the existing trigger syntax. Users start an aggregation by writing a 'hist' trigger to the event of interest's trigger file: # echo hist:keys=xxx [ if filter] > event/trigger Once a hist trigger has been set up, by default it continually aggregates every matching event into a hash table using the event key and a value field named 'hitcount'. To view the aggregation at any point in time, simply read the 'hist' file in the same directory as the 'trigger' file: # cat event/hist The detailed syntax provides additional options for user control, and is described exhaustively in Documentation/trace/events.txt and in the virtual tracing/README file in the tracing subsystem. Link: http://lkml.kernel.org/r/72d263b5e1853fe9c314953b65833c3aa75479f2.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 1 + kernel/trace/Kconfig | 16 + kernel/trace/Makefile | 1 + kernel/trace/trace.c | 17 + kernel/trace/trace.h | 7 + kernel/trace/trace_events.c | 4 + kernel/trace/trace_events_hist.c | 849 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_trigger.c | 1 + 8 files changed, 896 insertions(+) create mode 100644 kernel/trace/trace_events_hist.c (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0810f81b6db2..404603720650 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -407,6 +407,7 @@ enum event_trigger_type { ETT_SNAPSHOT = (1 << 1), ETT_STACKTRACE = (1 << 2), ETT_EVENT_ENABLE = (1 << 3), + ETT_EVENT_HIST = (1 << 4), }; extern int filter_match_preds(struct event_filter *filter, void *rec); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d39556fd863a..fafeaf803bd0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -538,6 +538,22 @@ config TRACING_MAP generally used outside of that context, and is normally selected by tracers that use it. +config HIST_TRIGGERS + bool "Histogram triggers" + depends on ARCH_HAVE_NMI_SAFE_CMPXCHG + select TRACING_MAP + default n + help + Hist triggers allow one or more arbitrary trace event fields + to be aggregated into hash tables and dumped to stdout by + reading a debugfs/tracefs file. They're useful for + gathering quick and dirty (though precise) summaries of + event activity as an initial guide for further investigation + using more advanced tools. + + See Documentation/trace/events.txt. + If in doubt, say N. 
+ config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 4255c4057aaa..979e7bfbde7a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0d12dbde8399..6cf8fd03b028 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3812,6 +3812,9 @@ static const char readme_msg[] = #endif #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" +#endif +#ifdef CONFIG_HIST_TRIGGERS + "\t\t hist (see below)\n" #endif "\t example: echo traceoff > events/block/block_unplug/trigger\n" "\t echo traceoff:3 > events/block/block_unplug/trigger\n" @@ -3828,6 +3831,20 @@ static const char readme_msg[] = "\t To remove a trigger with a count:\n" "\t echo '!:0 > //trigger\n" "\t Filters can be ignored when removing a trigger.\n" +#ifdef CONFIG_HIST_TRIGGERS + " hist trigger\t- If set, event hits are aggregated into a hash table\n" + "\t Format: hist:keys=\n" + "\t [:size=#entries]\n" + "\t [if ]\n\n" + "\t When a matching event is hit, an entry is added to a hash\n" + "\t table using the key named, and the value of a sum called\n" + "\t 'hitcount' is incremented. Keys correspond to fields in the\n" + "\t event's format description. Keys can be any field. The\n" + "\t 'size' parameter can be used to specify more or fewer than\n" + "\t the default 2048 entries for the hashtable size.\n\n" + "\t Reading the 'hist' file for the event will dump the hash\n" + "\t table in its entirety to stdout." 
+#endif ; static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2525042760e6..505f8a45f426 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1162,6 +1162,13 @@ extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; +extern const struct file_operations event_hist_fops; + +#ifdef CONFIG_HIST_TRIGGERS +extern int register_trigger_hist_cmd(void); +#else +static inline int register_trigger_hist_cmd(void) { return 0; } +#endif extern int register_trigger_cmds(void); extern void clear_event_triggers(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index add81dff7520..e7cb983ee93c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2141,6 +2141,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) trace_create_file("trigger", 0644, file->dir, file, &event_trigger_fops); +#ifdef CONFIG_HIST_TRIGGERS + trace_create_file("hist", 0444, file->dir, file, + &event_hist_fops); +#endif trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c new file mode 100644 index 000000000000..23b45e462117 --- /dev/null +++ b/kernel/trace/trace_events_hist.c @@ -0,0 +1,849 @@ +/* + * trace_events_hist - trace event hist triggers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Copyright (C) 2015 Tom Zanussi + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> + +#include "tracing_map.h" +#include "trace.h" + +struct hist_field; + +typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); + +struct hist_field { + struct ftrace_event_field *field; + unsigned long flags; + hist_field_fn_t fn; + unsigned int size; +}; + +static u64 hist_field_counter(struct hist_field *field, void *event) +{ + return 1; +} + +static u64 hist_field_string(struct hist_field *hist_field, void *event) +{ + char *addr = (char *)(event + hist_field->field->offset); + + return (u64)(unsigned long)addr; +} + +#define DEFINE_HIST_FIELD_FN(type) \ +static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ +{ \ + type *addr = (type *)(event + hist_field->field->offset); \ + \ + return (u64)*addr; \ +} + +DEFINE_HIST_FIELD_FN(s64); +DEFINE_HIST_FIELD_FN(u64); +DEFINE_HIST_FIELD_FN(s32); +DEFINE_HIST_FIELD_FN(u32); +DEFINE_HIST_FIELD_FN(s16); +DEFINE_HIST_FIELD_FN(u16); +DEFINE_HIST_FIELD_FN(s8); +DEFINE_HIST_FIELD_FN(u8); + +#define for_each_hist_field(i, hist_data) \ + for ((i) = 0; (i) < (hist_data)->n_fields; (i)++) + +#define for_each_hist_val_field(i, hist_data) \ + for ((i) = 0; (i) < (hist_data)->n_vals; (i)++) + +#define for_each_hist_key_field(i, hist_data) \ + for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) + +#define HITCOUNT_IDX 0 +#define HIST_KEY_MAX 1 +#define HIST_KEY_SIZE_MAX MAX_FILTER_STR_VAL + +enum hist_field_flags { + HIST_FIELD_FL_HITCOUNT = 1, + HIST_FIELD_FL_KEY = 2, + HIST_FIELD_FL_STRING = 4, +}; + +struct hist_trigger_attrs { + char *keys_str; + unsigned int map_bits; +}; + +struct hist_trigger_data { + struct hist_field *fields[TRACING_MAP_FIELDS_MAX]; + unsigned int n_vals; + unsigned int n_keys; + unsigned int n_fields; + unsigned int key_size; + struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; + unsigned int n_sort_keys; + struct trace_event_file *event_file; + struct hist_trigger_attrs *attrs; + struct tracing_map *map; +}; + +static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) +{ + hist_field_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (field_is_signed) + fn = hist_field_s64; + else + fn = hist_field_u64; + break; + case 4: + if (field_is_signed) + fn = hist_field_s32; + else + fn = hist_field_u32; + break; + case 2: + if (field_is_signed) + fn = hist_field_s16; + else + fn = hist_field_u16; + break; + case 1: + if (field_is_signed) + fn = hist_field_s8; + else + fn = hist_field_u8; + break; + } + + return fn; +} + +static int parse_map_size(char *str) +{ + unsigned long size, map_bits; + int ret; + + strsep(&str, "="); + if (!str) { + ret = -EINVAL; + goto out; + } + + ret = kstrtoul(str, 0, &size); + if (ret) + goto out; + + map_bits = ilog2(roundup_pow_of_two(size)); + if (map_bits < TRACING_MAP_BITS_MIN || + map_bits > TRACING_MAP_BITS_MAX) + ret = -EINVAL; + else + ret = map_bits; + out: + return ret; +} + +static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) +{ + if (!attrs) + return; + + kfree(attrs->keys_str); + kfree(attrs); +} + +static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) +{ + struct hist_trigger_attrs *attrs; + int ret = 0; + + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return ERR_PTR(-ENOMEM); + + while (trigger_str) { + char *str = strsep(&trigger_str, ":"); + + if ((strncmp(str, "key=", strlen("key=")) == 0) || + (strncmp(str, "keys=", strlen("keys=")) == 0)) + 
attrs->keys_str = kstrdup(str, GFP_KERNEL); + else if (strncmp(str, "size=", strlen("size=")) == 0) { + int map_bits = parse_map_size(str); + + if (map_bits < 0) { + ret = map_bits; + goto free; + } + attrs->map_bits = map_bits; + } else { + ret = -EINVAL; + goto free; + } + } + + if (!attrs->keys_str) { + ret = -EINVAL; + goto free; + } + + return attrs; + free: + destroy_hist_trigger_attrs(attrs); + + return ERR_PTR(ret); +} + +static void destroy_hist_field(struct hist_field *hist_field) +{ + kfree(hist_field); +} + +static struct hist_field *create_hist_field(struct ftrace_event_field *field, + unsigned long flags) +{ + struct hist_field *hist_field; + + if (field && is_function_field(field)) + return NULL; + + hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + if (!hist_field) + return NULL; + + if (flags & HIST_FIELD_FL_HITCOUNT) { + hist_field->fn = hist_field_counter; + goto out; + } + + if (is_string_field(field)) { + flags |= HIST_FIELD_FL_STRING; + hist_field->fn = hist_field_string; + } else { + hist_field->fn = select_value_fn(field->size, + field->is_signed); + if (!hist_field->fn) { + destroy_hist_field(hist_field); + return NULL; + } + } + out: + hist_field->field = field; + hist_field->flags = flags; + + return hist_field; +} + +static void destroy_hist_fields(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { + if (hist_data->fields[i]) { + destroy_hist_field(hist_data->fields[i]); + hist_data->fields[i] = NULL; + } + } +} + +static int create_hitcount_val(struct hist_trigger_data *hist_data) +{ + hist_data->fields[HITCOUNT_IDX] = + create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT); + if (!hist_data->fields[HITCOUNT_IDX]) + return -ENOMEM; + + hist_data->n_vals++; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + return -EINVAL; + + return 0; +} + +static int create_val_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + int ret; + + ret = create_hitcount_val(hist_data); + + return ret; +} + +static int create_key_field(struct hist_trigger_data *hist_data, + unsigned int key_idx, + struct trace_event_file *file, + char *field_str) +{ + struct ftrace_event_field *field = NULL; + unsigned long flags = 0; + unsigned int key_size; + int ret = 0; + + if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) + return -EINVAL; + + flags |= HIST_FIELD_FL_KEY; + + field = trace_find_event_field(file->event_call, field_str); + if (!field) { + ret = -EINVAL; + goto out; + } + + key_size = field->size; + + hist_data->fields[key_idx] = create_hist_field(field, flags); + if (!hist_data->fields[key_idx]) { + ret = -ENOMEM; + goto out; + } + + key_size = ALIGN(key_size, sizeof(u64)); + hist_data->fields[key_idx]->size = key_size; + hist_data->key_size = key_size; + if (hist_data->key_size > HIST_KEY_SIZE_MAX) { + ret = -EINVAL; + goto out; + } + + hist_data->n_keys++; + + if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX)) + return -EINVAL; + + ret = key_size; + out: + return ret; +} + +static int create_key_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + unsigned int i, n_vals = hist_data->n_vals; + char *fields_str, *field_str; + int ret = -EINVAL; + + fields_str = hist_data->attrs->keys_str; + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) + goto out; + + for (i = n_vals; i < n_vals + HIST_KEY_MAX; i++) { + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + ret = create_key_field(hist_data, i, file, 
field_str); + if (ret < 0) + goto out; + } + if (fields_str) { + ret = -EINVAL; + goto out; + } + ret = 0; + out: + return ret; +} + +static int create_hist_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + int ret; + + ret = create_val_fields(hist_data, file); + if (ret) + goto out; + + ret = create_key_fields(hist_data, file); + if (ret) + goto out; + + hist_data->n_fields = hist_data->n_vals + hist_data->n_keys; + out: + return ret; +} + +static int create_sort_keys(struct hist_trigger_data *hist_data) +{ + int ret = 0; + + hist_data->n_sort_keys = 1; /* sort_keys[0] is always hitcount */ + + return ret; +} + +static void destroy_hist_data(struct hist_trigger_data *hist_data) +{ + destroy_hist_trigger_attrs(hist_data->attrs); + destroy_hist_fields(hist_data); + tracing_map_destroy(hist_data->map); + kfree(hist_data); +} + +static int create_tracing_map_fields(struct hist_trigger_data *hist_data) +{ + struct tracing_map *map = hist_data->map; + struct ftrace_event_field *field; + struct hist_field *hist_field; + unsigned int i, idx; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field->flags & HIST_FIELD_FL_KEY) { + tracing_map_cmp_fn_t cmp_fn; + + field = hist_field->field; + + if (is_string_field(field)) + cmp_fn = tracing_map_cmp_string; + else + cmp_fn = tracing_map_cmp_num(field->size, + field->is_signed); + idx = tracing_map_add_key_field(map, 0, cmp_fn); + } else + idx = tracing_map_add_sum_field(map); + + if (idx < 0) + return idx; + } + + return 0; +} + +static struct hist_trigger_data * +create_hist_data(unsigned int map_bits, + struct hist_trigger_attrs *attrs, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data; + int ret = 0; + + hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL); + if (!hist_data) + return ERR_PTR(-ENOMEM); + + hist_data->attrs = attrs; + + ret = create_hist_fields(hist_data, file); + if (ret) + goto free; + + ret = create_sort_keys(hist_data); + if (ret) + goto free; + + hist_data->map = tracing_map_create(map_bits, hist_data->key_size, + NULL, hist_data); + if (IS_ERR(hist_data->map)) { + ret = PTR_ERR(hist_data->map); + hist_data->map = NULL; + goto free; + } + + ret = create_tracing_map_fields(hist_data); + if (ret) + goto free; + + ret = tracing_map_init(hist_data->map); + if (ret) + goto free; + + hist_data->event_file = file; + out: + return hist_data; + free: + hist_data->attrs = NULL; + + destroy_hist_data(hist_data); + + hist_data = ERR_PTR(ret); + + goto out; +} + +static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + void *rec) +{ + struct hist_field *hist_field; + unsigned int i; + u64 hist_val; + + for_each_hist_val_field(i, hist_data) { + hist_field = hist_data->fields[i]; + hist_val = hist_field->fn(hist_field, rec); + tracing_map_update_sum(elt, i, hist_val); + } +} + +static void event_hist_trigger(struct event_trigger_data *data, void *rec) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct hist_field *key_field; + struct tracing_map_elt *elt; + u64 field_contents; + void *key = NULL; + unsigned int i; + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + field_contents = key_field->fn(key_field, rec); + if (key_field->flags & HIST_FIELD_FL_STRING) + key = (void *)(unsigned long)field_contents; + else + key = (void *)&field_contents; + } + + elt = tracing_map_insert(hist_data->map, key); + if (elt) + hist_trigger_elt_update(hist_data, elt, 
rec); +} + +static void +hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, void *key, + struct tracing_map_elt *elt) +{ + struct hist_field *key_field; + unsigned int i; + u64 uval; + + seq_puts(m, "{ "); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ", "); + + if (key_field->flags & HIST_FIELD_FL_STRING) { + seq_printf(m, "%s: %-50s", key_field->field->name, + (char *)key); + } else { + uval = *(u64 *)key; + seq_printf(m, "%s: %10llu", + key_field->field->name, uval); + } + } + + seq_puts(m, " }"); + + seq_printf(m, " hitcount: %10llu", + tracing_map_read_sum(elt, HITCOUNT_IDX)); + + seq_puts(m, "\n"); +} + +static int print_entries(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + struct tracing_map_sort_entry **sort_entries = NULL; + struct tracing_map *map = hist_data->map; + unsigned int i, n_entries; + + n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, + hist_data->n_sort_keys, + &sort_entries); + if (n_entries < 0) + return n_entries; + + for (i = 0; i < n_entries; i++) + hist_trigger_entry_print(m, hist_data, + sort_entries[i]->key, + sort_entries[i]->elt); + + tracing_map_destroy_sort_entries(sort_entries, n_entries); + + return n_entries; +} + +static int hist_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *test, *data = NULL; + struct trace_event_file *event_file; + struct hist_trigger_data *hist_data; + int n_entries, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry_rcu(test, &event_file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + data = test; + break; + } + } + if (!data) + goto out_unlock; + + seq_puts(m, "# event histogram\n#\n# trigger info: "); + data->ops->print(m, data->ops, data); + seq_puts(m, "\n"); + + hist_data = data->private_data; + n_entries = print_entries(m, hist_data); + if (n_entries < 0) { + ret = n_entries; + n_entries = 0; + } + + seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", + (u64)atomic64_read(&hist_data->map->hits), + n_entries, (u64)atomic64_read(&hist_data->map->drops)); + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +static int event_hist_open(struct inode *inode, struct file *file) +{ + return single_open(file, hist_show, file); +} + +const struct file_operations event_hist_fops = { + .open = event_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) +{ + seq_printf(m, "%s", hist_field->field->name); +} + +static int event_hist_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct hist_field *key_field; + unsigned int i; + + seq_puts(m, "hist:keys="); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ","); + + hist_field_print(m, key_field); + } + + seq_puts(m, ":vals="); + seq_puts(m, "hitcount"); + + seq_puts(m, ":sort="); + seq_puts(m, "hitcount"); + + seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); + + if (data->filter_str) + seq_printf(m, " if %s", data->filter_str); + + seq_puts(m, " [active]"); + + seq_putc(m, '\n'); + + return 0; +} + +static void 
event_hist_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + trigger_data_free(data); + destroy_hist_data(hist_data); + } +} + +static struct event_trigger_ops event_hist_trigger_ops = { + .func = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_trigger_init, + .free = event_hist_trigger_free, +}; + +static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, + char *param) +{ + return &event_hist_trigger_ops; +} + +static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + update_cond_flag(file); + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + update_cond_flag(file); + ret--; + } + out: + return ret; +} + +static int event_hist_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) +{ + unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; + struct event_trigger_data *trigger_data; + struct hist_trigger_attrs *attrs; + struct event_trigger_ops *trigger_ops; + struct hist_trigger_data *hist_data; + char *trigger; + int ret = 0; + + if (!param) + return -EINVAL; + + /* separate the trigger from the filter (k:v [if filter]) */ + trigger = strsep(¶m, " \t"); + if (!trigger) + return -EINVAL; + + attrs = parse_hist_trigger_attrs(trigger); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + + if (attrs->map_bits) + hist_trigger_bits = attrs->map_bits; + + hist_data = create_hist_data(hist_trigger_bits, attrs, file); + if (IS_ERR(hist_data)) { + destroy_hist_trigger_attrs(attrs); + return PTR_ERR(hist_data); + } + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out_free; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + + INIT_LIST_HEAD(&trigger_data->list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + trigger_data->private_data = hist_data; + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + ret = 0; + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + out_reg: + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of triggers registered, + * but if it didn't register any it returns zero. Consider no + * triggers registered a failure too. 
+ */ + if (!ret) { + ret = -ENOENT; + goto out_free; + } else if (ret < 0) + goto out_free; + /* Just return zero, not the number of registered triggers */ + ret = 0; + out: + return ret; + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + + kfree(trigger_data); + + destroy_hist_data(hist_data); + goto out; +} + +static struct event_command trigger_hist_cmd = { + .name = "hist", + .trigger_type = ETT_EVENT_HIST, + .flags = EVENT_CMD_FL_NEEDS_REC, + .func = event_hist_trigger_func, + .reg = hist_register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = event_hist_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +__init int register_trigger_hist_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_cmd); + WARN_ON(ret < 0); + + return ret; +} diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d67992f3bb0e..d29092afe005 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1447,6 +1447,7 @@ __init int register_trigger_cmds(void) register_trigger_snapshot_cmd(); register_trigger_stacktrace_cmd(); register_trigger_enable_disable_cmds(); + register_trigger_hist_cmd(); return 0; } -- cgit v1.2.3 From 97865fe41322d83dac4373fe0a0de5b1a1b318c5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 24 Mar 2016 14:18:05 +0100 Subject: iio: st_sensors: verify interrupt event to status This makes all ST sensor drivers check that they actually have new data available for the requested channel(s) before claiming an IRQ, by reading the status register (which is conveniently the same for all ST sensors) and checking that the channel has new data before proceeding to read it and fill the buffer. This way sensors can share an interrupt line: it can be flagged as shared, and then the sensor that did not fire will return IRQ_NONE, while the sensor that fired will handle the IRQ and return IRQ_HANDLED.
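In outline, the check added to st_sensors_trigger_handler() below behaves like this (a simplified sketch, with error handling elided; "status" is a local u8):

	/* Claim the IRQ only if our status register reports new data */
	sdata->tf->read_byte(&sdata->tb, sdata->dev,
			     sdata->sensor_settings->drdy_irq.addr_stat_drdy,
			     &status);
	if (!(status & (u8)indio_dev->active_scan_mask[0]))
		return IRQ_NONE;	/* not ours: let the sharing sensor check */
	/* ... read the channels and push them to the buffer ... */
	return IRQ_HANDLED;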
Cc: Giuseppe Barba Cc: Denis Ciocca Signed-off-by: Linus Walleij Signed-off-by: Jonathan Cameron --- drivers/iio/accel/st_accel_core.c | 5 +++++ drivers/iio/common/st_sensors/st_sensors_buffer.c | 18 ++++++++++++++++++ drivers/iio/gyro/st_gyro_core.c | 3 +++ drivers/iio/magnetometer/st_magn_core.c | 1 + drivers/iio/pressure/st_pressure_core.c | 2 ++ include/linux/iio/common/st_sensors.h | 3 +++ 6 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c index fee32e3d7a05..9fb6d35fce5b 100644 --- a/drivers/iio/accel/st_accel_core.c +++ b/drivers/iio/accel/st_accel_core.c @@ -332,6 +332,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int2 = ST_ACCEL_1_DRDY_IRQ_INT2_MASK, .addr_ihl = ST_ACCEL_1_IHL_IRQ_ADDR, .mask_ihl = ST_ACCEL_1_IHL_IRQ_MASK, + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_ACCEL_1_MULTIREAD_BIT, .bootime = 2, @@ -397,6 +398,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int2 = ST_ACCEL_2_DRDY_IRQ_INT2_MASK, .addr_ihl = ST_ACCEL_2_IHL_IRQ_ADDR, .mask_ihl = ST_ACCEL_2_IHL_IRQ_MASK, + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_ACCEL_2_MULTIREAD_BIT, .bootime = 2, @@ -474,6 +476,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int2 = ST_ACCEL_3_DRDY_IRQ_INT2_MASK, .addr_ihl = ST_ACCEL_3_IHL_IRQ_ADDR, .mask_ihl = ST_ACCEL_3_IHL_IRQ_MASK, + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, .ig1 = { .en_addr = ST_ACCEL_3_IG1_EN_ADDR, .en_mask = ST_ACCEL_3_IG1_EN_MASK, @@ -532,6 +535,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .drdy_irq = { .addr = ST_ACCEL_4_DRDY_IRQ_ADDR, .mask_int1 = ST_ACCEL_4_DRDY_IRQ_INT1_MASK, + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_ACCEL_4_MULTIREAD_BIT, .bootime = 2, /* guess */ @@ -583,6 +587,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int2 = ST_ACCEL_5_DRDY_IRQ_INT2_MASK, .addr_ihl = ST_ACCEL_5_IHL_IRQ_ADDR, .mask_ihl = ST_ACCEL_5_IHL_IRQ_MASK, + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_ACCEL_5_MULTIREAD_BIT, .bootime = 2, /* guess */ diff --git a/drivers/iio/common/st_sensors/st_sensors_buffer.c b/drivers/iio/common/st_sensors/st_sensors_buffer.c index 2ce0d2a3f855..c55898543a47 100644 --- a/drivers/iio/common/st_sensors/st_sensors_buffer.c +++ b/drivers/iio/common/st_sensors/st_sensors_buffer.c @@ -58,6 +58,24 @@ irqreturn_t st_sensors_trigger_handler(int irq, void *p) struct iio_dev *indio_dev = pf->indio_dev; struct st_sensor_data *sdata = iio_priv(indio_dev); + /* If we have a status register, check if this IRQ came from us */ + if (sdata->sensor_settings->drdy_irq.addr_stat_drdy) { + u8 status; + + len = sdata->tf->read_byte(&sdata->tb, sdata->dev, + sdata->sensor_settings->drdy_irq.addr_stat_drdy, + &status); + if (len < 0) + dev_err(sdata->dev, "could not read channel status\n"); + + /* + * If this was not caused by any channels on this sensor, + * return IRQ_NONE + */ + if (!(status & (u8)indio_dev->active_scan_mask[0])) + return IRQ_NONE; + } + len = st_sensors_get_buffer_element(indio_dev, sdata->buffer_data); if (len < 0) goto st_sensors_get_buffer_element_error; diff --git a/drivers/iio/gyro/st_gyro_core.c b/drivers/iio/gyro/st_gyro_core.c index 110f95b6e52f..be9057e89dc3 100644 --- a/drivers/iio/gyro/st_gyro_core.c +++ b/drivers/iio/gyro/st_gyro_core.c @@ -190,6 +190,7 
@@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { * drain settings, but only for INT1 and not * for the DRDY line on INT2. */ + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_GYRO_1_MULTIREAD_BIT, .bootime = 2, @@ -258,6 +259,7 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { * drain settings, but only for INT1 and not * for the DRDY line on INT2. */ + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_GYRO_2_MULTIREAD_BIT, .bootime = 2, @@ -322,6 +324,7 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { * drain settings, but only for INT1 and not * for the DRDY line on INT2. */ + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_GYRO_3_MULTIREAD_BIT, .bootime = 2, diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c index 501f858df413..62036d2a9956 100644 --- a/drivers/iio/magnetometer/st_magn_core.c +++ b/drivers/iio/magnetometer/st_magn_core.c @@ -484,6 +484,7 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = { .mask_int1 = ST_MAGN_3_DRDY_INT_MASK, .addr_ihl = ST_MAGN_3_IHL_IRQ_ADDR, .mask_ihl = ST_MAGN_3_IHL_IRQ_MASK, + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_MAGN_3_MULTIREAD_BIT, .bootime = 2, diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c index 172393ad34af..1cd37eaa4a57 100644 --- a/drivers/iio/pressure/st_pressure_core.c +++ b/drivers/iio/pressure/st_pressure_core.c @@ -226,6 +226,7 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .mask_int2 = ST_PRESS_LPS331AP_DRDY_IRQ_INT2_MASK, .addr_ihl = ST_PRESS_LPS331AP_IHL_IRQ_ADDR, .mask_ihl = ST_PRESS_LPS331AP_IHL_IRQ_MASK, + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_PRESS_LPS331AP_MULTIREAD_BIT, .bootime = 2, @@ -312,6 +313,7 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .mask_int2 = ST_PRESS_LPS25H_DRDY_IRQ_INT2_MASK, .addr_ihl = ST_PRESS_LPS25H_IHL_IRQ_ADDR, .mask_ihl = ST_PRESS_LPS25H_IHL_IRQ_MASK, + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_PRESS_LPS25H_MULTIREAD_BIT, .bootime = 2, diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 6670c3d25c58..d8da075bfda0 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -37,6 +37,7 @@ #define ST_SENSORS_DEFAULT_AXIS_ADDR 0x20 #define ST_SENSORS_DEFAULT_AXIS_MASK 0x07 #define ST_SENSORS_DEFAULT_AXIS_N_BIT 3 +#define ST_SENSORS_DEFAULT_STAT_ADDR 0x27 #define ST_SENSORS_MAX_NAME 17 #define ST_SENSORS_MAX_4WAI 7 @@ -121,6 +122,7 @@ struct st_sensor_bdu { * @mask_int2: mask to enable/disable IRQ on INT2 pin. * @addr_ihl: address to enable/disable active low on the INT lines. * @mask_ihl: mask to enable/disable active low on the INT lines. + * @addr_stat_drdy: address to read status of DRDY (data ready) interrupt * struct ig1 - represents the Interrupt Generator 1 of sensors. * @en_addr: address of the enable ig1 register. * @en_mask: mask to write the on/off value for enable. 
@@ -131,6 +133,7 @@ struct st_sensor_data_ready_irq { u8 mask_int2; u8 addr_ihl; u8 mask_ihl; + u8 addr_stat_drdy; struct { u8 en_addr; u8 en_mask; -- cgit v1.2.3 From 0e6f6871a1591f4bb0971809c45bc91a991f1967 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 14 Apr 2016 10:45:21 +0200 Subject: iio: st_sensors: support open drain mode Some types of ST Sensors can be connected to the same IRQ line as other peripherals using open drain. Add a device tree binding and a sensor data property to flip the right bit in the interrupt control register to enable open drain mode on the INT line. If the line is set to be open drain, also tag on IRQF_SHARED to the IRQ flags when requesting the interrupt, as the whole point of using open drain interrupt lines is to share them with more than one peripheral (wire-or). Cc: devicetree@vger.kernel.org Cc: Giuseppe Barba Cc: Denis Ciocca Acked-by: Rob Herring Signed-off-by: Linus Walleij Signed-off-by: Jonathan Cameron --- Documentation/devicetree/bindings/iio/st-sensors.txt | 4 ++++ drivers/iio/accel/st_accel_core.c | 8 ++++++++ drivers/iio/common/st_sensors/st_sensors_core.c | 20 ++++++++++++++++++++ drivers/iio/common/st_sensors/st_sensors_trigger.c | 13 +++++++++++++ drivers/iio/pressure/st_pressure_core.c | 8 ++++++++ include/linux/iio/common/st_sensors.h | 6 ++++++ include/linux/platform_data/st_sensors_pdata.h | 2 ++ 7 files changed, 61 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/iio/st-sensors.txt b/Documentation/devicetree/bindings/iio/st-sensors.txt index 71b7bdff21cd..637e283f4a8b 100644 --- a/Documentation/devicetree/bindings/iio/st-sensors.txt +++ b/Documentation/devicetree/bindings/iio/st-sensors.txt @@ -16,6 +16,10 @@ Optional properties: - st,drdy-int-pin: the pin on the package that will be used to signal "data ready" (valid values: 1 or 2). This property is not configurable on all sensors. +- drive-open-drain: the interrupt/data ready line will be configured + as open drain, which is useful if several sensors share the same + interrupt line. (This binding is taken from pinctrl/pinctrl-bindings.txt) + This is a boolean property. Sensors may also have applicable pin control settings, those use the standard bindings from pinctrl/pinctrl-bindings.txt. 
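On boards not described via device tree, the same behaviour can be requested through the open_drain flag this patch adds to struct st_sensors_platform_data (a sketch only; the variable name is illustrative):

	static struct st_sensors_platform_data board_accel_pdata = {
		.drdy_int_pin = 1,
		/* the DRDY line is wire-or'ed with another sensor */
		.open_drain = true,
	};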
diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c index 9fb6d35fce5b..dc73f2d85e6d 100644 --- a/drivers/iio/accel/st_accel_core.c +++ b/drivers/iio/accel/st_accel_core.c @@ -99,6 +99,8 @@ #define ST_ACCEL_2_DRDY_IRQ_INT2_MASK 0x10 #define ST_ACCEL_2_IHL_IRQ_ADDR 0x22 #define ST_ACCEL_2_IHL_IRQ_MASK 0x80 +#define ST_ACCEL_2_OD_IRQ_ADDR 0x22 +#define ST_ACCEL_2_OD_IRQ_MASK 0x40 #define ST_ACCEL_2_MULTIREAD_BIT true /* CUSTOM VALUES FOR SENSOR 3 */ @@ -180,6 +182,8 @@ #define ST_ACCEL_5_DRDY_IRQ_INT2_MASK 0x20 #define ST_ACCEL_5_IHL_IRQ_ADDR 0x22 #define ST_ACCEL_5_IHL_IRQ_MASK 0x80 +#define ST_ACCEL_5_OD_IRQ_ADDR 0x22 +#define ST_ACCEL_5_OD_IRQ_MASK 0x40 #define ST_ACCEL_5_IG1_EN_ADDR 0x21 #define ST_ACCEL_5_IG1_EN_MASK 0x08 #define ST_ACCEL_5_MULTIREAD_BIT false @@ -398,6 +402,8 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int2 = ST_ACCEL_2_DRDY_IRQ_INT2_MASK, .addr_ihl = ST_ACCEL_2_IHL_IRQ_ADDR, .mask_ihl = ST_ACCEL_2_IHL_IRQ_MASK, + .addr_od = ST_ACCEL_2_OD_IRQ_ADDR, + .mask_od = ST_ACCEL_2_OD_IRQ_MASK, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_ACCEL_2_MULTIREAD_BIT, @@ -587,6 +593,8 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int2 = ST_ACCEL_5_DRDY_IRQ_INT2_MASK, .addr_ihl = ST_ACCEL_5_IHL_IRQ_ADDR, .mask_ihl = ST_ACCEL_5_IHL_IRQ_MASK, + .addr_od = ST_ACCEL_5_OD_IRQ_ADDR, + .mask_od = ST_ACCEL_5_OD_IRQ_MASK, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_ACCEL_5_MULTIREAD_BIT, diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index f5a2d445d0c0..dffe00692169 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -301,6 +301,14 @@ static int st_sensors_set_drdy_int_pin(struct iio_dev *indio_dev, return -EINVAL; } + if (pdata->open_drain) { + if (!sdata->sensor_settings->drdy_irq.addr_od) + dev_err(&indio_dev->dev, + "open drain requested but unsupported.\n"); + else + sdata->int_pin_open_drain = true; + } + return 0; } @@ -321,6 +329,8 @@ static struct st_sensors_platform_data *st_sensors_of_probe(struct device *dev, else pdata->drdy_int_pin = defdata ? defdata->drdy_int_pin : 0; + pdata->open_drain = of_property_read_bool(np, "drive-open-drain"); + return pdata; } #else @@ -374,6 +384,16 @@ int st_sensors_init_sensor(struct iio_dev *indio_dev, return err; } + if (sdata->int_pin_open_drain) { + dev_info(&indio_dev->dev, + "set interrupt line to open drain mode\n"); + err = st_sensors_write_data_with_mask(indio_dev, + sdata->sensor_settings->drdy_irq.addr_od, + sdata->sensor_settings->drdy_irq.mask_od, 1); + if (err < 0) + return err; + } + err = st_sensors_set_axis_enable(indio_dev, ST_SENSORS_ENABLE_ALL_AXIS); return err; diff --git a/drivers/iio/common/st_sensors/st_sensors_trigger.c b/drivers/iio/common/st_sensors/st_sensors_trigger.c index 6a8c98327945..da72279fcf99 100644 --- a/drivers/iio/common/st_sensors/st_sensors_trigger.c +++ b/drivers/iio/common/st_sensors/st_sensors_trigger.c @@ -64,6 +64,19 @@ int st_sensors_allocate_trigger(struct iio_dev *indio_dev, "rising edge\n", irq_trig); irq_trig = IRQF_TRIGGER_RISING; } + + /* + * If the interrupt pin is Open Drain, by definition this + * means that the interrupt line may be shared with other + * peripherals. 
But to do this we also need to have a status + * register and mask to figure out if this sensor was firing + * the IRQ or not, so we can tell the interrupt handler that + * it was "our" interrupt. + */ + if (sdata->int_pin_open_drain && + sdata->sensor_settings->drdy_irq.addr_stat_drdy) + irq_trig |= IRQF_SHARED; + err = request_threaded_irq(irq, iio_trigger_generic_data_rdy_poll, NULL, diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c index 1cd37eaa4a57..9e9b72a8f18f 100644 --- a/drivers/iio/pressure/st_pressure_core.c +++ b/drivers/iio/pressure/st_pressure_core.c @@ -64,6 +64,8 @@ #define ST_PRESS_LPS331AP_DRDY_IRQ_INT2_MASK 0x20 #define ST_PRESS_LPS331AP_IHL_IRQ_ADDR 0x22 #define ST_PRESS_LPS331AP_IHL_IRQ_MASK 0x80 +#define ST_PRESS_LPS331AP_OD_IRQ_ADDR 0x22 +#define ST_PRESS_LPS331AP_OD_IRQ_MASK 0x40 #define ST_PRESS_LPS331AP_MULTIREAD_BIT true #define ST_PRESS_LPS331AP_TEMP_OFFSET 42500 @@ -104,6 +106,8 @@ #define ST_PRESS_LPS25H_DRDY_IRQ_INT2_MASK 0x10 #define ST_PRESS_LPS25H_IHL_IRQ_ADDR 0x22 #define ST_PRESS_LPS25H_IHL_IRQ_MASK 0x80 +#define ST_PRESS_LPS25H_OD_IRQ_ADDR 0x22 +#define ST_PRESS_LPS25H_OD_IRQ_MASK 0x40 #define ST_PRESS_LPS25H_MULTIREAD_BIT true #define ST_PRESS_LPS25H_TEMP_OFFSET 42500 #define ST_PRESS_LPS25H_OUT_XL_ADDR 0x28 @@ -226,6 +230,8 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .mask_int2 = ST_PRESS_LPS331AP_DRDY_IRQ_INT2_MASK, .addr_ihl = ST_PRESS_LPS331AP_IHL_IRQ_ADDR, .mask_ihl = ST_PRESS_LPS331AP_IHL_IRQ_MASK, + .addr_od = ST_PRESS_LPS331AP_OD_IRQ_ADDR, + .mask_od = ST_PRESS_LPS331AP_OD_IRQ_MASK, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_PRESS_LPS331AP_MULTIREAD_BIT, @@ -313,6 +319,8 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .mask_int2 = ST_PRESS_LPS25H_DRDY_IRQ_INT2_MASK, .addr_ihl = ST_PRESS_LPS25H_IHL_IRQ_ADDR, .mask_ihl = ST_PRESS_LPS25H_IHL_IRQ_MASK, + .addr_od = ST_PRESS_LPS25H_OD_IRQ_ADDR, + .mask_od = ST_PRESS_LPS25H_OD_IRQ_MASK, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, .multi_read_bit = ST_PRESS_LPS25H_MULTIREAD_BIT, diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index d8da075bfda0..d029ffac0d69 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -122,6 +122,8 @@ struct st_sensor_bdu { * @mask_int2: mask to enable/disable IRQ on INT2 pin. * @addr_ihl: address to enable/disable active low on the INT lines. * @mask_ihl: mask to enable/disable active low on the INT lines. + * @addr_od: address to enable/disable Open Drain on the INT lines. + * @mask_od: mask to enable/disable Open Drain on the INT lines. * @addr_stat_drdy: address to read status of DRDY (data ready) interrupt * struct ig1 - represents the Interrupt Generator 1 of sensors. * @en_addr: address of the enable ig1 register. @@ -133,6 +135,8 @@ struct st_sensor_data_ready_irq { u8 mask_int2; u8 addr_ihl; u8 mask_ihl; + u8 addr_od; + u8 mask_od; u8 addr_stat_drdy; struct { u8 en_addr; @@ -215,6 +219,7 @@ struct st_sensor_settings { * @odr: Output data rate of the sensor [Hz]. * num_data_channels: Number of data channels used in buffer. * @drdy_int_pin: Redirect DRDY on pin 1 (1) or pin 2 (2). + * @int_pin_open_drain: Set the interrupt/DRDY to open drain. * @get_irq_data_ready: Function to get the IRQ used for data ready signal. * @tf: Transfer function structure used by I/O operations. * @tb: Transfer buffers and mutex used by I/O operations. 
@@ -236,6 +241,7 @@ struct st_sensor_data { unsigned int num_data_channels; u8 drdy_int_pin; + bool int_pin_open_drain; unsigned int (*get_irq_data_ready) (struct iio_dev *indio_dev); diff --git a/include/linux/platform_data/st_sensors_pdata.h b/include/linux/platform_data/st_sensors_pdata.h index 753839187ba0..79b0e4cdb814 100644 --- a/include/linux/platform_data/st_sensors_pdata.h +++ b/include/linux/platform_data/st_sensors_pdata.h @@ -16,9 +16,11 @@ * @drdy_int_pin: Redirect DRDY on pin 1 (1) or pin 2 (2). * Available only for accelerometer and pressure sensors. * Accelerometer DRDY on LSM330 available only on pin 1 (see datasheet). + * @open_drain: set the interrupt line to be open drain if possible. */ struct st_sensors_platform_data { u8 drdy_int_pin; + bool open_drain; }; #endif /* ST_SENSORS_PDATA_H */ -- cgit v1.2.3 From 8bf872d8d261feefcdf67027522e3f717cad2bfe Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Wed, 6 Apr 2016 16:01:06 +0530 Subject: iio: core: Add devm_ APIs for iio_channel_{get,release} Some kernel drivers use the IIO framework to get a sensor value via an ADC or IIO hardware driver. The client driver gets an IIO channel with iio_channel_get() and releases it by calling iio_channel_release(). Add resource managed versions (devm_*) of these APIs so that a client calling devm_iio_channel_get() need not release the channel explicitly; the managed device framework does it when the driver is unbound. This reduces the code in the error path and, in some cases, the need for a .remove callback. Signed-off-by: Laxman Dewangan Signed-off-by: Jonathan Cameron --- drivers/iio/inkern.c | 48 ++++++++++++++++++++++++++++++++++++++++ include/linux/iio/consumer.h | 27 +++++++++++++++++++++++++ 2 files changed, 75 insertions(+) (limited to 'include/linux') diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c index 2fc7928f401d..9fd8934c1887 100644 --- a/drivers/iio/inkern.c +++ b/drivers/iio/inkern.c @@ -356,6 +356,54 @@ void iio_channel_release(struct iio_channel *channel) } EXPORT_SYMBOL_GPL(iio_channel_release); +static void devm_iio_channel_free(struct device *dev, void *res) +{ + struct iio_channel *channel = *(struct iio_channel **)res; + + iio_channel_release(channel); +} + +static int devm_iio_channel_match(struct device *dev, void *res, void *data) +{ + struct iio_channel **r = res; + + if (!r || !*r) { + WARN_ON(!r || !*r); + return 0; + } + + return *r == data; +} + +struct iio_channel *devm_iio_channel_get(struct device *dev, + const char *channel_name) +{ + struct iio_channel **ptr, *channel; + + ptr = devres_alloc(devm_iio_channel_free, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + channel = iio_channel_get(dev, channel_name); + if (IS_ERR(channel)) { + devres_free(ptr); + return channel; + } + + *ptr = channel; + devres_add(dev, ptr); + + return channel; +} +EXPORT_SYMBOL_GPL(devm_iio_channel_get); + +void devm_iio_channel_release(struct device *dev, struct iio_channel *channel) +{ + WARN_ON(devres_release(dev, devm_iio_channel_free, + devm_iio_channel_match, channel)); +} +EXPORT_SYMBOL_GPL(devm_iio_channel_release); + struct iio_channel *iio_channel_get_all(struct device *dev) { const char *name; diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index fad58671c49e..e1e033d6a81f 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -48,6 +48,33 @@ struct iio_channel *iio_channel_get(struct device *dev, */ void iio_channel_release(struct iio_channel *chan); +/** + * devm_iio_channel_get() - 
Resource managed version of iio_channel_get(). + * @dev: Pointer to consumer device. Device name must match + * the name of the device as provided in the iio_map + * with which the desired provider to consumer mapping + * was registered. + * @consumer_channel: Unique name to identify the channel on the consumer + * side. This typically describes the channel's use within + * the consumer. E.g. 'battery_voltage' + * + * Returns an error pointer (negative errno) if it is not able to get the iio + * channel, otherwise returns a valid iio channel pointer. + * + * The allocated iio channel is automatically released when the device is + * unbound. + */ +struct iio_channel *devm_iio_channel_get(struct device *dev, + const char *consumer_channel); +/** + * devm_iio_channel_release() - Resource managed version of + * iio_channel_release(). + * @dev: Pointer to consumer device for which resource + * is allocated. + * @chan: The channel to be released. + */ +void devm_iio_channel_release(struct device *dev, struct iio_channel *chan); + /** * iio_channel_get_all() - get all channels associated with a client * @dev: Pointer to consumer device. -- cgit v1.2.3 From efc2c0133f198bc65593a67015af358919b0c48f Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Wed, 6 Apr 2016 16:01:07 +0530 Subject: iio: core: Add devm_ APIs for iio_channel_{get,release}_all Some kernel drivers use the IIO framework to get a sensor value via an ADC or IIO hardware driver. The client driver gets the IIO channels with iio_channel_get_all() and releases them by calling iio_channel_release_all(). Add resource managed versions (devm_*) of these APIs so that a client calling devm_iio_channel_get_all() need not release the channels explicitly; the managed device framework does it when the driver is unbound. This reduces the code in the error path and, in some cases, the need for a .remove callback.
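A minimal sketch of the intended usage in a consumer driver's probe (the driver name is hypothetical and actual channel consumption is elided):

	static int foo_probe(struct platform_device *pdev)
	{
		struct iio_channel *channels;

		channels = devm_iio_channel_get_all(&pdev->dev);
		if (IS_ERR(channels))
			return PTR_ERR(channels);

		/*
		 * Use the channels; the array is terminated by an entry
		 * with a null indio_dev. Nothing to release on unbind,
		 * devres takes care of it.
		 */
		return 0;
	}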
Signed-off-by: Laxman Dewangan Signed-off-by: Jonathan Cameron --- drivers/iio/inkern.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/iio/consumer.h | 26 ++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'include/linux') diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c index 9fd8934c1887..c4757e6367e7 100644 --- a/drivers/iio/inkern.c +++ b/drivers/iio/inkern.c @@ -489,6 +489,42 @@ void iio_channel_release_all(struct iio_channel *channels) } EXPORT_SYMBOL_GPL(iio_channel_release_all); +static void devm_iio_channel_free_all(struct device *dev, void *res) +{ + struct iio_channel *channels = *(struct iio_channel **)res; + + iio_channel_release_all(channels); +} + +struct iio_channel *devm_iio_channel_get_all(struct device *dev) +{ + struct iio_channel **ptr, *channels; + + ptr = devres_alloc(devm_iio_channel_free_all, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + channels = iio_channel_get_all(dev); + if (IS_ERR(channels)) { + devres_free(ptr); + return channels; + } + + *ptr = channels; + devres_add(dev, ptr); + + return channels; +} +EXPORT_SYMBOL_GPL(devm_iio_channel_get_all); + +void devm_iio_channel_release_all(struct device *dev, + struct iio_channel *channels) +{ + WARN_ON(devres_release(dev, devm_iio_channel_free_all, + devm_iio_channel_match, channels)); +} +EXPORT_SYMBOL_GPL(devm_iio_channel_release_all); + static int iio_channel_read(struct iio_channel *chan, int *val, int *val2, enum iio_chan_info_enum info) { diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index e1e033d6a81f..3d672f72e7ec 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -92,6 +92,32 @@ struct iio_channel *iio_channel_get_all(struct device *dev); */ void iio_channel_release_all(struct iio_channel *chan); +/** + * devm_iio_channel_get_all() - Resource managed version of + * iio_channel_get_all(). + * @dev: Pointer to consumer device. + * + * Returns an error pointer (negative errno) if it is not able to get the iio + * channels, otherwise returns an array of iio_channel structures terminated + * with one with a null iio_dev pointer. + * + * This function is used by fairly generic consumers to get all the + * channels registered as having this consumer. + * + * The allocated iio channels are automatically released when the device is + * unbound. + */ +struct iio_channel *devm_iio_channel_get_all(struct device *dev); + +/** + * devm_iio_channel_release_all() - Resource managed version of + * iio_channel_release_all(). + * @dev: Pointer to consumer device for which resource + * is allocated. + * @chan: Array of channels to be released. + */ +void devm_iio_channel_release_all(struct device *dev, struct iio_channel *chan); + struct iio_cb_buffer; /** * iio_channel_get_all_cb() - register callback for triggered capture -- cgit v1.2.3 From b0fcd8ab7b3c89b5da7fff5224d06ed73e7a33cc Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Wed, 23 Mar 2016 11:19:00 +0100 Subject: mtd: nand: add new enum for storing ECC algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our nand_ecc_modes_t is already a bit abused by value NAND_ECC_SOFT_BCH. This enum should store ECC mode only, and putting algorithm details there is a bad idea. It would result in too many values impossible to support in a sane way. To solve this problem let's add a new enum. We'll have to modify all drivers to set it properly, but once it's done it'll be possible to drop NAND_ECC_SOFT_BCH. That will result in a cleaner design and more possibilities, like setting the ECC algorithm for hardware ECC mode.
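For instance, a driver that currently requests software BCH would, once converted, express it as (a sketch, not taken from any in-tree driver; "chip" is assumed to be a struct nand_chip pointer):

	chip->ecc.mode = NAND_ECC_SOFT;
	chip->ecc.algo = NAND_ECC_BCH;	/* instead of NAND_ECC_SOFT_BCH */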
That will result in a cleaner design and more possibilities like setting ECC algorithm for hardware ECC mode. Signed-off-by: Rafał Miłecki Signed-off-by: Boris Brezillon --- include/linux/mtd/nand.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 56574ba36555..1b673e19667c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -119,6 +119,12 @@ typedef enum { NAND_ECC_SOFT_BCH, } nand_ecc_modes_t; +enum nand_ecc_algo { + NAND_ECC_UNKNOWN, + NAND_ECC_HAMMING, + NAND_ECC_BCH, +}; + /* * Constants for Hardware ECC */ @@ -458,6 +464,7 @@ struct nand_hw_control { /** * struct nand_ecc_ctrl - Control structure for ECC * @mode: ECC mode + * @algo: ECC algorithm * @steps: number of ECC steps per page * @size: data bytes per ECC step * @bytes: ECC bytes per step @@ -508,6 +515,7 @@ struct nand_hw_control { */ struct nand_ecc_ctrl { nand_ecc_modes_t mode; + enum nand_ecc_algo algo; int steps; int size; int bytes; -- cgit v1.2.3 From dd2dcc004230b9d8fa809102cd326e3ee4bbdb2a Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Wed, 23 Mar 2016 11:19:01 +0100 Subject: of: mtd: prepare helper reading NAND ECC algo from DT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NAND subsystem is being slightly reworked to store ECC details in separated fields. In future we'll want to add support for more DT properties as specifying every possible setup with a single "nand-ecc-mode" is a pretty bad idea. To allow this let's add a helper that will support something like "nand-ecc-algo" in future. Right now we use it for keeping backward compatibility. Signed-off-by: Rafał Miłecki Signed-off-by: Boris Brezillon --- drivers/of/of_mtd.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/of_mtd.h | 6 ++++++ 2 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/of_mtd.c b/drivers/of/of_mtd.c index b7361ed70537..15d056e181d2 100644 --- a/drivers/of/of_mtd.c +++ b/drivers/of/of_mtd.c @@ -49,6 +49,42 @@ int of_get_nand_ecc_mode(struct device_node *np) } EXPORT_SYMBOL_GPL(of_get_nand_ecc_mode); +/** + * of_get_nand_ecc_algo - Get nand ecc algorithm for given device_node + * @np: Pointer to the given device_node + * + * The function gets ecc algorithm and returns its enum value, or errno in error + * case. + */ +int of_get_nand_ecc_algo(struct device_node *np) +{ + const char *pm; + int err; + + /* + * TODO: Read ECC algo OF property and map it to enum nand_ecc_algo. + * It's not implemented yet as currently NAND subsystem ignores + * algorithm explicitly set this way. Once it's handled we should + * document & support new property. + */ + + /* + * For backward compatibility we also read "nand-ecc-mode" checking + * for some obsoleted values that were specifying ECC algorithm. + */ + err = of_property_read_string(np, "nand-ecc-mode", &pm); + if (err < 0) + return err; + + if (!strcasecmp(pm, "soft")) + return NAND_ECC_HAMMING; + else if (!strcasecmp(pm, "soft_bch")) + return NAND_ECC_BCH; + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(of_get_nand_ecc_algo); + /** * of_get_nand_ecc_step_size - Get ECC step size associated to * the required ECC strength (see below). 
diff --git a/include/linux/of_mtd.h b/include/linux/of_mtd.h index e266caa36402..0f6aca5c6f2f 100644 --- a/include/linux/of_mtd.h +++ b/include/linux/of_mtd.h @@ -13,6 +13,7 @@ #include int of_get_nand_ecc_mode(struct device_node *np); +int of_get_nand_ecc_algo(struct device_node *np); int of_get_nand_ecc_step_size(struct device_node *np); int of_get_nand_ecc_strength(struct device_node *np); int of_get_nand_bus_width(struct device_node *np); @@ -25,6 +26,11 @@ static inline int of_get_nand_ecc_mode(struct device_node *np) return -ENOSYS; } +static inline int of_get_nand_ecc_algo(struct device_node *np) +{ + return -ENOSYS; +} + static inline int of_get_nand_ecc_step_size(struct device_node *np) { return -ENOSYS; -- cgit v1.2.3 From 7a654172161c8c9c7d59cbd0054d9e63c7411219 Mon Sep 17 00:00:00 2001 From: Raghav Dogra Date: Wed, 17 Feb 2016 16:54:18 +0530 Subject: mtd/ifc: Add support for IFC controller version 2.0 The new IFC controller version 2.0 has a different register memory map page. Up to IFC 1.4 the page size is 4 KB, and from IFC 2.0 the page size is 64 KB. This patch segregates the IFC global and runtime registers into the appropriate page sizes. Signed-off-by: Jaiprakash Singh Signed-off-by: Raghav Dogra Acked-by: Li Yang Signed-off-by: Raghav Dogra Acked-by: Scott Wood Acked-by: Brian Norris Signed-off-by: Boris Brezillon --- drivers/memory/fsl_ifc.c | 36 ++++++++++----------- drivers/mtd/nand/fsl_ifc_nand.c | 72 ++++++++++++++++++++++------------------- include/linux/fsl_ifc.h | 45 +++++++++++++++++--------- 3 files changed, 87 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/drivers/memory/fsl_ifc.c b/drivers/memory/fsl_ifc.c index 2a691da8c1c7..904b4af5f142 100644 --- a/drivers/memory/fsl_ifc.c +++ b/drivers/memory/fsl_ifc.c @@ -59,11 +59,11 @@ int fsl_ifc_find(phys_addr_t addr_base) { int i = 0; - if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->regs) + if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->gregs) return -ENODEV; for (i = 0; i < fsl_ifc_ctrl_dev->banks; i++) { - u32 cspr = ifc_in32(&fsl_ifc_ctrl_dev->regs->cspr_cs[i].cspr); + u32 cspr = ifc_in32(&fsl_ifc_ctrl_dev->gregs->cspr_cs[i].cspr); if (cspr & CSPR_V && (cspr & CSPR_BA) == convert_ifc_address(addr_base)) return i; @@ -75,7 +75,7 @@ EXPORT_SYMBOL(fsl_ifc_find); static int fsl_ifc_ctrl_init(struct fsl_ifc_ctrl *ctrl) { - struct fsl_ifc_regs __iomem *ifc = ctrl->regs; + struct fsl_ifc_global __iomem *ifc = ctrl->gregs; /* * Clear all the common status and event registers @@ -104,7 +104,7 @@ static int fsl_ifc_ctrl_remove(struct platform_device *dev) irq_dispose_mapping(ctrl->nand_irq); irq_dispose_mapping(ctrl->irq); - iounmap(ctrl->regs); + iounmap(ctrl->gregs); dev_set_drvdata(&dev->dev, NULL); kfree(ctrl); @@ -122,7 +122,7 @@ static DEFINE_SPINLOCK(nand_irq_lock); static u32 check_nand_stat(struct fsl_ifc_ctrl *ctrl) { - struct fsl_ifc_regs __iomem *ifc = ctrl->regs; + struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs; unsigned long flags; u32 stat; @@ -157,7 +157,7 @@ static irqreturn_t fsl_ifc_nand_irq(int irqno, void *data) static irqreturn_t fsl_ifc_ctrl_irq(int irqno, void *data) { struct fsl_ifc_ctrl *ctrl = data; - struct fsl_ifc_regs __iomem *ifc = ctrl->regs; + struct fsl_ifc_global __iomem *ifc = ctrl->gregs; u32 err_axiid, err_srcid, status, cs_err, err_addr; irqreturn_t ret = IRQ_NONE; @@ -215,6 +215,7 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev) { int ret = 0; int version, banks; + void __iomem *addr; dev_info(&dev->dev, "Freescale Integrated Flash Controller\n"); @@ -225,22 +226,13 @@
static int fsl_ifc_ctrl_probe(struct platform_device *dev) dev_set_drvdata(&dev->dev, fsl_ifc_ctrl_dev); /* IOMAP the entire IFC region */ - fsl_ifc_ctrl_dev->regs = of_iomap(dev->dev.of_node, 0); - if (!fsl_ifc_ctrl_dev->regs) { + fsl_ifc_ctrl_dev->gregs = of_iomap(dev->dev.of_node, 0); + if (!fsl_ifc_ctrl_dev->gregs) { dev_err(&dev->dev, "failed to get memory region\n"); ret = -ENODEV; goto err; } - version = ifc_in32(&fsl_ifc_ctrl_dev->regs->ifc_rev) & - FSL_IFC_VERSION_MASK; - banks = (version == FSL_IFC_VERSION_1_0_0) ? 4 : 8; - dev_info(&dev->dev, "IFC version %d.%d, %d banks\n", - version >> 24, (version >> 16) & 0xf, banks); - - fsl_ifc_ctrl_dev->version = version; - fsl_ifc_ctrl_dev->banks = banks; - if (of_property_read_bool(dev->dev.of_node, "little-endian")) { fsl_ifc_ctrl_dev->little_endian = true; dev_dbg(&dev->dev, "IFC REGISTERS are LITTLE endian\n"); @@ -249,8 +241,9 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev) dev_dbg(&dev->dev, "IFC REGISTERS are BIG endian\n"); } - version = ioread32be(&fsl_ifc_ctrl_dev->regs->ifc_rev) & + version = ifc_in32(&fsl_ifc_ctrl_dev->gregs->ifc_rev) & FSL_IFC_VERSION_MASK; + banks = (version == FSL_IFC_VERSION_1_0_0) ? 4 : 8; dev_info(&dev->dev, "IFC version %d.%d, %d banks\n", version >> 24, (version >> 16) & 0xf, banks); @@ -258,6 +251,13 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev) fsl_ifc_ctrl_dev->version = version; fsl_ifc_ctrl_dev->banks = banks; + addr = fsl_ifc_ctrl_dev->gregs; + if (version >= FSL_IFC_VERSION_2_0_0) + addr += PGOFFSET_64K; + else + addr += PGOFFSET_4K; + fsl_ifc_ctrl_dev->rregs = addr; + /* get the Controller level irq */ fsl_ifc_ctrl_dev->irq = irq_of_parse_and_map(dev->dev.of_node, 0); if (fsl_ifc_ctrl_dev->irq == 0) { diff --git a/drivers/mtd/nand/fsl_ifc_nand.c b/drivers/mtd/nand/fsl_ifc_nand.c index 43f5a3a4873f..f8a016f038cd 100644 --- a/drivers/mtd/nand/fsl_ifc_nand.c +++ b/drivers/mtd/nand/fsl_ifc_nand.c @@ -232,7 +232,7 @@ static void set_addr(struct mtd_info *mtd, int column, int page_addr, int oob) struct nand_chip *chip = mtd_to_nand(mtd); struct fsl_ifc_mtd *priv = nand_get_controller_data(chip); struct fsl_ifc_ctrl *ctrl = priv->ctrl; - struct fsl_ifc_regs __iomem *ifc = ctrl->regs; + struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs; int buf_num; ifc_nand_ctrl->page = page_addr; @@ -295,7 +295,7 @@ static void fsl_ifc_run_command(struct mtd_info *mtd) struct fsl_ifc_mtd *priv = nand_get_controller_data(chip); struct fsl_ifc_ctrl *ctrl = priv->ctrl; struct fsl_ifc_nand_ctrl *nctrl = ifc_nand_ctrl; - struct fsl_ifc_regs __iomem *ifc = ctrl->regs; + struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs; u32 eccstat[4]; int i; @@ -371,7 +371,7 @@ static void fsl_ifc_do_read(struct nand_chip *chip, { struct fsl_ifc_mtd *priv = nand_get_controller_data(chip); struct fsl_ifc_ctrl *ctrl = priv->ctrl; - struct fsl_ifc_regs __iomem *ifc = ctrl->regs; + struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs; /* Program FIR/IFC_NAND_FCR0 for Small/Large page */ if (mtd->writesize > 512) { @@ -411,7 +411,7 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command, struct nand_chip *chip = mtd_to_nand(mtd); struct fsl_ifc_mtd *priv = nand_get_controller_data(chip); struct fsl_ifc_ctrl *ctrl = priv->ctrl; - struct fsl_ifc_regs __iomem *ifc = ctrl->regs; + struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs; /* clear the read buffer */ ifc_nand_ctrl->read_bytes = 0; @@ -723,7 +723,7 @@ static int fsl_ifc_wait(struct mtd_info *mtd, struct nand_chip *chip) { struct fsl_ifc_mtd 
*priv = nand_get_controller_data(chip); struct fsl_ifc_ctrl *ctrl = priv->ctrl; - struct fsl_ifc_regs __iomem *ifc = ctrl->regs; + struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs; u32 nand_fsr; /* Use READ_STATUS command, but wait for the device to be ready */ @@ -825,39 +825,42 @@ static int fsl_ifc_chip_init_tail(struct mtd_info *mtd) static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv) { struct fsl_ifc_ctrl *ctrl = priv->ctrl; - struct fsl_ifc_regs __iomem *ifc = ctrl->regs; + struct fsl_ifc_runtime __iomem *ifc_runtime = ctrl->rregs; + struct fsl_ifc_global __iomem *ifc_global = ctrl->gregs; uint32_t csor = 0, csor_8k = 0, csor_ext = 0; uint32_t cs = priv->bank; /* Save CSOR and CSOR_ext */ - csor = ifc_in32(&ifc->csor_cs[cs].csor); - csor_ext = ifc_in32(&ifc->csor_cs[cs].csor_ext); + csor = ifc_in32(&ifc_global->csor_cs[cs].csor); + csor_ext = ifc_in32(&ifc_global->csor_cs[cs].csor_ext); /* chage PageSize 8K and SpareSize 1K*/ csor_8k = (csor & ~(CSOR_NAND_PGS_MASK)) | 0x0018C000; - ifc_out32(csor_8k, &ifc->csor_cs[cs].csor); - ifc_out32(0x0000400, &ifc->csor_cs[cs].csor_ext); + ifc_out32(csor_8k, &ifc_global->csor_cs[cs].csor); + ifc_out32(0x0000400, &ifc_global->csor_cs[cs].csor_ext); /* READID */ ifc_out32((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) | - (IFC_FIR_OP_UA << IFC_NAND_FIR0_OP1_SHIFT) | - (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT), - &ifc->ifc_nand.nand_fir0); + (IFC_FIR_OP_UA << IFC_NAND_FIR0_OP1_SHIFT) | + (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT), + &ifc_runtime->ifc_nand.nand_fir0); ifc_out32(NAND_CMD_READID << IFC_NAND_FCR0_CMD0_SHIFT, - &ifc->ifc_nand.nand_fcr0); - ifc_out32(0x0, &ifc->ifc_nand.row3); + &ifc_runtime->ifc_nand.nand_fcr0); + ifc_out32(0x0, &ifc_runtime->ifc_nand.row3); - ifc_out32(0x0, &ifc->ifc_nand.nand_fbcr); + ifc_out32(0x0, &ifc_runtime->ifc_nand.nand_fbcr); /* Program ROW0/COL0 */ - ifc_out32(0x0, &ifc->ifc_nand.row0); - ifc_out32(0x0, &ifc->ifc_nand.col0); + ifc_out32(0x0, &ifc_runtime->ifc_nand.row0); + ifc_out32(0x0, &ifc_runtime->ifc_nand.col0); /* set the chip select for NAND Transaction */ - ifc_out32(cs << IFC_NAND_CSEL_SHIFT, &ifc->ifc_nand.nand_csel); + ifc_out32(cs << IFC_NAND_CSEL_SHIFT, + &ifc_runtime->ifc_nand.nand_csel); /* start read seq */ - ifc_out32(IFC_NAND_SEQ_STRT_FIR_STRT, &ifc->ifc_nand.nandseq_strt); + ifc_out32(IFC_NAND_SEQ_STRT_FIR_STRT, + &ifc_runtime->ifc_nand.nandseq_strt); /* wait for command complete flag or timeout */ wait_event_timeout(ctrl->nand_wait, ctrl->nand_stat, @@ -867,14 +870,15 @@ static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv) printk(KERN_ERR "fsl-ifc: Failed to Initialise SRAM\n"); /* Restore CSOR and CSOR_ext */ - ifc_out32(csor, &ifc->csor_cs[cs].csor); - ifc_out32(csor_ext, &ifc->csor_cs[cs].csor_ext); + ifc_out32(csor, &ifc_global->csor_cs[cs].csor); + ifc_out32(csor_ext, &ifc_global->csor_cs[cs].csor_ext); } static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv) { struct fsl_ifc_ctrl *ctrl = priv->ctrl; - struct fsl_ifc_regs __iomem *ifc = ctrl->regs; + struct fsl_ifc_global __iomem *ifc_global = ctrl->gregs; + struct fsl_ifc_runtime __iomem *ifc_runtime = ctrl->rregs; struct nand_chip *chip = &priv->chip; struct mtd_info *mtd = nand_to_mtd(&priv->chip); struct nand_ecclayout *layout; @@ -886,7 +890,8 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv) /* fill in nand_chip structure */ /* set up function call table */ - if ((ifc_in32(&ifc->cspr_cs[priv->bank].cspr)) & CSPR_PORT_SIZE_16) + if ((ifc_in32(&ifc_global->cspr_cs[priv->bank].cspr)) + & CSPR_PORT_SIZE_16) 
chip->read_byte = fsl_ifc_read_byte16; else chip->read_byte = fsl_ifc_read_byte; @@ -900,13 +905,14 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv) chip->bbt_td = &bbt_main_descr; chip->bbt_md = &bbt_mirror_descr; - ifc_out32(0x0, &ifc->ifc_nand.ncfgr); + ifc_out32(0x0, &ifc_runtime->ifc_nand.ncfgr); /* set up nand options */ chip->bbt_options = NAND_BBT_USE_FLASH; chip->options = NAND_NO_SUBPAGE_WRITE; - if (ifc_in32(&ifc->cspr_cs[priv->bank].cspr) & CSPR_PORT_SIZE_16) { + if (ifc_in32(&ifc_global->cspr_cs[priv->bank].cspr) + & CSPR_PORT_SIZE_16) { chip->read_byte = fsl_ifc_read_byte16; chip->options |= NAND_BUSWIDTH_16; } else { @@ -919,7 +925,7 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv) chip->ecc.read_page = fsl_ifc_read_page; chip->ecc.write_page = fsl_ifc_write_page; - csor = ifc_in32(&ifc->csor_cs[priv->bank].csor); + csor = ifc_in32(&ifc_global->csor_cs[priv->bank].csor); /* Hardware generates ECC per 512 Bytes */ chip->ecc.size = 512; @@ -1007,10 +1013,10 @@ static int fsl_ifc_chip_remove(struct fsl_ifc_mtd *priv) return 0; } -static int match_bank(struct fsl_ifc_regs __iomem *ifc, int bank, +static int match_bank(struct fsl_ifc_global __iomem *ifc_global, int bank, phys_addr_t addr) { - u32 cspr = ifc_in32(&ifc->cspr_cs[bank].cspr); + u32 cspr = ifc_in32(&ifc_global->cspr_cs[bank].cspr); if (!(cspr & CSPR_V)) return 0; @@ -1024,7 +1030,7 @@ static DEFINE_MUTEX(fsl_ifc_nand_mutex); static int fsl_ifc_nand_probe(struct platform_device *dev) { - struct fsl_ifc_regs __iomem *ifc; + struct fsl_ifc_runtime __iomem *ifc; struct fsl_ifc_mtd *priv; struct resource res; static const char *part_probe_types[] @@ -1034,9 +1040,9 @@ static int fsl_ifc_nand_probe(struct platform_device *dev) struct device_node *node = dev->dev.of_node; struct mtd_info *mtd; - if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->regs) + if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->rregs) return -ENODEV; - ifc = fsl_ifc_ctrl_dev->regs; + ifc = fsl_ifc_ctrl_dev->rregs; /* get, allocate and map the memory resource */ ret = of_address_to_resource(node, 0, &res); @@ -1047,7 +1053,7 @@ static int fsl_ifc_nand_probe(struct platform_device *dev) /* find which chip select it is connected to */ for (bank = 0; bank < fsl_ifc_ctrl_dev->banks; bank++) { - if (match_bank(ifc, bank, res.start)) + if (match_bank(fsl_ifc_ctrl_dev->gregs, bank, res.start)) break; } diff --git a/include/linux/fsl_ifc.h b/include/linux/fsl_ifc.h index 0023088b253b..3f9778cbc79d 100644 --- a/include/linux/fsl_ifc.h +++ b/include/linux/fsl_ifc.h @@ -39,6 +39,10 @@ #define FSL_IFC_VERSION_MASK 0x0F0F0000 #define FSL_IFC_VERSION_1_0_0 0x01000000 #define FSL_IFC_VERSION_1_1_0 0x01010000 +#define FSL_IFC_VERSION_2_0_0 0x02000000 + +#define PGOFFSET_64K (64*1024) +#define PGOFFSET_4K (4*1024) /* * CSPR - Chip Select Property Register @@ -723,20 +727,26 @@ struct fsl_ifc_nand { __be32 nand_evter_en; u32 res17[0x2]; __be32 nand_evter_intr_en; - u32 res18[0x2]; + __be32 nand_vol_addr_stat; + u32 res18; __be32 nand_erattr0; __be32 nand_erattr1; u32 res19[0x10]; __be32 nand_fsr; - u32 res20; - __be32 nand_eccstat[4]; - u32 res21[0x20]; + u32 res20[0x3]; + __be32 nand_eccstat[6]; + u32 res21[0x1c]; __be32 nanndcr; u32 res22[0x2]; __be32 nand_autoboot_trgr; u32 res23; __be32 nand_mdr; - u32 res24[0x5C]; + u32 res24[0x1C]; + __be32 nand_dll_lowcfg0; + __be32 nand_dll_lowcfg1; + u32 res25; + __be32 nand_dll_lowstat; + u32 res26[0x3c]; }; /* @@ -771,13 +781,12 @@ struct fsl_ifc_gpcm { __be32 gpcm_erattr1; __be32 gpcm_erattr2; __be32 gpcm_stat; - u32 
res4[0x1F3]; }; /* * IFC Controller Registers */ -struct fsl_ifc_regs { +struct fsl_ifc_global { __be32 ifc_rev; u32 res1[0x2]; struct { @@ -803,21 +812,26 @@ struct fsl_ifc_regs { } ftim_cs[FSL_IFC_BANK_COUNT]; u32 res9[0x30]; __be32 rb_stat; - u32 res10[0x2]; + __be32 rb_map; + __be32 wb_map; __be32 ifc_gcr; - u32 res11[0x2]; + u32 res10[0x2]; __be32 cm_evter_stat; - u32 res12[0x2]; + u32 res11[0x2]; __be32 cm_evter_en; - u32 res13[0x2]; + u32 res12[0x2]; __be32 cm_evter_intr_en; - u32 res14[0x2]; + u32 res13[0x2]; __be32 cm_erattr0; __be32 cm_erattr1; - u32 res15[0x2]; + u32 res14[0x2]; __be32 ifc_ccr; __be32 ifc_csr; - u32 res16[0x2EB]; + __be32 ddr_ccr_low; +}; + + +struct fsl_ifc_runtime { struct fsl_ifc_nand ifc_nand; struct fsl_ifc_nor ifc_nor; struct fsl_ifc_gpcm ifc_gpcm; @@ -831,7 +845,8 @@ extern int fsl_ifc_find(phys_addr_t addr_base); struct fsl_ifc_ctrl { /* device info */ struct device *dev; - struct fsl_ifc_regs __iomem *regs; + struct fsl_ifc_global __iomem *gregs; + struct fsl_ifc_runtime __iomem *rregs; int irq; int nand_irq; spinlock_t lock; -- cgit v1.2.3 From 9d02fc2a5129449581c3108c260e96377cf35f7e Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 26 Aug 2015 16:08:12 +0200 Subject: mtd: nand: export default read/write oob functions Export the default read/write oob functions (for the standard and syndrome scheme), so that drivers can use them for their raw implementation and implement their own functions for the normal oob operation. This is required if your ECC engine is capable of fixing some of the OOB data. In this case you have to overload the ->read_oob() and ->write_oob(), but if you don't specify the ->read/write_oob_raw() functions they are assigned to the ->read/write_oob() implementation, which is not what you want. 
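A minimal wiring sketch (the foo_* driver is hypothetical; only the exported nand_read_oob_std()/nand_write_oob_std() helpers come from this patch) of a driver whose ECC engine also covers OOB data:

#include <linux/mtd/mtd.h>
#include <linux/mtd/nand.h>

static int foo_read_oob(struct mtd_info *mtd, struct nand_chip *chip, int page)
{
	/* A real driver would let its ECC engine check/fix the OOB data here. */
	return nand_read_oob_std(mtd, chip, page);
}

static int foo_write_oob(struct mtd_info *mtd, struct nand_chip *chip, int page)
{
	/* A real driver would generate the OOB ECC protection here. */
	return nand_write_oob_std(mtd, chip, page);
}

static void foo_nand_init(struct nand_chip *chip)
{
	/* ECC-aware OOB accessors provided by this driver. */
	chip->ecc.read_oob = foo_read_oob;
	chip->ecc.write_oob = foo_write_oob;

	/*
	 * Raw accessors must bypass the ECC engine, so wire them to the
	 * exported defaults instead of letting them inherit the ECC-aware
	 * ->read_oob()/->write_oob() implementations.
	 */
	chip->ecc.read_oob_raw = nand_read_oob_std;
	chip->ecc.write_oob_raw = nand_write_oob_std;
}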
Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 18 ++++++++++-------- include/linux/mtd/nand.h | 14 ++++++++++++++ 2 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0f0c5b190316..13fcddc8a10e 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1893,13 +1893,13 @@ static int nand_read(struct mtd_info *mtd, loff_t from, size_t len, * @chip: nand chip info structure * @page: page number to read */ -static int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, - int page) +int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page) { chip->cmdfunc(mtd, NAND_CMD_READOOB, 0, page); chip->read_buf(mtd, chip->oob_poi, mtd->oobsize); return 0; } +EXPORT_SYMBOL(nand_read_oob_std); /** * nand_read_oob_syndrome - [REPLACEABLE] OOB data read function for HW ECC @@ -1908,8 +1908,8 @@ static int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, * @chip: nand chip info structure * @page: page number to read */ -static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip, - int page) +int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip, + int page) { int length = mtd->oobsize; int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad; @@ -1937,6 +1937,7 @@ static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip, return 0; } +EXPORT_SYMBOL(nand_read_oob_syndrome); /** * nand_write_oob_std - [REPLACEABLE] the most common OOB data write function @@ -1944,8 +1945,7 @@ static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip, * @chip: nand chip info structure * @page: page number to write */ -static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, - int page) +int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page) { int status = 0; const uint8_t *buf = chip->oob_poi; @@ -1960,6 +1960,7 @@ static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, return status & NAND_STATUS_FAIL ? -EIO : 0; } +EXPORT_SYMBOL(nand_write_oob_std); /** * nand_write_oob_syndrome - [REPLACEABLE] OOB data write function for HW ECC @@ -1968,8 +1969,8 @@ static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, * @chip: nand chip info structure * @page: page number to write */ -static int nand_write_oob_syndrome(struct mtd_info *mtd, - struct nand_chip *chip, int page) +int nand_write_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip, + int page) { int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad; int eccsize = chip->ecc.size, length = mtd->oobsize; @@ -2019,6 +2020,7 @@ static int nand_write_oob_syndrome(struct mtd_info *mtd, return status & NAND_STATUS_FAIL ? 
-EIO : 0; } +EXPORT_SYMBOL(nand_write_oob_syndrome); /** * nand_do_read_oob - [INTERN] NAND read out-of-band diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 1b673e19667c..7e06afb8552c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1078,4 +1078,18 @@ int nand_check_erased_ecc_chunk(void *data, int datalen, void *ecc, int ecclen, void *extraoob, int extraooblen, int threshold); + +/* Default write_oob implementation */ +int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page); + +/* Default write_oob syndrome implementation */ +int nand_write_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip, + int page); + +/* Default read_oob implementation */ +int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page); + +/* Default read_oob syndrome implementation */ +int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip, + int page); #endif /* __LINUX_MTD_NAND_H */ -- cgit v1.2.3 From 75eb2cec251fda33c9bb716ecc372819abb9278a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 4 Feb 2016 09:52:30 +0100 Subject: mtd: add mtd_ooblayout_xxx() helper functions In order to make the ecclayout definition completely dynamic we need to rework the way OOB layouts are defined and iterated. Create a few mtd_ooblayout_xxx() helpers to ease OOB byte manipulation and hide ecclayout internals from their users. Signed-off-by: Boris Brezillon --- drivers/mtd/mtdcore.c | 400 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/mtd.h | 33 ++++ 2 files changed, 433 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index bee180bd11e7..0290c41e44fc 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1016,6 +1016,406 @@ int mtd_write_oob(struct mtd_info *mtd, loff_t to, } EXPORT_SYMBOL_GPL(mtd_write_oob); +/** + * mtd_ooblayout_ecc - Get the OOB region definition of a specific ECC section + * @mtd: MTD device structure + * @section: ECC section. Depending on the layout you may have all the ECC + * bytes stored in a single contiguous section, or one section + * per ECC chunk (and sometimes several sections for a single + * ECC chunk) + * @oobecc: OOB region struct filled with the appropriate ECC position + * information + * + * This function returns ECC section information in the OOB area. If you want + * to get all the ECC bytes information, then you should call + * mtd_ooblayout_ecc(mtd, section++, oobecc) until it returns -ERANGE. + * + * Returns zero on success, a negative error code otherwise. + */ +int mtd_ooblayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobecc) +{ + int eccbyte = 0, cursection = 0, length = 0, eccpos = 0; + + memset(oobecc, 0, sizeof(*oobecc)); + + if (!mtd || section < 0) + return -EINVAL; + + if (!mtd->ecclayout) + return -ENOTSUPP; + + /* + * This logic allows us to reuse the ->ecclayout information and + * expose them as ECC regions (as done for the OOB free regions). + * + * TODO: this should be dropped as soon as we get rid of the + * ->ecclayout field.
+ */ + for (eccbyte = 0; eccbyte < mtd->ecclayout->eccbytes; eccbyte++) { + eccpos = mtd->ecclayout->eccpos[eccbyte]; + + if (eccbyte < mtd->ecclayout->eccbytes - 1) { + int neccpos = mtd->ecclayout->eccpos[eccbyte + 1]; + + if (eccpos + 1 == neccpos) { + length++; + continue; + } + } + + if (section == cursection) + break; + + length = 0; + cursection++; + } + + if (cursection != section || eccbyte >= mtd->ecclayout->eccbytes) + return -ERANGE; + + oobecc->length = length + 1; + oobecc->offset = eccpos - length; + + return 0; +} +EXPORT_SYMBOL_GPL(mtd_ooblayout_ecc); + +/** + * mtd_ooblayout_free - Get the OOB region definition of a specific free + * section + * @mtd: MTD device structure + * @section: Free section you are interested in. Depending on the layout + * you may have all the free bytes stored in a single contiguous + * section, or one section per ECC chunk plus an extra section + * for the remaining bytes (or other funky layout). + * @oobfree: OOB region struct filled with the appropriate free position + * information + * + * This function returns free bytes position in the OOB area. If you want + * to get all the free bytes information, then you should call + * mtd_ooblayout_free(mtd, section++, oobfree) until it returns -ERANGE. + * + * Returns zero on success, a negative error code otherwise. + */ +int mtd_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobfree) +{ + memset(oobfree, 0, sizeof(*oobfree)); + + if (!mtd || section < 0) + return -EINVAL; + + if (!mtd->ecclayout) + return -ENOTSUPP; + + if (section >= MTD_MAX_OOBFREE_ENTRIES_LARGE) + return -ERANGE; + + oobfree->offset = mtd->ecclayout->oobfree[section].offset; + oobfree->length = mtd->ecclayout->oobfree[section].length; + + return 0; +} +EXPORT_SYMBOL_GPL(mtd_ooblayout_free); + +/** + * mtd_ooblayout_find_region - Find the region attached to a specific byte + * @mtd: mtd info structure + * @byte: the byte we are searching for + * @sectionp: pointer where the section id will be stored + * @oobregion: used to retrieve the ECC position + * @iter: iterator function. Should be either mtd_ooblayout_free or + * mtd_ooblayout_ecc depending on the region type you're searching for + * + * This function returns the section id and oobregion information of a + * specific byte. For example, say you want to know where the 4th ECC byte is + * stored, you'll use: + * + * mtd_ooblayout_find_region(mtd, 3, &section, &oobregion, mtd_ooblayout_ecc); + * + * Returns zero on success, a negative error code otherwise. + */ +static int mtd_ooblayout_find_region(struct mtd_info *mtd, int byte, + int *sectionp, struct mtd_oob_region *oobregion, + int (*iter)(struct mtd_info *, + int section, + struct mtd_oob_region *oobregion)) +{ + int pos = 0, ret, section = 0; + + memset(oobregion, 0, sizeof(*oobregion)); + + while (1) { + ret = iter(mtd, section, oobregion); + if (ret) + return ret; + + if (pos + oobregion->length > byte) + break; + + pos += oobregion->length; + section++; + } + + /* + * Adjust region info to make it start at the beginning of the + * 'start' ECC byte.
+ */ + oobregion->offset += byte - pos; + oobregion->length -= byte - pos; + *sectionp = section; + + return 0; +} + +/** + * mtd_ooblayout_find_eccregion - Find the ECC region attached to a specific + * ECC byte + * @mtd: mtd info structure + * @eccbyte: the byte we are searching for + * @section: pointer where the section id will be stored + * @oobregion: OOB region information + * + * Works like mtd_ooblayout_find_region() except it searches for a specific ECC + * byte. + * + * Returns zero on success, a negative error code otherwise. + */ +int mtd_ooblayout_find_eccregion(struct mtd_info *mtd, int eccbyte, + int *section, + struct mtd_oob_region *oobregion) +{ + return mtd_ooblayout_find_region(mtd, eccbyte, section, oobregion, + mtd_ooblayout_ecc); +} +EXPORT_SYMBOL_GPL(mtd_ooblayout_find_eccregion); + +/** + * mtd_ooblayout_get_bytes - Extract OOB bytes from the oob buffer + * @mtd: mtd info structure + * @buf: destination buffer to store OOB bytes + * @oobbuf: OOB buffer + * @start: first byte to retrieve + * @nbytes: number of bytes to retrieve + * @iter: section iterator + * + * Extract bytes attached to a specific category (ECC or free) + * from the OOB buffer and copy them into buf. + * + * Returns zero on success, a negative error code otherwise. + */ +static int mtd_ooblayout_get_bytes(struct mtd_info *mtd, u8 *buf, + const u8 *oobbuf, int start, int nbytes, + int (*iter)(struct mtd_info *, + int section, + struct mtd_oob_region *oobregion)) +{ + struct mtd_oob_region oobregion = { }; + int section = 0, ret; + + ret = mtd_ooblayout_find_region(mtd, start, &section, + &oobregion, iter); + + while (!ret) { + int cnt; + + cnt = oobregion.length > nbytes ? nbytes : oobregion.length; + memcpy(buf, oobbuf + oobregion.offset, cnt); + buf += cnt; + nbytes -= cnt; + + if (!nbytes) + break; + + ret = iter(mtd, ++section, &oobregion); + } + + return ret; +} + +/** + * mtd_ooblayout_set_bytes - put OOB bytes into the oob buffer + * @mtd: mtd info structure + * @buf: source buffer to get OOB bytes from + * @oobbuf: OOB buffer + * @start: first OOB byte to set + * @nbytes: number of OOB bytes to set + * @iter: section iterator + * + * Fill the OOB buffer with data provided in buf. The category (ECC or free) + * is selected by passing the appropriate iterator. + * + * Returns zero on success, a negative error code otherwise. + */ +static int mtd_ooblayout_set_bytes(struct mtd_info *mtd, const u8 *buf, + u8 *oobbuf, int start, int nbytes, + int (*iter)(struct mtd_info *, + int section, + struct mtd_oob_region *oobregion)) +{ + struct mtd_oob_region oobregion = { }; + int section = 0, ret; + + ret = mtd_ooblayout_find_region(mtd, start, &section, + &oobregion, iter); + + while (!ret) { + int cnt; + + cnt = oobregion.length > nbytes ? nbytes : oobregion.length; + memcpy(oobbuf + oobregion.offset, buf, cnt); + buf += cnt; + nbytes -= cnt; + + if (!nbytes) + break; + + ret = iter(mtd, ++section, &oobregion); + } + + return ret; +} + +/** + * mtd_ooblayout_count_bytes - count the number of bytes in an OOB category + * @mtd: mtd info structure + * @iter: category iterator + * + * Count the number of bytes in a given category. + * + * Returns a positive value on success, a negative error code otherwise.
+ */ +static int mtd_ooblayout_count_bytes(struct mtd_info *mtd, + int (*iter)(struct mtd_info *, + int section, + struct mtd_oob_region *oobregion)) +{ + struct mtd_oob_region oobregion = { }; + int section = 0, ret, nbytes = 0; + + while (1) { + ret = iter(mtd, section++, &oobregion); + if (ret) { + if (ret == -ERANGE) + ret = nbytes; + break; + } + + nbytes += oobregion.length; + } + + return ret; +} + +/** + * mtd_ooblayout_get_eccbytes - extract ECC bytes from the oob buffer + * @mtd: mtd info structure + * @eccbuf: destination buffer to store ECC bytes + * @oobbuf: OOB buffer + * @start: first ECC byte to retrieve + * @nbytes: number of ECC bytes to retrieve + * + * Works like mtd_ooblayout_get_bytes(), except it acts on ECC bytes. + * + * Returns zero on success, a negative error code otherwise. + */ +int mtd_ooblayout_get_eccbytes(struct mtd_info *mtd, u8 *eccbuf, + const u8 *oobbuf, int start, int nbytes) +{ + return mtd_ooblayout_get_bytes(mtd, eccbuf, oobbuf, start, nbytes, + mtd_ooblayout_ecc); +} +EXPORT_SYMBOL_GPL(mtd_ooblayout_get_eccbytes); + +/** + * mtd_ooblayout_set_eccbytes - set ECC bytes into the oob buffer + * @mtd: mtd info structure + * @eccbuf: source buffer to get ECC bytes from + * @oobbuf: OOB buffer + * @start: first ECC byte to set + * @nbytes: number of ECC bytes to set + * + * Works like mtd_ooblayout_set_bytes(), except it acts on ECC bytes. + * + * Returns zero on success, a negative error code otherwise. + */ +int mtd_ooblayout_set_eccbytes(struct mtd_info *mtd, const u8 *eccbuf, + u8 *oobbuf, int start, int nbytes) +{ + return mtd_ooblayout_set_bytes(mtd, eccbuf, oobbuf, start, nbytes, + mtd_ooblayout_ecc); +} +EXPORT_SYMBOL_GPL(mtd_ooblayout_set_eccbytes); + +/** + * mtd_ooblayout_get_databytes - extract data bytes from the oob buffer + * @mtd: mtd info structure + * @databuf: destination buffer to store data bytes + * @oobbuf: OOB buffer + * @start: first data byte to retrieve + * @nbytes: number of data bytes to retrieve + * + * Works like mtd_ooblayout_get_bytes(), except it acts on free bytes. + * + * Returns zero on success, a negative error code otherwise. + */ +int mtd_ooblayout_get_databytes(struct mtd_info *mtd, u8 *databuf, + const u8 *oobbuf, int start, int nbytes) +{ + return mtd_ooblayout_get_bytes(mtd, databuf, oobbuf, start, nbytes, + mtd_ooblayout_free); +} +EXPORT_SYMBOL_GPL(mtd_ooblayout_get_databytes); + +/** + * mtd_ooblayout_set_databytes - set data bytes into the oob buffer + * @mtd: mtd info structure + * @databuf: source buffer to get data bytes from + * @oobbuf: OOB buffer + * @start: first data byte to set + * @nbytes: number of data bytes to set + * + * Works like mtd_ooblayout_set_bytes(), except it acts on free bytes. + * + * Returns zero on success, a negative error code otherwise. + */ +int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf, + u8 *oobbuf, int start, int nbytes) +{ + return mtd_ooblayout_set_bytes(mtd, databuf, oobbuf, start, nbytes, + mtd_ooblayout_free); +} +EXPORT_SYMBOL_GPL(mtd_ooblayout_set_databytes); + +/** + * mtd_ooblayout_count_freebytes - count the number of free bytes in OOB + * @mtd: mtd info structure + * + * Works like mtd_ooblayout_count_bytes(), except it counts free bytes. + * + * Returns the number of free bytes on success, a negative error code + * otherwise.
+ */ +int mtd_ooblayout_count_freebytes(struct mtd_info *mtd) +{ + return mtd_ooblayout_count_bytes(mtd, mtd_ooblayout_free); +} +EXPORT_SYMBOL_GPL(mtd_ooblayout_count_freebytes); + +/** + * mtd_ooblayout_count_eccbytes - count the number of ECC bytes in OOB + * @mtd: mtd info structure + * + * Works like mtd_ooblayout_count_bytes(), except it counts ECC bytes. + * + * Returns the number of ECC bytes on success, a negative error code + * otherwise. + */ +int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd) +{ + return mtd_ooblayout_count_bytes(mtd, mtd_ooblayout_ecc); +} +EXPORT_SYMBOL_GPL(mtd_ooblayout_count_eccbytes); + /* * Method to access the protection register area, present in some flash * devices. The user data is one time programmable but the factory data is read diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index ef9fea4fc400..117ca1ff581d 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -108,6 +108,21 @@ struct nand_ecclayout { struct nand_oobfree oobfree[MTD_MAX_OOBFREE_ENTRIES_LARGE]; }; +/** + * struct mtd_oob_region - oob region definition + * @offset: region offset + * @length: region length + * + * This structure describes a region of the OOB area, and is used + * to retrieve ECC or free bytes sections. + * Each section is defined by an offset within the OOB area and a + * length. + */ +struct mtd_oob_region { + u32 offset; + u32 length; +}; + struct module; /* only needed for owner field in mtd_info */ struct mtd_info { @@ -253,6 +268,24 @@ struct mtd_info { int usecount; }; +int mtd_ooblayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobecc); +int mtd_ooblayout_find_eccregion(struct mtd_info *mtd, int eccbyte, + int *section, + struct mtd_oob_region *oobregion); +int mtd_ooblayout_get_eccbytes(struct mtd_info *mtd, u8 *eccbuf, + const u8 *oobbuf, int start, int nbytes); +int mtd_ooblayout_set_eccbytes(struct mtd_info *mtd, const u8 *eccbuf, + u8 *oobbuf, int start, int nbytes); +int mtd_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobfree); +int mtd_ooblayout_get_databytes(struct mtd_info *mtd, u8 *databuf, + const u8 *oobbuf, int start, int nbytes); +int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf, + u8 *oobbuf, int start, int nbytes); +int mtd_ooblayout_count_freebytes(struct mtd_info *mtd); +int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd); + static inline void mtd_set_of_node(struct mtd_info *mtd, struct device_node *np) { -- cgit v1.2.3 From 036d6543f85319ffe96afad6de73d3a220917a63 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 3 Feb 2016 18:53:44 +0100 Subject: mtd: add mtd_set_ecclayout() helper function Add an mtd_set_ecclayout() helper function to avoid direct accesses to the mtd->ecclayout field. This will ease future reworks of the ECC layout definition.
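A rough consumer-side sketch of the helpers introduced above (the foo_* wrapper is hypothetical; only the mtd_ooblayout_* calls are the API added by this patch), extracting all ECC bytes from a raw OOB dump:

#include <linux/mtd/mtd.h>

static int foo_extract_ecc(struct mtd_info *mtd, const u8 *oobbuf, u8 *eccbuf)
{
	int nbytes = mtd_ooblayout_count_eccbytes(mtd);

	if (nbytes < 0)
		return nbytes;

	/* Gather the ECC bytes, wherever the layout scatters them in OOB. */
	return mtd_ooblayout_get_eccbytes(mtd, eccbuf, oobbuf, 0, nbytes);
}

The caller is expected to ensure eccbuf can hold mtd_ooblayout_count_eccbytes(mtd) bytes; the helpers hide where those bytes actually sit in the OOB area.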
Signed-off-by: Boris Brezillon --- include/linux/mtd/mtd.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 117ca1ff581d..e62da8462493 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -286,6 +286,12 @@ int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf, int mtd_ooblayout_count_freebytes(struct mtd_info *mtd); int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd); +static inline void mtd_set_ecclayout(struct mtd_info *mtd, + struct nand_ecclayout *ecclayout) +{ + mtd->ecclayout = ecclayout; +} + static inline void mtd_set_of_node(struct mtd_info *mtd, struct device_node *np) { -- cgit v1.2.3 From adbbc3bc827eb1f43a932d783f09ba55c8ec8379 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 3 Feb 2016 19:01:31 +0100 Subject: mtd: create an mtd_ooblayout_ops struct to ease ECC layout definition ECC layout definitions are currently exposed using the nand_ecclayout struct which embeds oobfree and eccpos arrays with predefined size. This approach was acceptable when NAND chips were providing relatively small OOB regions, but MLC and TLC now provide OOB regions of several hundreds of bytes, which implies a non negligible overhead for everybody even those who only need to support legacy NANDs. Create an mtd_ooblayout_ops interface providing the same functionality (expose the ECC and oobfree layout) without the need for this huge structure. The mtd->ecclayout is now deprecated and should be replaced by the equivalent mtd_ooblayout_ops. In the meantime we provide a wrapper around the ->ecclayout field to ease migration to this new model. Signed-off-by: Boris Brezillon --- drivers/mtd/mtdchar.c | 4 +- drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/mtdcore.c | 165 +++++++++++++++++++++++++++++++++++------------- drivers/mtd/mtdpart.c | 23 ++++++- include/linux/mtd/mtd.h | 32 ++++++++-- 5 files changed, 174 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index cd64ab76dd7b..3fad2c7425b0 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -888,7 +888,7 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg) { struct nand_oobinfo oi; - if (!mtd->ecclayout) + if (!mtd->ooblayout) return -EOPNOTSUPP; ret = get_oobinfo(mtd, &oi); @@ -982,7 +982,7 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg) { struct nand_ecclayout_user *usrlay; - if (!mtd->ecclayout) + if (!mtd->ooblayout) return -EOPNOTSUPP; usrlay = kmalloc(sizeof(*usrlay), GFP_KERNEL); diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 481565e5fbaa..d573606b91c2 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -777,7 +777,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c } - mtd_set_ecclayout(&concat->mtd, subdev[0]->ecclayout); + mtd_set_ooblayout(&concat->mtd, subdev[0]->ooblayout); concat->num_subdev = num_devs; concat->mtd.name = name; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 0290c41e44fc..134ed2f7b919 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1035,49 +1035,15 @@ EXPORT_SYMBOL_GPL(mtd_write_oob); int mtd_ooblayout_ecc(struct mtd_info *mtd, int section, struct mtd_oob_region *oobecc) { - int eccbyte = 0, cursection = 0, length = 0, eccpos = 0; - memset(oobecc, 0, sizeof(*oobecc)); if (!mtd || section < 0) return -EINVAL; - if (!mtd->ecclayout) + if (!mtd->ooblayout || 
!mtd->ooblayout->ecc) return -ENOTSUPP; - /* - * This logic allows us to reuse the ->ecclayout information and - * expose them as ECC regions (as done for the OOB free regions). - * - * TODO: this should be dropped as soon as we get rid of the - * ->ecclayout field. - */ - for (eccbyte = 0; eccbyte < mtd->ecclayout->eccbytes; eccbyte++) { - eccpos = mtd->ecclayout->eccpos[eccbyte]; - - if (eccbyte < mtd->ecclayout->eccbytes - 1) { - int neccpos = mtd->ecclayout->eccpos[eccbyte + 1]; - - if (eccpos + 1 == neccpos) { - length++; - continue; - } - } - - if (section == cursection) - break; - - length = 0; - cursection++; - } - - if (cursection != section || eccbyte >= mtd->ecclayout->eccbytes) - return -ERANGE; - - oobecc->length = length + 1; - oobecc->offset = eccpos - length; - - return 0; + return mtd->ooblayout->ecc(mtd, section, oobecc); } EXPORT_SYMBOL_GPL(mtd_ooblayout_ecc); @@ -1106,16 +1072,10 @@ int mtd_ooblayout_free(struct mtd_info *mtd, int section, if (!mtd || section < 0) return -EINVAL; - if (!mtd->ecclayout) + if (!mtd->ooblayout || !mtd->ooblayout->free) return -ENOTSUPP; - if (section >= MTD_MAX_OOBFREE_ENTRIES_LARGE) - return -ERANGE; - - oobfree->offset = mtd->ecclayout->oobfree[section].offset; - oobfree->length = mtd->ecclayout->oobfree[section].length; - - return 0; + return mtd->ooblayout->free(mtd, section, oobfree); } EXPORT_SYMBOL_GPL(mtd_ooblayout_free); @@ -1416,6 +1376,123 @@ int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd) } EXPORT_SYMBOL_GPL(mtd_ooblayout_count_eccbytes); +/** + * mtd_ecclayout_ecc - Default ooblayout_ecc iterator implementation + * @mtd: MTD device structure + * @section: ECC section. Depending on the layout you may have all the ECC + * bytes stored in a single contiguous section, or one section + * per ECC chunk (and sometime several sections for a single ECC + * ECC chunk) + * @oobecc: OOB region struct filled with the appropriate ECC position + * information + * + * This function is just a wrapper around the mtd->ecclayout field and is + * here to ease the transition to the mtd_ooblayout_ops approach. + * All it does is convert the layout->eccpos information into proper oob + * region definitions. + * + * Returns zero on success, a negative error code otherwise. + */ +static int mtd_ecclayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobecc) +{ + int eccbyte = 0, cursection = 0, length = 0, eccpos = 0; + + if (!mtd->ecclayout) + return -ENOTSUPP; + + /* + * This logic allows us to reuse the ->ecclayout information and + * expose them as ECC regions (as done for the OOB free regions). + * + * TODO: this should be dropped as soon as we get rid of the + * ->ecclayout field. + */ + for (eccbyte = 0; eccbyte < mtd->ecclayout->eccbytes; eccbyte++) { + eccpos = mtd->ecclayout->eccpos[eccbyte]; + + if (eccbyte < mtd->ecclayout->eccbytes - 1) { + int neccpos = mtd->ecclayout->eccpos[eccbyte + 1]; + + if (eccpos + 1 == neccpos) { + length++; + continue; + } + } + + if (section == cursection) + break; + + length = 0; + cursection++; + } + + if (cursection != section || eccbyte >= mtd->ecclayout->eccbytes) + return -ERANGE; + + oobecc->length = length + 1; + oobecc->offset = eccpos - length; + + return 0; +} + +/** + * mtd_ecclayout_ecc - Default ooblayout_free iterator implementation + * @mtd: MTD device structure + * @section: Free section. 
Depending on the layout you may have all the free + * bytes stored in a single contiguous section, or one section + * per ECC chunk (and sometime several sections for a single ECC + * ECC chunk) + * @oobfree: OOB region struct filled with the appropriate free position + * information + * + * This function is just a wrapper around the mtd->ecclayout field and is + * here to ease the transition to the mtd_ooblayout_ops approach. + * All it does is convert the layout->oobfree information into proper oob + * region definitions. + * + * Returns zero on success, a negative error code otherwise. + */ +static int mtd_ecclayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobfree) +{ + struct nand_ecclayout *layout = mtd->ecclayout; + + if (!layout) + return -ENOTSUPP; + + if (section >= MTD_MAX_OOBFREE_ENTRIES_LARGE || + !layout->oobfree[section].length) + return -ERANGE; + + oobfree->offset = layout->oobfree[section].offset; + oobfree->length = layout->oobfree[section].length; + + return 0; +} + +static const struct mtd_ooblayout_ops mtd_ecclayout_wrapper_ops = { + .ecc = mtd_ecclayout_ecc, + .free = mtd_ecclayout_free, +}; + +/** + * mtd_set_ecclayout - Attach an ecclayout to an MTD device + * @mtd: MTD device structure + * @ecclayout: The ecclayout to attach to the device + * + * Returns zero on success, a negative error code otherwise. + */ +void mtd_set_ecclayout(struct mtd_info *mtd, struct nand_ecclayout *ecclayout) +{ + if (!mtd || !ecclayout) + return; + + mtd->ecclayout = ecclayout; + mtd_set_ooblayout(mtd, &mtd_ecclayout_wrapper_ops); +} +EXPORT_SYMBOL_GPL(mtd_set_ecclayout); + /* * Method to access the protection register area, present in some flash * devices. The user data is one time programmable but the factory data is read diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index f53d9d72b23a..1f13e32556f8 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -317,6 +317,27 @@ static int part_block_markbad(struct mtd_info *mtd, loff_t ofs) return res; } +static int part_ooblayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + struct mtd_part *part = mtd_to_part(mtd); + + return mtd_ooblayout_ecc(part->master, section, oobregion); +} + +static int part_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + struct mtd_part *part = mtd_to_part(mtd); + + return mtd_ooblayout_free(part->master, section, oobregion); +} + +static const struct mtd_ooblayout_ops part_ooblayout_ops = { + .ecc = part_ooblayout_ecc, + .free = part_ooblayout_free, +}; + static inline void free_partition(struct mtd_part *p) { kfree(p->mtd.name); @@ -533,7 +554,7 @@ static struct mtd_part *allocate_partition(struct mtd_info *master, part->name); } - mtd_set_ecclayout(&slave->mtd, master->ecclayout); + mtd_set_ooblayout(&slave->mtd, &part_ooblayout_ops); slave->mtd.ecc_step_size = master->ecc_step_size; slave->mtd.ecc_strength = master->ecc_strength; slave->mtd.bitflip_threshold = master->bitflip_threshold; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index e62da8462493..177bf314ad70 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -101,6 +101,9 @@ struct mtd_oob_ops { * similar, smaller struct nand_ecclayout_user (in mtd-abi.h) that is retained * for export to user-space via the ECCGETLAYOUT ioctl. * nand_ecclayout should be expandable in the future simply by the above macros. 
+ * + * This structure is now deprecated, you should use struct nand_ecclayout_ops + * to describe your OOB layout. */ struct nand_ecclayout { __u32 eccbytes; @@ -123,6 +126,22 @@ struct mtd_oob_region { u32 length; }; +/* + * struct mtd_ooblayout_ops - NAND OOB layout operations + * @ecc: function returning an ECC region in the OOB area. + * Should return -ERANGE if %section exceeds the total number of + * ECC sections. + * @free: function returning a free region in the OOB area. + * Should return -ERANGE if %section exceeds the total number of + * free sections. + */ +struct mtd_ooblayout_ops { + int (*ecc)(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobecc); + int (*free)(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobfree); +}; + struct module; /* only needed for owner field in mtd_info */ struct mtd_info { @@ -181,9 +200,12 @@ struct mtd_info { const char *name; int index; - /* ECC layout structure pointer - read only! */ + /* [Deprecated] ECC layout structure pointer - read only! */ struct nand_ecclayout *ecclayout; + /* OOB layout description */ + const struct mtd_ooblayout_ops *ooblayout; + /* the ecc step size. */ unsigned int ecc_step_size; @@ -286,10 +308,12 @@ int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf, int mtd_ooblayout_count_freebytes(struct mtd_info *mtd); int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd); -static inline void mtd_set_ecclayout(struct mtd_info *mtd, - struct nand_ecclayout *ecclayout) +void mtd_set_ecclayout(struct mtd_info *mtd, struct nand_ecclayout *ecclayout); + +static inline void mtd_set_ooblayout(struct mtd_info *mtd, + const struct mtd_ooblayout_ops *ooblayout) { - mtd->ecclayout = ecclayout; + mtd->ooblayout = ooblayout; } static inline void mtd_set_of_node(struct mtd_info *mtd, -- cgit v1.2.3 From 41b207a70d3a86b9e2eede155e87838234c7cbd5 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 3 Feb 2016 19:06:15 +0100 Subject: mtd: nand: implement the default mtd_ooblayout_ops Replace the default nand_ecclayout definitions for large and small page devices with the equivalent mtd_ooblayout_ops. Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 142 ++++++++++++++++++++++++++++--------------- include/linux/mtd/nand.h | 3 + 2 files changed, 96 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index feb1448e9dd6..e8332ea45739 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -47,54 +47,96 @@ #include #include +static int nand_get_device(struct mtd_info *mtd, int new_state); + +static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, + struct mtd_oob_ops *ops); + /* Define default oob placement schemes for large and small page devices */ -static struct nand_ecclayout nand_oob_8 = { - .eccbytes = 3, - .eccpos = {0, 1, 2}, - .oobfree = { - {.offset = 3, - .length = 2}, - {.offset = 6, - .length = 2} } -}; +static int nand_ooblayout_ecc_sp(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct nand_ecc_ctrl *ecc = &chip->ecc; -static struct nand_ecclayout nand_oob_16 = { - .eccbytes = 6, - .eccpos = {0, 1, 2, 3, 6, 7}, - .oobfree = { - {.offset = 8, - . 
length = 8} } -}; + if (section > 1) + return -ERANGE; -static struct nand_ecclayout nand_oob_64 = { - .eccbytes = 24, - .eccpos = { - 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 60, 61, 62, 63}, - .oobfree = { - {.offset = 2, - .length = 38} } -}; + if (!section) { + oobregion->offset = 0; + oobregion->length = 4; + } else { + oobregion->offset = 6; + oobregion->length = ecc->total - 4; + } -static struct nand_ecclayout nand_oob_128 = { - .eccbytes = 48, - .eccpos = { - 80, 81, 82, 83, 84, 85, 86, 87, - 88, 89, 90, 91, 92, 93, 94, 95, - 96, 97, 98, 99, 100, 101, 102, 103, - 104, 105, 106, 107, 108, 109, 110, 111, - 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127}, - .oobfree = { - {.offset = 2, - .length = 78} } + return 0; +} + +static int nand_ooblayout_free_sp(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + if (section > 1) + return -ERANGE; + + if (mtd->oobsize == 16) { + if (section) + return -ERANGE; + + oobregion->length = 8; + oobregion->offset = 8; + } else { + oobregion->length = 2; + if (!section) + oobregion->offset = 3; + else + oobregion->offset = 6; + } + + return 0; +} + +const struct mtd_ooblayout_ops nand_ooblayout_sp_ops = { + .ecc = nand_ooblayout_ecc_sp, + .free = nand_ooblayout_free_sp, }; +EXPORT_SYMBOL_GPL(nand_ooblayout_sp_ops); -static int nand_get_device(struct mtd_info *mtd, int new_state); +static int nand_ooblayout_ecc_lp(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct nand_ecc_ctrl *ecc = &chip->ecc; -static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, - struct mtd_oob_ops *ops); + if (section) + return -ERANGE; + + oobregion->length = ecc->total; + oobregion->offset = mtd->oobsize - oobregion->length; + + return 0; +} + +static int nand_ooblayout_free_lp(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + struct nand_chip *chip = mtd_to_nand(mtd); + struct nand_ecc_ctrl *ecc = &chip->ecc; + + if (section) + return -ERANGE; + + oobregion->length = mtd->oobsize - ecc->total - 2; + oobregion->offset = 2; + + return 0; +} + +const struct mtd_ooblayout_ops nand_ooblayout_lp_ops = { + .ecc = nand_ooblayout_ecc_lp, + .free = nand_ooblayout_free_lp, +}; +EXPORT_SYMBOL_GPL(nand_ooblayout_lp_ops); static int check_offs_len(struct mtd_info *mtd, loff_t ofs, uint64_t len) @@ -4109,22 +4151,25 @@ int nand_scan_tail(struct mtd_info *mtd) /* Set the internal oob buffer location, just after the page data */ chip->oob_poi = chip->buffers->databuf + mtd->writesize; + /* + * Set the provided ECC layout. If ecc->layout is NULL, the MTD core + * will just leave mtd->ooblayout to NULL, if it's not NULL, it will + * set ->ooblayout to the default ecclayout wrapper. + */ + mtd_set_ecclayout(mtd, ecc->layout); + /* * If no default placement scheme is given, select an appropriate one. 
*/ - if (!ecc->layout && (ecc->mode != NAND_ECC_SOFT_BCH)) { + if (!mtd->ooblayout && (ecc->mode != NAND_ECC_SOFT_BCH)) { switch (mtd->oobsize) { case 8: - ecc->layout = &nand_oob_8; - break; case 16: - ecc->layout = &nand_oob_16; + mtd_set_ooblayout(mtd, &nand_ooblayout_sp_ops); break; case 64: - ecc->layout = &nand_oob_64; - break; case 128: - ecc->layout = &nand_oob_128; + mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops); break; default: WARN(1, "No oob scheme defined for oobsize %d\n", @@ -4285,7 +4330,6 @@ int nand_scan_tail(struct mtd_info *mtd) ecc->write_oob_raw = ecc->write_oob; /* propagate ecc info to mtd_info */ - mtd_set_ecclayout(mtd, ecc->layout); mtd->ecc_strength = ecc->strength; mtd->ecc_step_size = ecc->size; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 7e06afb8552c..f2ded7b1b3b8 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -748,6 +748,9 @@ struct nand_chip { void *priv; }; +extern const struct mtd_ooblayout_ops nand_ooblayout_sp_ops; +extern const struct mtd_ooblayout_ops nand_ooblayout_lp_ops; + static inline void nand_set_flash_node(struct nand_chip *chip, struct device_node *np) { -- cgit v1.2.3 From 74e1fbb1375a3ede3e17da22911761ce9bc8f53f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2016 17:49:17 +0200 Subject: of: Introduce struct of_phandle_iterator This struct carrys all necessary information to iterate over a list of phandles and extract the arguments. Add an init-function for the iterator and make use of it in __of_parse_phandle_with_args(). Signed-off-by: Joerg Roedel Signed-off-by: Rob Herring --- drivers/of/base.c | 99 +++++++++++++++++++++++++++++++++--------------------- include/linux/of.h | 33 ++++++++++++++++++ 2 files changed, 93 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index b299de2b3afa..1c6f43b5737d 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1440,35 +1440,56 @@ void of_print_phandle_args(const char *msg, const struct of_phandle_args *args) printk("\n"); } +int of_phandle_iterator_init(struct of_phandle_iterator *it, + const struct device_node *np, + const char *list_name, + const char *cells_name, + int cell_count) +{ + const __be32 *list; + int size; + + memset(it, 0, sizeof(*it)); + + list = of_get_property(np, list_name, &size); + if (!list) + return -ENOENT; + + it->cells_name = cells_name; + it->cell_count = cell_count; + it->parent = np; + it->list_end = list + size / sizeof(*list); + it->phandle_end = list; + it->cur = list; + + return 0; +} + static int __of_parse_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name, int cell_count, int index, struct of_phandle_args *out_args) { - const __be32 *list, *list_end; - int rc = 0, size, cur_index = 0; - uint32_t count = 0; - struct device_node *node = NULL; - phandle phandle; + struct of_phandle_iterator it; + int rc, cur_index = 0; - /* Retrieve the phandle list property */ - list = of_get_property(np, list_name, &size); - if (!list) - return -ENOENT; - list_end = list + size / sizeof(*list); + rc = of_phandle_iterator_init(&it, np, list_name, + cells_name, cell_count); + if (rc) + return rc; /* Loop over the phandles until all the requested entry is found */ - while (list < list_end) { + while (it.cur < it.list_end) { rc = -EINVAL; - count = 0; + it.cur_count = 0; /* * If phandle is 0, then it is an empty entry with no * arguments. Skip forward to the next entry. 
*/ - phandle = be32_to_cpup(list++); - if (phandle) { + it.phandle = be32_to_cpup(it.cur++); + if (it.phandle) { /* * Find the provider node and parse the #*-cells * property to determine the argument length. @@ -1478,34 +1499,34 @@ static int __of_parse_phandle_with_args(const struct device_node *np, * except when we're going to return the found node * below. */ - if (cells_name || cur_index == index) { - node = of_find_node_by_phandle(phandle); - if (!node) { + if (it.cells_name || cur_index == index) { + it.node = of_find_node_by_phandle(it.phandle); + if (!it.node) { pr_err("%s: could not find phandle\n", - np->full_name); + it.parent->full_name); goto err; } } - if (cells_name) { - if (of_property_read_u32(node, cells_name, - &count)) { + if (it.cells_name) { + if (of_property_read_u32(it.node, it.cells_name, + &it.cur_count)) { pr_err("%s: could not get %s for %s\n", - np->full_name, cells_name, - node->full_name); + it.parent->full_name, it.cells_name, + it.node->full_name); goto err; } } else { - count = cell_count; + it.cur_count = it.cell_count; } /* * Make sure that the arguments actually fit in the * remaining property data length */ - if (list + count > list_end) { + if (it.cur + it.cur_count > it.list_end) { pr_err("%s: arguments longer than property\n", - np->full_name); + it.parent->full_name); goto err; } } @@ -1518,28 +1539,28 @@ static int __of_parse_phandle_with_args(const struct device_node *np, */ rc = -ENOENT; if (cur_index == index) { - if (!phandle) + if (!it.phandle) goto err; if (out_args) { int i; - if (WARN_ON(count > MAX_PHANDLE_ARGS)) - count = MAX_PHANDLE_ARGS; - out_args->np = node; - out_args->args_count = count; - for (i = 0; i < count; i++) - out_args->args[i] = be32_to_cpup(list++); + if (WARN_ON(it.cur_count > MAX_PHANDLE_ARGS)) + it.cur_count = MAX_PHANDLE_ARGS; + out_args->np = it.node; + out_args->args_count = it.cur_count; + for (i = 0; i < it.cur_count; i++) + out_args->args[i] = be32_to_cpup(it.cur++); } else { - of_node_put(node); + of_node_put(it.node); } /* Found it! return success */ return 0; } - of_node_put(node); - node = NULL; - list += count; + of_node_put(it.node); + it.node = NULL; + it.cur += it.cur_count; cur_index++; } @@ -1551,8 +1572,8 @@ static int __of_parse_phandle_with_args(const struct device_node *np, */ rc = index < 0 ? 
cur_index : -ENOENT; err: - if (node) - of_node_put(node); + if (it.node) + of_node_put(it.node); return rc; } diff --git a/include/linux/of.h b/include/linux/of.h index 7fcb681baadf..0f187dbb890b 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -75,6 +75,23 @@ struct of_phandle_args { uint32_t args[MAX_PHANDLE_ARGS]; }; +struct of_phandle_iterator { + /* Common iterator information */ + const char *cells_name; + int cell_count; + const struct device_node *parent; + + /* List size information */ + const __be32 *list_end; + const __be32 *phandle_end; + + /* Current position state */ + const __be32 *cur; + uint32_t cur_count; + phandle phandle; + struct device_node *node; +}; + struct of_reconfig_data { struct device_node *dn; struct property *prop; @@ -334,6 +351,13 @@ extern int of_parse_phandle_with_fixed_args(const struct device_node *np, extern int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name); +/* phandle iterator functions */ +extern int of_phandle_iterator_init(struct of_phandle_iterator *it, + const struct device_node *np, + const char *list_name, + const char *cells_name, + int cell_count); + extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)); extern int of_alias_get_id(struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); @@ -608,6 +632,15 @@ static inline int of_count_phandle_with_args(struct device_node *np, return -ENOSYS; } +static inline int of_phandle_iterator_init(struct of_phandle_iterator *it, + const struct device_node *np, + const char *list_name, + const char *cells_name, + int cell_count) +{ + return -ENOSYS; +} + static inline int of_alias_get_id(struct device_node *np, const char *stem) { return -ENOSYS; -- cgit v1.2.3 From cd209b412c8a5d632b51af1e45576f0d00b8105f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2016 17:49:18 +0200 Subject: of: Move phandle walking to of_phandle_iterator_next() Move the code to walk over the phandles out of the loop in __of_parse_phandle_with_args() to a separate function that just works with the iterator handle: of_phandle_iterator_next(). Signed-off-by: Joerg Roedel Signed-off-by: Rob Herring --- drivers/of/base.c | 130 ++++++++++++++++++++++++++++++----------------------- include/linux/of.h | 7 +++ 2 files changed, 81 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index 1c6f43b5737d..69286ec206f7 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1465,6 +1465,75 @@ int of_phandle_iterator_init(struct of_phandle_iterator *it, return 0; } +int of_phandle_iterator_next(struct of_phandle_iterator *it) +{ + uint32_t count = 0; + + if (it->node) { + of_node_put(it->node); + it->node = NULL; + } + + if (!it->cur || it->phandle_end >= it->list_end) + return -ENOENT; + + it->cur = it->phandle_end; + + /* If phandle is 0, then it is an empty entry with no arguments. */ + it->phandle = be32_to_cpup(it->cur++); + + if (it->phandle) { + + /* + * Find the provider node and parse the #*-cells property to + * determine the argument length. 
+ */ + it->node = of_find_node_by_phandle(it->phandle); + + if (it->cells_name) { + if (!it->node) { + pr_err("%s: could not find phandle\n", + it->parent->full_name); + goto err; + } + + if (of_property_read_u32(it->node, it->cells_name, + &count)) { + pr_err("%s: could not get %s for %s\n", + it->parent->full_name, + it->cells_name, + it->node->full_name); + goto err; + } + } else { + count = it->cell_count; + } + + /* + * Make sure that the arguments actually fit in the remaining + * property data length + */ + if (it->cur + count > it->list_end) { + pr_err("%s: arguments longer than property\n", + it->parent->full_name); + goto err; + } + } + + it->phandle_end = it->cur + count; + it->cur_count = count; + + return 0; + +err: + if (it->node) { + of_node_put(it->node); + it->node = NULL; + } + + return -EINVAL; +} + static int __of_parse_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name, @@ -1480,59 +1549,9 @@ static int __of_parse_phandle_with_args(const struct device_node *np, return rc; /* Loop over the phandles until all the requested entry is found */ - while (it.cur < it.list_end) { - rc = -EINVAL; - it.cur_count = 0; - - /* - * If phandle is 0, then it is an empty entry with no - * arguments. Skip forward to the next entry. - */ - it.phandle = be32_to_cpup(it.cur++); - if (it.phandle) { - /* - * Find the provider node and parse the #*-cells - * property to determine the argument length. - * - * This is not needed if the cell count is hard-coded - * (i.e. cells_name not set, but cell_count is set), - * except when we're going to return the found node - * below. - */ - if (it.cells_name || cur_index == index) { - it.node = of_find_node_by_phandle(it.phandle); - if (!it.node) { - pr_err("%s: could not find phandle\n", - it.parent->full_name); - goto err; - } - } - - if (it.cells_name) { - if (of_property_read_u32(it.node, it.cells_name, - &it.cur_count)) { - pr_err("%s: could not get %s for %s\n", - it.parent->full_name, it.cells_name, - it.node->full_name); - goto err; - } - } else { - it.cur_count = it.cell_count; - } - - /* - * Make sure that the arguments actually fit in the - * remaining property data length - */ - if (it.cur + it.cur_count > it.list_end) { - pr_err("%s: arguments longer than property\n", - it.parent->full_name); - goto err; - } - } - + while ((rc = of_phandle_iterator_next(&it)) == 0) { /* - * All of the error cases above bail out of the loop, so at + * All of the error cases bail out of the loop, so at * this point, the parsing is successful. If the requested * index matches, then fill the out_args structure and return, * or return -ENOENT for an empty entry. @@ -1558,9 +1577,6 @@ static int __of_parse_phandle_with_args(const struct device_node *np, return 0; } - of_node_put(it.node); - it.node = NULL; - it.cur += it.cur_count; cur_index++; } @@ -1570,7 +1586,9 @@ static int __of_parse_phandle_with_args(const struct device_node *np, * -EINVAL : parsing error on data * [1..n] : Number of phandle (count mode; when index = -1) */ - rc = index < 0 ? 
cur_index : -ENOENT; + if (rc == -ENOENT && index < 0) + rc = cur_index; + err: if (it.node) of_node_put(it.node); diff --git a/include/linux/of.h b/include/linux/of.h index 0f187dbb890b..1f5e108f6716 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -358,6 +358,8 @@ extern int of_phandle_iterator_init(struct of_phandle_iterator *it, const char *cells_name, int cell_count); +extern int of_phandle_iterator_next(struct of_phandle_iterator *it); + extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)); extern int of_alias_get_id(struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); @@ -641,6 +643,11 @@ static inline int of_phandle_iterator_init(struct of_phandle_iterator *it, return -ENOSYS; } +static inline int of_phandle_iterator_next(struct of_phandle_iterator *it) +{ + return -ENOSYS; +} + static inline int of_alias_get_id(struct device_node *np, const char *stem) { return -ENOSYS; -- cgit v1.2.3 From f623ce95a51baee6a6638f0b025efc0229a9ac0d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2016 17:49:20 +0200 Subject: of: Introduce of_for_each_phandle() helper macro With this macro any user can easily iterate over a list of phandles. The patch also converts __of_parse_phandle_with_args() to make use of the macro. The of_count_phandle_with_args() function is not converted, because the macro hides the return value of of_phandle_iterator_init(), which is needed in there. Signed-off-by: Joerg Roedel Signed-off-by: Rob Herring --- drivers/of/base.c | 7 +------ include/linux/of.h | 6 ++++++ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index fcff2b62ec10..ea5a13d3c5a5 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1543,13 +1543,8 @@ static int __of_parse_phandle_with_args(const struct device_node *np, struct of_phandle_iterator it; int rc, cur_index = 0; - rc = of_phandle_iterator_init(&it, np, list_name, - cells_name, cell_count); - if (rc) - return rc; - /* Loop over the phandles until all the requested entry is found */ - while ((rc = of_phandle_iterator_next(&it)) == 0) { + of_for_each_phandle(&it, rc, np, list_name, cells_name, cell_count) { /* * All of the error cases bail out of the loop, so at * this point, the parsing is successful. If the requested diff --git a/include/linux/of.h b/include/linux/of.h index 1f5e108f6716..b0b80716fbfb 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -908,6 +908,12 @@ static inline int of_property_read_s32(const struct device_node *np, return of_property_read_u32(np, propname, (u32*) out_value); } +#define of_for_each_phandle(it, err, np, ln, cn, cc) \ + for (of_phandle_iterator_init((it), (np), (ln), (cn), (cc)), \ + err = of_phandle_iterator_next(it); \ + err == 0; \ + err = of_phandle_iterator_next(it)) + #define of_property_for_each_u32(np, propname, prop, p, u) \ for (prop = of_find_property(np, propname, NULL), \ p = of_prop_next_u32(prop, NULL, &u); \ -- cgit v1.2.3 From abdaa77b18480361f3565d958a2acffad268c39c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2016 17:49:21 +0200 Subject: of: Introduce of_phandle_iterator_args() This helper function can be used to copy the arguments of a phandle to an array. 
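For illustration only (not part of the patch; the "clocks"/"#clock-cells" names are just an assumed binding, and np is some device node), a user of the iterator could collect each entry's arguments like this:

	struct of_phandle_iterator it;
	uint32_t args[MAX_PHANDLE_ARGS];
	int rc, count;

	of_for_each_phandle(&it, rc, np, "clocks", "#clock-cells", 0) {
		/* it.node is the provider node for this entry */
		count = of_phandle_iterator_args(&it, args, MAX_PHANDLE_ARGS);
		/* ... use it.node and args[0..count-1] ... */
	}

After the loop, rc is -ENOENT once the list has been walked to the end.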
Signed-off-by: Joerg Roedel Signed-off-by: Rob Herring --- drivers/of/base.c | 29 +++++++++++++++++++++++------ include/linux/of.h | 10 ++++++++++ 2 files changed, 33 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index ea5a13d3c5a5..e87e21df19d8 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1534,6 +1534,23 @@ err: return -EINVAL; } +int of_phandle_iterator_args(struct of_phandle_iterator *it, + uint32_t *args, + int size) +{ + int i, count; + + count = it->cur_count; + + if (WARN_ON(size < count)) + count = size; + + for (i = 0; i < count; i++) + args[i] = be32_to_cpup(it->cur++); + + return count; +} + static int __of_parse_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name, @@ -1557,13 +1574,13 @@ static int __of_parse_phandle_with_args(const struct device_node *np, goto err; if (out_args) { - int i; - if (WARN_ON(it.cur_count > MAX_PHANDLE_ARGS)) - it.cur_count = MAX_PHANDLE_ARGS; + int c; + + c = of_phandle_iterator_args(&it, + out_args->args, + MAX_PHANDLE_ARGS); out_args->np = it.node; - out_args->args_count = it.cur_count; - for (i = 0; i < it.cur_count; i++) - out_args->args[i] = be32_to_cpup(it.cur++); + out_args->args_count = c; } else { of_node_put(it.node); } diff --git a/include/linux/of.h b/include/linux/of.h index b0b80716fbfb..71e1c35a5960 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -359,6 +359,9 @@ extern int of_phandle_iterator_init(struct of_phandle_iterator *it, int cell_count); extern int of_phandle_iterator_next(struct of_phandle_iterator *it); +extern int of_phandle_iterator_args(struct of_phandle_iterator *it, + uint32_t *args, + int size); extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)); extern int of_alias_get_id(struct device_node *np, const char *stem); @@ -648,6 +651,13 @@ static inline int of_phandle_iterator_next(struct of_phandle_iterator *it) return -ENOSYS; } +static inline int of_phandle_iterator_args(struct of_phandle_iterator *it, + uint32_t *args, + int size) +{ + return 0; +} + static inline int of_alias_get_id(struct device_node *np, const char *stem) { return -ENOSYS; -- cgit v1.2.3 From d0bad49bb0a094a1beb06640785f95cb256b7272 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:55 -0600 Subject: tracing: Add enable_hist/disable_hist triggers Similar to enable_event/disable_event triggers, these triggers enable and disable the aggregation of events into maps rather than enabling and disabling their writing into the trace buffer. They can be used to automatically start and stop hist triggers based on a matching filter condition. If there's a paused hist trigger on system:event, the following would start it when the filter condition was hit: # echo enable_hist:system:event [ if filter] > event/trigger And the following would disable a running system:event hist trigger: # echo disable_hist:system:event [ if filter] > event/trigger See Documentation/trace/events.txt for real examples. 
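An illustrative end-to-end sequence (event and field names borrowed from the existing kmem/sched events; paths assume debugfs mounted at /sys/kernel/debug) is to create the hist trigger paused and let another event arm it:

# echo 'hist:keys=call_site:vals=bytes_req:pause' > \
      /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
# echo 'enable_hist:kmem:kmalloc if filename==/usr/bin/wget' > \
      /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger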
Link: http://lkml.kernel.org/r/f812f086e52c8b7c8ad5443487375e03c96a601f.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 1 + kernel/trace/trace.c | 8 +++ kernel/trace/trace.h | 32 ++++++++++ kernel/trace/trace_events_hist.c | 115 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_trigger.c | 71 ++++++++++++---------- 5 files changed, 196 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 404603720650..5f89a5b0c7e6 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -408,6 +408,7 @@ enum event_trigger_type { ETT_STACKTRACE = (1 << 2), ETT_EVENT_ENABLE = (1 << 3), ETT_EVENT_HIST = (1 << 4), + ETT_HIST_ENABLE = (1 << 5), }; extern int filter_match_preds(struct event_filter *filter, void *rec); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2238bfde799b..8430145bea12 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3807,6 +3807,10 @@ static const char readme_msg[] = "\t trigger: traceon, traceoff\n" "\t enable_event::\n" "\t disable_event::\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t enable_hist::\n" + "\t disable_hist::\n" +#endif #ifdef CONFIG_STACKTRACE "\t\t stacktrace\n" #endif @@ -3867,6 +3871,10 @@ static const char readme_msg[] = "\t The 'clear' parameter will clear the contents of a running\n" "\t hist trigger and leave its current paused/active state\n" "\t unchanged.\n\n" + "\t The enable_hist and disable_hist triggers can be used to\n" + "\t have one event conditionally start and stop another event's\n" + "\t already-attached hist trigger. The syntax is analogous to\n" + "\t the enable_event and disable_event triggers.\n" #endif ; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 505f8a45f426..cab1f4bfe85b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1166,8 +1166,10 @@ extern const struct file_operations event_hist_fops; #ifdef CONFIG_HIST_TRIGGERS extern int register_trigger_hist_cmd(void); +extern int register_trigger_hist_enable_disable_cmds(void); #else static inline int register_trigger_hist_cmd(void) { return 0; } +static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } #endif extern int register_trigger_cmds(void); @@ -1185,6 +1187,34 @@ struct event_trigger_data { struct list_head list; }; +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" +#define ENABLE_HIST_STR "enable_hist" +#define DISABLE_HIST_STR "disable_hist" + +struct enable_trigger_data { + struct trace_event_file *file; + bool enable; + bool hist; +}; + +extern int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern void event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern int event_enable_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); +extern int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file); +extern void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); extern int
event_trigger_init(struct event_trigger_ops *ops, struct event_trigger_data *data); @@ -1198,6 +1228,8 @@ extern int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, struct trace_event_file *file); extern int register_event_command(struct event_command *cmd); +extern int unregister_event_command(struct event_command *cmd); +extern int register_trigger_hist_enable_disable_cmds(void); /** * struct event_trigger_ops - callbacks for trace event triggers diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 4f4041d76926..5d4f02792440 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1393,3 +1393,118 @@ __init int register_trigger_hist_cmd(void) return ret; } + +static void +hist_enable_trigger(struct event_trigger_data *data, void *rec) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &enable_data->file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (enable_data->enable) + test->paused = false; + else + test->paused = true; + break; + } + } +} + +static void +hist_enable_count_trigger(struct event_trigger_data *data, void *rec) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + hist_enable_trigger(data, rec); +} + +static struct event_trigger_ops hist_enable_trigger_ops = { + .func = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_enable_count_trigger_ops = { + .func = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_disable_trigger_ops = { + .func = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_disable_count_trigger_ops = { + .func = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops * +hist_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + + enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); + + if (enable) + ops = param ? &hist_enable_count_trigger_ops : + &hist_enable_trigger_ops; + else + ops = param ? 
&hist_disable_count_trigger_ops : + &hist_disable_trigger_ops; + + return ops; +} + +static struct event_command trigger_hist_enable_cmd = { + .name = ENABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_hist_disable_cmd = { + .name = DISABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_hist_enable_disable_cmds(void) +{ + unregister_event_command(&trigger_hist_enable_cmd); + unregister_event_command(&trigger_hist_disable_cmd); +} + +__init int register_trigger_hist_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_hist_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_hist_enable_disable_cmds(); + + return ret; +} diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d29092afe005..d133f2094566 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -347,7 +347,7 @@ __init int register_event_command(struct event_command *cmd) * Currently we only unregister event commands from __init, so mark * this __init too. */ -static __init int unregister_event_command(struct event_command *cmd) +__init int unregister_event_command(struct event_command *cmd) { struct event_command *p, *n; int ret = -ENODEV; @@ -1062,15 +1062,6 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) unregister_event_command(&trigger_traceoff_cmd); } -/* Avoid typos */ -#define ENABLE_EVENT_STR "enable_event" -#define DISABLE_EVENT_STR "disable_event" - -struct enable_trigger_data { - struct trace_event_file *file; - bool enable; -}; - static void event_enable_trigger(struct event_trigger_data *data, void *rec) { @@ -1100,14 +1091,16 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec) event_enable_trigger(data, rec); } -static int -event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; seq_printf(m, "%s:%s:%s", - enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + enable_data->hist ? + (enable_data->enable ? ENABLE_HIST_STR : DISABLE_HIST_STR) : + (enable_data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR), enable_data->file->event_call->class->system, trace_event_name(enable_data->file->event_call)); @@ -1124,9 +1117,8 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, return 0; } -static void -event_enable_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +void event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1171,10 +1163,9 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { .free = event_enable_trigger_free, }; -static int -event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +int event_enable_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; @@ -1183,6 +1174,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, struct trace_array *tr = file->tr; const char *system; const char *event; + bool hist = false; char *trigger; char *number; bool enable; @@ -1207,8 +1199,15 @@ event_enable_trigger_func(struct event_command *cmd_ops, if (!event_enable_file) goto out; - enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#ifdef CONFIG_HIST_TRIGGERS + hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) || + (strcmp(cmd, DISABLE_HIST_STR) == 0)); + enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#endif trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); ret = -ENOMEM; @@ -1228,6 +1227,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, INIT_LIST_HEAD(&trigger_data->list); RCU_INIT_POINTER(trigger_data->filter, NULL); + enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; trigger_data->private_data = enable_data; @@ -1305,10 +1305,10 @@ event_enable_trigger_func(struct event_command *cmd_ops, goto out; } -static int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) { struct enable_trigger_data *enable_data = data->private_data; struct enable_trigger_data *test_enable_data; @@ -1318,6 +1318,8 @@ static int event_enable_register_trigger(char *glob, list_for_each_entry_rcu(test, &file->triggers, list) { test_enable_data = test->private_data; if (test_enable_data && + (test->cmd_ops->trigger_type == + data->cmd_ops->trigger_type) && (test_enable_data->file == enable_data->file)) { ret = -EEXIST; goto out; @@ -1343,10 +1345,10 @@ out: return ret; } -static void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct enable_trigger_data *test_enable_data = test->private_data; struct enable_trigger_data *enable_data; @@ -1356,6 +1358,8 @@ static void event_enable_unregister_trigger(char *glob, list_for_each_entry_rcu(data, &file->triggers, list) { enable_data = data->private_data; if (enable_data && + (data->cmd_ops->trigger_type == + 
test->cmd_ops->trigger_type) && (enable_data->file == test_enable_data->file)) { unregistered = true; list_del_rcu(&data->list); @@ -1375,8 +1379,12 @@ event_enable_get_trigger_ops(char *cmd, char *param) struct event_trigger_ops *ops; bool enable; +#ifdef CONFIG_HIST_TRIGGERS + enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; - +#endif if (enable) ops = param ? &event_enable_count_trigger_ops : &event_enable_trigger_ops; @@ -1447,6 +1455,7 @@ __init int register_trigger_cmds(void) register_trigger_snapshot_cmd(); register_trigger_stacktrace_cmd(); register_trigger_enable_disable_cmds(); + register_trigger_hist_enable_disable_cmds(); register_trigger_hist_cmd(); return 0; -- cgit v1.2.3 From c1d61c9bb163e696bf06850bcabbd26386554489 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 31 Mar 2016 16:34:32 -0600 Subject: PCI: Reverse standard ACS vs device-specific ACS enabling The original thought was that if a device implemented ACS, then surely we want to use that... well, it turns out that devices can make an ACS capability so broken that we still need to fall back to quirks. Reverse the order of ACS enabling to give quirks first shot at it. Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 10 ++++------ drivers/pci/quirks.c | 6 ++++-- include/linux/pci.h | 7 +++++-- 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 25e0327d4429..c98c4e2aed3c 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2547,7 +2547,7 @@ void pci_request_acs(void) * pci_std_enable_acs - enable ACS on devices using standard ACS capabilites * @dev: the PCI device */ -static int pci_std_enable_acs(struct pci_dev *dev) +static void pci_std_enable_acs(struct pci_dev *dev) { int pos; u16 cap; @@ -2555,7 +2555,7 @@ static int pci_std_enable_acs(struct pci_dev *dev) pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS); if (!pos) - return -ENODEV; + return; pci_read_config_word(dev, pos + PCI_ACS_CAP, &cap); pci_read_config_word(dev, pos + PCI_ACS_CTRL, &ctrl); @@ -2573,8 +2573,6 @@ static int pci_std_enable_acs(struct pci_dev *dev) ctrl |= (cap & PCI_ACS_UF); pci_write_config_word(dev, pos + PCI_ACS_CTRL, ctrl); - - return 0; } /** @@ -2586,10 +2584,10 @@ void pci_enable_acs(struct pci_dev *dev) if (!pci_acs_enable) return; - if (!pci_std_enable_acs(dev)) + if (!pci_dev_specific_enable_acs(dev)) return; - pci_dev_specific_enable_acs(dev); + pci_std_enable_acs(dev); } static bool pci_acs_flags_enabled(struct pci_dev *pdev, u16 acs_flags) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index e248c2aad000..1f5c7898a246 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4201,7 +4201,7 @@ static const struct pci_dev_enable_acs { { 0 } }; -void pci_dev_specific_enable_acs(struct pci_dev *dev) +int pci_dev_specific_enable_acs(struct pci_dev *dev) { const struct pci_dev_enable_acs *i; int ret; @@ -4213,9 +4213,11 @@ void pci_dev_specific_enable_acs(struct pci_dev *dev) i->device == (u16)PCI_ANY_ID)) { ret = i->enable_acs(dev); if (ret >= 0) - return; + return ret; } } + + return -ENOTTY; } /* diff --git a/include/linux/pci.h b/include/linux/pci.h index 004b8133417d..aaec79aee805 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1663,7 +1663,7 @@ enum pci_fixup_pass { #ifdef CONFIG_PCI_QUIRKS void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev); int 
pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags); -void pci_dev_specific_enable_acs(struct pci_dev *dev); +int pci_dev_specific_enable_acs(struct pci_dev *dev); #else static inline void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev) { } @@ -1672,7 +1672,10 @@ static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev, { return -ENOTTY; } -static inline void pci_dev_specific_enable_acs(struct pci_dev *dev) { } +static inline int pci_dev_specific_enable_acs(struct pci_dev *dev) +{ + return -ENOTTY; +} #endif void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen); -- cgit v1.2.3 From a14b9e0512404ed7d4415b888dc9f1f9785a4fa3 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 5 Feb 2016 16:40:47 -0800 Subject: clkdev: Remove clk_register_clkdevs() Now that we've converted the only caller over to another clkdev API, remove this one. Reviewed-by: Andy Shevchenko Cc: Russell King Signed-off-by: Stephen Boyd --- drivers/clk/clkdev.c | 27 --------------------------- include/linux/clkdev.h | 1 - 2 files changed, 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c index eb20b941154b..ae8e40a82d34 100644 --- a/drivers/clk/clkdev.c +++ b/drivers/clk/clkdev.c @@ -402,30 +402,3 @@ int clk_register_clkdev(struct clk *clk, const char *con_id, return cl ? 0 : -ENOMEM; } EXPORT_SYMBOL(clk_register_clkdev); - -/** - * clk_register_clkdevs - register a set of clk_lookup for a struct clk - * @clk: struct clk to associate with all clk_lookups - * @cl: array of clk_lookup structures with con_id and dev_id pre-initialized - * @num: number of clk_lookup structures to register - * - * To make things easier for mass registration, we detect error clks - * from a previous clk_register() call, and return the error code for - * those. This is to permit this function to be called immediately - * after clk_register(). - */ -int clk_register_clkdevs(struct clk *clk, struct clk_lookup *cl, size_t num) -{ - unsigned i; - - if (IS_ERR(clk)) - return PTR_ERR(clk); - - for (i = 0; i < num; i++, cl++) { - cl->clk_hw = __clk_get_hw(clk); - __clkdev_add(cl); - } - - return 0; -} -EXPORT_SYMBOL(clk_register_clkdevs); diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h index c2c04f7cbe8a..e6f8eb1d585f 100644 --- a/include/linux/clkdev.h +++ b/include/linux/clkdev.h @@ -45,7 +45,6 @@ void clkdev_add_table(struct clk_lookup *, size_t); int clk_add_alias(const char *, const char *, const char *, struct device *); int clk_register_clkdev(struct clk *, const char *, const char *); -int clk_register_clkdevs(struct clk *, struct clk_lookup *, size_t); #ifdef CONFIG_COMMON_CLK int __clk_get(struct clk *clk); -- cgit v1.2.3 From 4143804c4fdef40358c654d1fb2271a1a0f1fedf Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 5 Feb 2016 17:02:52 -0800 Subject: clk: Add {devm_}clk_hw_{register,unregister}() APIs We've largely split the clk consumer and provider APIs along struct clk and struct clk_hw, but clk_register() still returns a struct clk pointer for each struct clk_hw that's registered. Eventually we'd like to only allocate struct clks when there's a user, because struct clk is per-user now, so clk_register() needs to change. Let's add new APIs to register struct clk_hws, but this time we'll hide the struct clk from the caller by returning an int error code. Also add an unregistration API that takes the clk_hw structure that was passed to the registration API. 
This way provider drivers never have to deal with a struct clk pointer unless they're using the clk consumer APIs. Signed-off-by: Stephen Boyd --- Documentation/driver-model/devres.txt | 1 + drivers/clk/clk.c | 86 +++++++++++++++++++++++++++++++++++ include/linux/clk-provider.h | 6 +++ 3 files changed, 93 insertions(+) (limited to 'include/linux') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index 73b98dfbcea4..108d45553e1b 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -236,6 +236,7 @@ certainly invest a bit more effort into libata core layer). CLOCK devm_clk_get() devm_clk_put() + devm_clk_hw_register() DMA dmam_alloc_coherent() diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index fb74dc1f7520..0ef919666827 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -2536,6 +2536,22 @@ fail_out: } EXPORT_SYMBOL_GPL(clk_register); +/** + * clk_hw_register - register a clk_hw and return an error code + * @dev: device that is registering this clock + * @hw: link to hardware-specific clock data + * + * clk_hw_register is the primary interface for populating the clock tree with + * new clock nodes. It returns an integer equal to zero indicating success or + * less than zero indicating failure. Drivers must test for an error code after + * calling clk_hw_register(). + */ +int clk_hw_register(struct device *dev, struct clk_hw *hw) +{ + return PTR_ERR_OR_ZERO(clk_register(dev, hw)); +} +EXPORT_SYMBOL_GPL(clk_hw_register); + /* Free memory allocated for a clock. */ static void __clk_release(struct kref *ref) { @@ -2637,11 +2653,26 @@ unlock: } EXPORT_SYMBOL_GPL(clk_unregister); +/** + * clk_hw_unregister - unregister a currently registered clk_hw + * @hw: hardware-specific clock data to unregister + */ +void clk_hw_unregister(struct clk_hw *hw) +{ + clk_unregister(hw->clk); +} +EXPORT_SYMBOL_GPL(clk_hw_unregister); + static void devm_clk_release(struct device *dev, void *res) { clk_unregister(*(struct clk **)res); } +static void devm_clk_hw_release(struct device *dev, void *res) +{ + clk_hw_unregister(*(struct clk_hw **)res); +} + /** * devm_clk_register - resource managed clk_register() * @dev: device that is registering this clock @@ -2672,6 +2703,36 @@ struct clk *devm_clk_register(struct device *dev, struct clk_hw *hw) } EXPORT_SYMBOL_GPL(devm_clk_register); +/** + * devm_clk_hw_register - resource managed clk_hw_register() + * @dev: device that is registering this clock + * @hw: link to hardware-specific clock data + * + * Managed clk_hw_register(). Clocks registered by this function are + * automatically clk_hw_unregister()ed on driver detach. See clk_hw_register() + * for more information.
+ */ +int devm_clk_hw_register(struct device *dev, struct clk_hw *hw) +{ + struct clk_hw **hwp; + int ret; + + hwp = devres_alloc(devm_clk_hw_release, sizeof(*hwp), GFP_KERNEL); + if (!hwp) + return -ENOMEM; + + ret = clk_hw_register(dev, hw); + if (!ret) { + *hwp = hw; + devres_add(dev, hwp); + } else { + devres_free(hwp); + } + + return ret; +} +EXPORT_SYMBOL_GPL(devm_clk_hw_register); + static int devm_clk_match(struct device *dev, void *res, void *data) { struct clk *c = res; @@ -2680,6 +2741,15 @@ static int devm_clk_match(struct device *dev, void *res, void *data) return c == data; } +static int devm_clk_hw_match(struct device *dev, void *res, void *data) +{ + struct clk_hw *hw = res; + + if (WARN_ON(!hw)) + return 0; + return hw == data; +} + /** * devm_clk_unregister - resource managed clk_unregister() * @clk: clock to unregister @@ -2694,6 +2764,22 @@ void devm_clk_unregister(struct device *dev, struct clk *clk) } EXPORT_SYMBOL_GPL(devm_clk_unregister); +/** + * devm_clk_hw_unregister - resource managed clk_hw_unregister() + * @dev: device that is unregistering the hardware-specific clock data + * @hw: link to hardware-specific clock data + * + * Unregister a clk_hw registered with devm_clk_hw_register(). Normally + * this function will not need to be called and the resource management + * code will ensure that the resource is freed. + */ +void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw) +{ + WARN_ON(devres_release(dev, devm_clk_hw_release, devm_clk_hw_match, + hw)); +} +EXPORT_SYMBOL_GPL(devm_clk_hw_unregister); + /* * clkdev helpers */ diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index da95258127aa..bc6c8de1fac1 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -655,9 +655,15 @@ struct clk *clk_register_gpio_mux(struct device *dev, const char *name, struct clk *clk_register(struct device *dev, struct clk_hw *hw); struct clk *devm_clk_register(struct device *dev, struct clk_hw *hw); +int __must_check clk_hw_register(struct device *dev, struct clk_hw *hw); +int __must_check devm_clk_hw_register(struct device *dev, struct clk_hw *hw); + void clk_unregister(struct clk *clk); void devm_clk_unregister(struct device *dev, struct clk *clk); +void clk_hw_unregister(struct clk_hw *hw); +void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw); + /* helper functions */ const char *__clk_get_name(const struct clk *clk); const char *clk_hw_get_name(const struct clk_hw *hw); -- cgit v1.2.3 From 0861e5b8cf80038e91942f1005c8dfce79d18c38 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 5 Feb 2016 17:38:26 -0800 Subject: clk: Add clk_hw OF clk providers Now that we have a clk registration API that doesn't return struct clks, we need to have some way to hand out struct clks via the clk_get() APIs that doesn't involve associating struct clk pointers with an OF node. Currently we ask the OF provider to give us a struct clk pointer for some clkspec, turn that struct clk into a struct clk_hw and then allocate a new struct clk to return to the caller. Let's add a clk_hw based OF provider hook that returns a struct clk_hw directly, so that we skip the intermediate step of converting from struct clk to struct clk_hw. Eventually when we've converted all OF clk providers to struct clk_hw based APIs we can remove the struct clk based ones. It should also be noted that we change the onecell provider to have a flex array instead of a pointer for the array of clk_hw pointers. 
This allows providers to allocate one structure of the correct length in one step instead of two. Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 85 +++++++++++++++++++++++++++++++++++++++++--- include/linux/clk-provider.h | 30 ++++++++++++++++ 2 files changed, 111 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 0ef919666827..e813b2aabc87 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -2941,6 +2941,7 @@ struct of_clk_provider { struct device_node *node; struct clk *(*get)(struct of_phandle_args *clkspec, void *data); + struct clk_hw *(*get_hw)(struct of_phandle_args *clkspec, void *data); void *data; }; @@ -2957,6 +2958,12 @@ struct clk *of_clk_src_simple_get(struct of_phandle_args *clkspec, } EXPORT_SYMBOL_GPL(of_clk_src_simple_get); +struct clk_hw *of_clk_hw_simple_get(struct of_phandle_args *clkspec, void *data) +{ + return data; +} +EXPORT_SYMBOL_GPL(of_clk_hw_simple_get); + struct clk *of_clk_src_onecell_get(struct of_phandle_args *clkspec, void *data) { struct clk_onecell_data *clk_data = data; @@ -2971,6 +2978,21 @@ struct clk *of_clk_src_onecell_get(struct of_phandle_args *clkspec, void *data) } EXPORT_SYMBOL_GPL(of_clk_src_onecell_get); +struct clk_hw * +of_clk_hw_onecell_get(struct of_phandle_args *clkspec, void *data) +{ + struct clk_hw_onecell_data *hw_data = data; + unsigned int idx = clkspec->args[0]; + + if (idx >= hw_data->num) { + pr_err("%s: invalid index %u\n", __func__, idx); + return ERR_PTR(-EINVAL); + } + + return hw_data->hws[idx]; +} +EXPORT_SYMBOL_GPL(of_clk_hw_onecell_get); + /** * of_clk_add_provider() - Register a clock provider for a node * @np: Device node pointer associated with clock provider @@ -3006,6 +3028,41 @@ int of_clk_add_provider(struct device_node *np, } EXPORT_SYMBOL_GPL(of_clk_add_provider); +/** + * of_clk_add_hw_provider() - Register a clock provider for a node + * @np: Device node pointer associated with clock provider + * @get: callback for decoding clk_hw + * @data: context pointer for @get callback. 
+ */ +int of_clk_add_hw_provider(struct device_node *np, + struct clk_hw *(*get)(struct of_phandle_args *clkspec, + void *data), + void *data) +{ + struct of_clk_provider *cp; + int ret; + + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) + return -ENOMEM; + + cp->node = of_node_get(np); + cp->data = data; + cp->get_hw = get; + + mutex_lock(&of_clk_mutex); + list_add(&cp->link, &of_clk_providers); + mutex_unlock(&of_clk_mutex); + pr_debug("Added clk_hw provider from %s\n", np->full_name); + + ret = of_clk_set_defaults(np, true); + if (ret < 0) + of_clk_del_provider(np); + + return ret; +} +EXPORT_SYMBOL_GPL(of_clk_add_hw_provider); + /** * of_clk_del_provider() - Remove a previously registered clock provider * @np: Device node pointer associated with clock provider @@ -3027,11 +3084,32 @@ void of_clk_del_provider(struct device_node *np) } EXPORT_SYMBOL_GPL(of_clk_del_provider); +static struct clk_hw * +__of_clk_get_hw_from_provider(struct of_clk_provider *provider, + struct of_phandle_args *clkspec) +{ + struct clk *clk; + struct clk_hw *hw = ERR_PTR(-EPROBE_DEFER); + + if (provider->get_hw) { + hw = provider->get_hw(clkspec, provider->data); + } else if (provider->get) { + clk = provider->get(clkspec, provider->data); + if (!IS_ERR(clk)) + hw = __clk_get_hw(clk); + else + hw = ERR_CAST(clk); + } + + return hw; +} + struct clk *__of_clk_get_from_provider(struct of_phandle_args *clkspec, const char *dev_id, const char *con_id) { struct of_clk_provider *provider; struct clk *clk = ERR_PTR(-EPROBE_DEFER); + struct clk_hw *hw = ERR_PTR(-EPROBE_DEFER); if (!clkspec) return ERR_PTR(-EINVAL); @@ -3040,10 +3118,9 @@ struct clk *__of_clk_get_from_provider(struct of_phandle_args *clkspec, mutex_lock(&of_clk_mutex); list_for_each_entry(provider, &of_clk_providers, link) { if (provider->node == clkspec->np) - clk = provider->get(clkspec, provider->data); - if (!IS_ERR(clk)) { - clk = __clk_create_clk(__clk_get_hw(clk), dev_id, - con_id); + hw = __of_clk_get_hw_from_provider(provider, clkspec); + if (!IS_ERR(hw)) { + clk = __clk_create_clk(hw, dev_id, con_id); if (!IS_ERR(clk) && !__clk_get(clk)) { __clk_free_clk(clk); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index bc6c8de1fac1..bf8c8bb8c2cb 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -709,6 +709,11 @@ struct clk_onecell_data { unsigned int clk_num; }; +struct clk_hw_onecell_data { + size_t num; + struct clk_hw *hws[]; +}; + extern struct of_device_id __clk_of_table; #define CLK_OF_DECLARE(name, compat, fn) OF_DECLARE_1(clk, name, compat, fn) @@ -718,10 +723,18 @@ int of_clk_add_provider(struct device_node *np, struct clk *(*clk_src_get)(struct of_phandle_args *args, void *data), void *data); +int of_clk_add_hw_provider(struct device_node *np, + struct clk_hw *(*get)(struct of_phandle_args *clkspec, + void *data), + void *data); void of_clk_del_provider(struct device_node *np); struct clk *of_clk_src_simple_get(struct of_phandle_args *clkspec, void *data); +struct clk_hw *of_clk_hw_simple_get(struct of_phandle_args *clkspec, + void *data); struct clk *of_clk_src_onecell_get(struct of_phandle_args *clkspec, void *data); +struct clk_hw *of_clk_hw_onecell_get(struct of_phandle_args *clkspec, + void *data); unsigned int of_clk_get_parent_count(struct device_node *np); int of_clk_parent_fill(struct device_node *np, const char **parents, unsigned int size); @@ -738,17 +751,34 @@ static inline int of_clk_add_provider(struct device_node *np, { return 0; } +static inline int 
of_clk_add_hw_provider(struct device_node *np, + struct clk_hw *(*get)(struct of_phandle_args *clkspec, + void *data), + void *data) +{ + return 0; +} static inline void of_clk_del_provider(struct device_node *np) {} static inline struct clk *of_clk_src_simple_get( struct of_phandle_args *clkspec, void *data) { return ERR_PTR(-ENOENT); } +static inline struct clk_hw * +of_clk_hw_simple_get(struct of_phandle_args *clkspec, void *data) +{ + return ERR_PTR(-ENOENT); +} static inline struct clk *of_clk_src_onecell_get( struct of_phandle_args *clkspec, void *data) { return ERR_PTR(-ENOENT); } +static inline struct clk_hw * +of_clk_hw_onecell_get(struct of_phandle_args *clkspec, void *data) +{ + return ERR_PTR(-ENOENT); +} static inline int of_clk_get_parent_count(struct device_node *np) { return 0; -- cgit v1.2.3 From e4f1b49bda6d6aa2e13730ff7eeccbe65a6271f1 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 8 Feb 2016 14:59:49 -0800 Subject: clkdev: Add clk_hw based registration APIs Now that we have a clk registration API that doesn't return struct clks, we need to have some way to hand out struct clks via the clk_get() APIs that doesn't involve associating struct clk pointers with a struct clk_lookup. Luckily, clkdev already operates on struct clk_hw pointers, except for the registration facing APIs where it converts struct clk pointers into struct clk_hw pointers almost immediately. Let's add clk_hw based registration APIs so that we can skip the conversion step and provide a way for clk provider drivers to operate exclusively on clk_hw structs. This way we clearly split the API between consumers and providers. Cc: Russell King Signed-off-by: Stephen Boyd --- drivers/clk/clkdev.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/clkdev.h | 6 +++++ 2 files changed, 70 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c index ae8e40a82d34..89cc700fbc37 100644 --- a/drivers/clk/clkdev.c +++ b/drivers/clk/clkdev.c @@ -301,6 +301,20 @@ clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...) } EXPORT_SYMBOL(clkdev_alloc); +struct clk_lookup * +clkdev_hw_alloc(struct clk_hw *hw, const char *con_id, const char *dev_fmt, ...) +{ + struct clk_lookup *cl; + va_list ap; + + va_start(ap, dev_fmt); + cl = vclkdev_alloc(hw, con_id, dev_fmt, ap); + va_end(ap); + + return cl; +} +EXPORT_SYMBOL(clkdev_hw_alloc); + /** * clkdev_create - allocate and add a clkdev lookup structure * @clk: struct clk to associate with all clk_lookups @@ -324,6 +338,29 @@ struct clk_lookup *clkdev_create(struct clk *clk, const char *con_id, } EXPORT_SYMBOL_GPL(clkdev_create); +/** + * clkdev_hw_create - allocate and add a clkdev lookup structure + * @hw: struct clk_hw to associate with all clk_lookups + * @con_id: connection ID string on device + * @dev_fmt: format string describing device name + * + * Returns a clk_lookup structure, which can be later unregistered and + * freed. + */ +struct clk_lookup *clkdev_hw_create(struct clk_hw *hw, const char *con_id, + const char *dev_fmt, ...) +{ + struct clk_lookup *cl; + va_list ap; + + va_start(ap, dev_fmt); + cl = vclkdev_create(hw, con_id, dev_fmt, ap); + va_end(ap); + + return cl; +} +EXPORT_SYMBOL_GPL(clkdev_hw_create); + int clk_add_alias(const char *alias, const char *alias_dev_name, const char *con_id, struct device *dev) { @@ -402,3 +439,30 @@ int clk_register_clkdev(struct clk *clk, const char *con_id, return cl ? 
0 : -ENOMEM; } EXPORT_SYMBOL(clk_register_clkdev); + +/** + * clk_hw_register_clkdev - register one clock lookup for a struct clk_hw + * @hw: struct clk_hw to associate with all clk_lookups + * @con_id: connection ID string on device + * @dev_id: format string describing device name + * + * con_id or dev_id may be NULL as a wildcard, just as in the rest of + * clkdev. + */ +int clk_hw_register_clkdev(struct clk_hw *hw, const char *con_id, + const char *dev_id) +{ + struct clk_lookup *cl; + + /* + * Since dev_id can be NULL, and NULL is handled specially, we must + * pass it as either a NULL format string, or with "%s". + */ + if (dev_id) + cl = __clk_register_clkdev(hw, con_id, "%s", dev_id); + else + cl = __clk_register_clkdev(hw, con_id, NULL); + + return cl ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(clk_hw_register_clkdev); diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h index e6f8eb1d585f..2eabc862abdb 100644 --- a/include/linux/clkdev.h +++ b/include/linux/clkdev.h @@ -15,6 +15,7 @@ #include struct clk; +struct clk_hw; struct device; struct clk_lookup { @@ -34,17 +35,22 @@ struct clk_lookup { struct clk_lookup *clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...) __printf(3, 4); +struct clk_lookup *clkdev_hw_alloc(struct clk_hw *hw, const char *con_id, + const char *dev_fmt, ...) __printf(3, 4); void clkdev_add(struct clk_lookup *cl); void clkdev_drop(struct clk_lookup *cl); struct clk_lookup *clkdev_create(struct clk *clk, const char *con_id, const char *dev_fmt, ...) __printf(3, 4); +struct clk_lookup *clkdev_hw_create(struct clk_hw *hw, const char *con_id, + const char *dev_fmt, ...) __printf(3, 4); void clkdev_add_table(struct clk_lookup *, size_t); int clk_add_alias(const char *, const char *, const char *, struct device *); int clk_register_clkdev(struct clk *, const char *, const char *); +int clk_hw_register_clkdev(struct clk_hw *, const char *, const char *); #ifdef CONFIG_COMMON_CLK int __clk_get(struct clk *clk); -- cgit v1.2.3 From eb7d264f3bf9ca7c093efb77bdde557c6c6e826f Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sat, 6 Feb 2016 23:26:37 -0800 Subject: clk: divider: Add hw based registration APIs Add registration APIs in the clk divider code to return struct clk_hw pointers instead of struct clk pointers. This way we hide the struct clk pointer from providers unless they need to use consumer facing APIs. 
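As a sketch of the provider-side result (the clock names, register offset and field width are invented, and dev/base are an assumed device and mapped register region), a driver can now do:

	static DEFINE_SPINLOCK(div_lock);
	struct clk_hw *hw;

	/* 3-bit divider field at bits [2:0] of the register */
	hw = clk_hw_register_divider(dev, "sys_div", "sys_pll", 0,
				     base + 0x4, 0, 3, 0, &div_lock);
	if (IS_ERR(hw))
		return PTR_ERR(hw);
	/* ... and on removal ... */
	clk_hw_unregister_divider(hw);

without ever holding a struct clk.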
Signed-off-by: Stephen Boyd --- drivers/clk/clk-divider.c | 91 ++++++++++++++++++++++++++++++++++++++++---- include/linux/clk-provider.h | 10 +++++ 2 files changed, 93 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index 00e035b51c69..a0f55bc1ad3d 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -426,15 +426,16 @@ const struct clk_ops clk_divider_ro_ops = { }; EXPORT_SYMBOL_GPL(clk_divider_ro_ops); -static struct clk *_register_divider(struct device *dev, const char *name, +static struct clk_hw *_register_divider(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, const struct clk_div_table *table, spinlock_t *lock) { struct clk_divider *div; - struct clk *clk; + struct clk_hw *hw; struct clk_init_data init; + int ret; if (clk_divider_flags & CLK_DIVIDER_HIWORD_MASK) { if (width + shift > 16) { @@ -467,12 +468,14 @@ static struct clk *_register_divider(struct device *dev, const char *name, div->table = table; /* register the clock */ - clk = clk_register(dev, &div->hw); - - if (IS_ERR(clk)) + hw = &div->hw; + ret = clk_hw_register(dev, hw); + if (ret) { kfree(div); + hw = ERR_PTR(ret); + } - return clk; + return hw; } /** @@ -492,11 +495,38 @@ struct clk *clk_register_divider(struct device *dev, const char *name, void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, spinlock_t *lock) { - return _register_divider(dev, name, parent_name, flags, reg, shift, + struct clk_hw *hw; + + hw = _register_divider(dev, name, parent_name, flags, reg, shift, width, clk_divider_flags, NULL, lock); + if (IS_ERR(hw)) + return ERR_CAST(hw); + return hw->clk; } EXPORT_SYMBOL_GPL(clk_register_divider); +/** + * clk_hw_register_divider - register a divider clock with the clock framework + * @dev: device registering this clock + * @name: name of this clock + * @parent_name: name of clock's parent + * @flags: framework-specific flags + * @reg: register address to adjust divider + * @shift: number of bits to shift the bitfield + * @width: width of the bitfield + * @clk_divider_flags: divider-specific flags for this clock + * @lock: shared register lock for this clock + */ +struct clk_hw *clk_hw_register_divider(struct device *dev, const char *name, + const char *parent_name, unsigned long flags, + void __iomem *reg, u8 shift, u8 width, + u8 clk_divider_flags, spinlock_t *lock) +{ + return _register_divider(dev, name, parent_name, flags, reg, shift, + width, clk_divider_flags, NULL, lock); +} +EXPORT_SYMBOL_GPL(clk_hw_register_divider); + /** * clk_register_divider_table - register a table based divider clock with * the clock framework @@ -517,11 +547,41 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name, u8 clk_divider_flags, const struct clk_div_table *table, spinlock_t *lock) { - return _register_divider(dev, name, parent_name, flags, reg, shift, + struct clk_hw *hw; + + hw = _register_divider(dev, name, parent_name, flags, reg, shift, width, clk_divider_flags, table, lock); + if (IS_ERR(hw)) + return ERR_CAST(hw); + return hw->clk; } EXPORT_SYMBOL_GPL(clk_register_divider_table); +/** + * clk_hw_register_divider_table - register a table based divider clock with + * the clock framework + * @dev: device registering this clock + * @name: name of this clock + * @parent_name: name of clock's parent + * @flags: framework-specific flags + * @reg: register address to adjust divider + * 
@shift: number of bits to shift the bitfield + * @width: width of the bitfield + * @clk_divider_flags: divider-specific flags for this clock + * @table: array of divider/value pairs ending with a div set to 0 + * @lock: shared register lock for this clock + */ +struct clk_hw *clk_hw_register_divider_table(struct device *dev, + const char *name, const char *parent_name, unsigned long flags, + void __iomem *reg, u8 shift, u8 width, + u8 clk_divider_flags, const struct clk_div_table *table, + spinlock_t *lock) +{ + return _register_divider(dev, name, parent_name, flags, reg, shift, + width, clk_divider_flags, table, lock); +} +EXPORT_SYMBOL_GPL(clk_hw_register_divider_table); + void clk_unregister_divider(struct clk *clk) { struct clk_divider *div; @@ -537,3 +597,18 @@ void clk_unregister_divider(struct clk *clk) kfree(div); } EXPORT_SYMBOL_GPL(clk_unregister_divider); + +/** + * clk_hw_unregister_divider - unregister a clk divider + * @hw: hardware-specific clock data to unregister + */ +void clk_hw_unregister_divider(struct clk_hw *hw) +{ + struct clk_divider *div; + + div = to_clk_divider(hw); + + clk_hw_unregister(hw); + kfree(div); +} +EXPORT_SYMBOL_GPL(clk_hw_unregister_divider); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index bf8c8bb8c2cb..8885d0350596 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -407,12 +407,22 @@ struct clk *clk_register_divider(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, spinlock_t *lock); +struct clk_hw *clk_hw_register_divider(struct device *dev, const char *name, + const char *parent_name, unsigned long flags, + void __iomem *reg, u8 shift, u8 width, + u8 clk_divider_flags, spinlock_t *lock); struct clk *clk_register_divider_table(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, const struct clk_div_table *table, spinlock_t *lock); +struct clk_hw *clk_hw_register_divider_table(struct device *dev, + const char *name, const char *parent_name, unsigned long flags, + void __iomem *reg, u8 shift, u8 width, + u8 clk_divider_flags, const struct clk_div_table *table, + spinlock_t *lock); void clk_unregister_divider(struct clk *clk); +void clk_hw_unregister_divider(struct clk_hw *hw); /** * struct clk_mux - multiplexer clock -- cgit v1.2.3 From e270d8cb13763f58107198e879cf396511ba2867 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sat, 6 Feb 2016 23:54:45 -0800 Subject: clk: gate: Add hw based registration APIs Add registration APIs in the clk gate code to return struct clk_hw pointers instead of struct clk pointers. This way we hide the struct clk pointer from providers unless they need to use consumer facing APIs. 
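The provider-side usage then looks like this sketch (names, register offset and bit position are illustrative, with dev/base an assumed device and register mapping):

	static DEFINE_SPINLOCK(gate_lock);
	struct clk_hw *hw;

	/* gate controlled by bit 5 of the register */
	hw = clk_hw_register_gate(dev, "uart_gate", "periph_clk", 0,
				  base + 0x8, 5, 0, &gate_lock);
	if (IS_ERR(hw))
		return PTR_ERR(hw);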
Signed-off-by: Stephen Boyd --- drivers/clk/clk-gate.c | 43 ++++++++++++++++++++++++++++++++++++------- include/linux/clk-provider.h | 5 +++++ 2 files changed, 41 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-gate.c b/drivers/clk/clk-gate.c index d0d8ec8e1f1b..4e691e35483a 100644 --- a/drivers/clk/clk-gate.c +++ b/drivers/clk/clk-gate.c @@ -110,7 +110,7 @@ const struct clk_ops clk_gate_ops = { EXPORT_SYMBOL_GPL(clk_gate_ops); /** - * clk_register_gate - register a gate clock with the clock framework + * clk_hw_register_gate - register a gate clock with the clock framework * @dev: device that is registering this clock * @name: name of this clock * @parent_name: name of this clock's parent @@ -120,14 +120,15 @@ EXPORT_SYMBOL_GPL(clk_gate_ops); * @clk_gate_flags: gate-specific flags for this clock * @lock: shared register lock for this clock */ -struct clk *clk_register_gate(struct device *dev, const char *name, +struct clk_hw *clk_hw_register_gate(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 bit_idx, u8 clk_gate_flags, spinlock_t *lock) { struct clk_gate *gate; - struct clk *clk; + struct clk_hw *hw; struct clk_init_data init; + int ret; if (clk_gate_flags & CLK_GATE_HIWORD_MASK) { if (bit_idx > 15) { @@ -154,12 +155,29 @@ struct clk *clk_register_gate(struct device *dev, const char *name, gate->lock = lock; gate->hw.init = &init; - clk = clk_register(dev, &gate->hw); - - if (IS_ERR(clk)) + hw = &gate->hw; + ret = clk_hw_register(dev, hw); + if (ret) { kfree(gate); + hw = ERR_PTR(ret); + } - return clk; + return hw; +} +EXPORT_SYMBOL_GPL(clk_hw_register_gate); + +struct clk *clk_register_gate(struct device *dev, const char *name, + const char *parent_name, unsigned long flags, + void __iomem *reg, u8 bit_idx, + u8 clk_gate_flags, spinlock_t *lock) +{ + struct clk_hw *hw; + + hw = clk_hw_register_gate(dev, name, parent_name, flags, reg, + bit_idx, clk_gate_flags, lock); + if (IS_ERR(hw)) + return ERR_CAST(hw); + return hw->clk; } EXPORT_SYMBOL_GPL(clk_register_gate); @@ -178,3 +196,14 @@ void clk_unregister_gate(struct clk *clk) kfree(gate); } EXPORT_SYMBOL_GPL(clk_unregister_gate); + +void clk_hw_unregister_gate(struct clk_hw *hw) +{ + struct clk_gate *gate; + + gate = to_clk_gate(hw); + + clk_hw_unregister(hw); + kfree(gate); +} +EXPORT_SYMBOL_GPL(clk_hw_unregister_gate); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 8885d0350596..bf12050aadd5 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -326,7 +326,12 @@ struct clk *clk_register_gate(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 bit_idx, u8 clk_gate_flags, spinlock_t *lock); +struct clk_hw *clk_hw_register_gate(struct device *dev, const char *name, + const char *parent_name, unsigned long flags, + void __iomem *reg, u8 bit_idx, + u8 clk_gate_flags, spinlock_t *lock); void clk_unregister_gate(struct clk *clk); +void clk_hw_unregister_gate(struct clk_hw *hw); struct clk_div_table { unsigned int val; -- cgit v1.2.3 From 264b31719735eb1fcbed47cecdb20f517e804856 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sun, 7 Feb 2016 00:05:48 -0800 Subject: clk: mux: Add hw based registration APIs Add registration APIs in the clk mux code to return struct clk_hw pointers instead of struct clk pointers. This way we hide the struct clk pointer from providers unless they need to use consumer facing APIs. 
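For illustration only (not part of this patch): a hypothetical provider registering a three-parent mux with the new API. The parent names, register offset, and field position are made up; only clk_hw_register_mux() and clk_hw_unregister_mux() come from this series.

#include <linux/clk-provider.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(foo_mux_lock);
static const char * const foo_mux_parents[] = { "xo", "pll_a", "pll_b" };
static struct clk_hw *foo_mux_hw;

static int foo_register_mux(struct device *dev, void __iomem *base)
{
	/* 2-bit parent-select field at bits [3:2] of base + 0x08 */
	foo_mux_hw = clk_hw_register_mux(dev, "foo_mux", foo_mux_parents,
					 ARRAY_SIZE(foo_mux_parents), 0,
					 base + 0x08, 2, 2, 0,
					 &foo_mux_lock);
	return PTR_ERR_OR_ZERO(foo_mux_hw);
}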
Signed-off-by: Stephen Boyd --- drivers/clk/clk-mux.c | 57 +++++++++++++++++++++++++++++++++++++++----- include/linux/clk-provider.h | 11 +++++++++ 2 files changed, 62 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-mux.c b/drivers/clk/clk-mux.c index 252188fd8bcd..16a3d5717f4e 100644 --- a/drivers/clk/clk-mux.c +++ b/drivers/clk/clk-mux.c @@ -113,16 +113,17 @@ const struct clk_ops clk_mux_ro_ops = { }; EXPORT_SYMBOL_GPL(clk_mux_ro_ops); -struct clk *clk_register_mux_table(struct device *dev, const char *name, +struct clk_hw *clk_hw_register_mux_table(struct device *dev, const char *name, const char * const *parent_names, u8 num_parents, unsigned long flags, void __iomem *reg, u8 shift, u32 mask, u8 clk_mux_flags, u32 *table, spinlock_t *lock) { struct clk_mux *mux; - struct clk *clk; + struct clk_hw *hw; struct clk_init_data init; u8 width = 0; + int ret; if (clk_mux_flags & CLK_MUX_HIWORD_MASK) { width = fls(mask) - ffs(mask) + 1; @@ -157,12 +158,31 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name, mux->table = table; mux->hw.init = &init; - clk = clk_register(dev, &mux->hw); - - if (IS_ERR(clk)) + hw = &mux->hw; + ret = clk_hw_register(dev, hw); + if (ret) { kfree(mux); + hw = ERR_PTR(ret); + } - return clk; + return hw; +} +EXPORT_SYMBOL_GPL(clk_hw_register_mux_table); + +struct clk *clk_register_mux_table(struct device *dev, const char *name, + const char * const *parent_names, u8 num_parents, + unsigned long flags, + void __iomem *reg, u8 shift, u32 mask, + u8 clk_mux_flags, u32 *table, spinlock_t *lock) +{ + struct clk_hw *hw; + + hw = clk_hw_register_mux_table(dev, name, parent_names, num_parents, + flags, reg, shift, mask, clk_mux_flags, + table, lock); + if (IS_ERR(hw)) + return ERR_CAST(hw); + return hw->clk; } EXPORT_SYMBOL_GPL(clk_register_mux_table); @@ -180,6 +200,20 @@ struct clk *clk_register_mux(struct device *dev, const char *name, } EXPORT_SYMBOL_GPL(clk_register_mux); +struct clk_hw *clk_hw_register_mux(struct device *dev, const char *name, + const char * const *parent_names, u8 num_parents, + unsigned long flags, + void __iomem *reg, u8 shift, u8 width, + u8 clk_mux_flags, spinlock_t *lock) +{ + u32 mask = BIT(width) - 1; + + return clk_hw_register_mux_table(dev, name, parent_names, num_parents, + flags, reg, shift, mask, clk_mux_flags, + NULL, lock); +} +EXPORT_SYMBOL_GPL(clk_hw_register_mux); + void clk_unregister_mux(struct clk *clk) { struct clk_mux *mux; @@ -195,3 +229,14 @@ void clk_unregister_mux(struct clk *clk) kfree(mux); } EXPORT_SYMBOL_GPL(clk_unregister_mux); + +void clk_hw_unregister_mux(struct clk_hw *hw) +{ + struct clk_mux *mux; + + mux = to_clk_mux(hw); + + clk_hw_unregister(hw); + kfree(mux); +} +EXPORT_SYMBOL_GPL(clk_hw_unregister_mux); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index bf12050aadd5..d690d99b9c1c 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -478,14 +478,25 @@ struct clk *clk_register_mux(struct device *dev, const char *name, unsigned long flags, void __iomem *reg, u8 shift, u8 width, u8 clk_mux_flags, spinlock_t *lock); +struct clk_hw *clk_hw_register_mux(struct device *dev, const char *name, + const char * const *parent_names, u8 num_parents, + unsigned long flags, + void __iomem *reg, u8 shift, u8 width, + u8 clk_mux_flags, spinlock_t *lock); struct clk *clk_register_mux_table(struct device *dev, const char *name, const char * const *parent_names, u8 num_parents, unsigned long flags, void __iomem *reg, 
u8 shift, u32 mask, u8 clk_mux_flags, u32 *table, spinlock_t *lock); +struct clk_hw *clk_hw_register_mux_table(struct device *dev, const char *name, + const char * const *parent_names, u8 num_parents, + unsigned long flags, + void __iomem *reg, u8 shift, u32 mask, + u8 clk_mux_flags, u32 *table, spinlock_t *lock); void clk_unregister_mux(struct clk *clk); +void clk_hw_unregister_mux(struct clk_hw *hw); void of_fixed_factor_clk_setup(struct device_node *node); -- cgit v1.2.3 From 0759ac8a73dc2c8cc8ac697fbe5dbd8d67348d37 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sun, 7 Feb 2016 00:11:06 -0800 Subject: clk: fixed-factor: Add hw based registration APIs Add registration APIs in the clk fixed-factor code to return struct clk_hw pointers instead of struct clk pointers. This way we hide the struct clk pointer from providers unless they need to use consumer facing APIs. Signed-off-by: Stephen Boyd --- drivers/clk/clk-fixed-factor.c | 42 +++++++++++++++++++++++++++++++++++------- include/linux/clk-provider.h | 4 ++++ 2 files changed, 39 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-fixed-factor.c b/drivers/clk/clk-fixed-factor.c index 053448e2453d..75cd6c792cb8 100644 --- a/drivers/clk/clk-fixed-factor.c +++ b/drivers/clk/clk-fixed-factor.c @@ -68,13 +68,14 @@ const struct clk_ops clk_fixed_factor_ops = { }; EXPORT_SYMBOL_GPL(clk_fixed_factor_ops); -struct clk *clk_register_fixed_factor(struct device *dev, const char *name, - const char *parent_name, unsigned long flags, +struct clk_hw *clk_hw_register_fixed_factor(struct device *dev, + const char *name, const char *parent_name, unsigned long flags, unsigned int mult, unsigned int div) { struct clk_fixed_factor *fix; struct clk_init_data init; - struct clk *clk; + struct clk_hw *hw; + int ret; fix = kmalloc(sizeof(*fix), GFP_KERNEL); if (!fix) @@ -91,12 +92,28 @@ struct clk *clk_register_fixed_factor(struct device *dev, const char *name, init.parent_names = &parent_name; init.num_parents = 1; - clk = clk_register(dev, &fix->hw); - - if (IS_ERR(clk)) + hw = &fix->hw; + ret = clk_hw_register(dev, hw); + if (ret) { kfree(fix); + hw = ERR_PTR(ret); + } + + return hw; +} +EXPORT_SYMBOL_GPL(clk_hw_register_fixed_factor); + +struct clk *clk_register_fixed_factor(struct device *dev, const char *name, + const char *parent_name, unsigned long flags, + unsigned int mult, unsigned int div) +{ + struct clk_hw *hw; - return clk; + hw = clk_hw_register_fixed_factor(dev, name, parent_name, flags, mult, + div); + if (IS_ERR(hw)) + return ERR_CAST(hw); + return hw->clk; } EXPORT_SYMBOL_GPL(clk_register_fixed_factor); @@ -113,6 +130,17 @@ void clk_unregister_fixed_factor(struct clk *clk) } EXPORT_SYMBOL_GPL(clk_unregister_fixed_factor); +void clk_hw_unregister_fixed_factor(struct clk_hw *hw) +{ + struct clk_fixed_factor *fix; + + fix = to_clk_fixed_factor(hw); + + clk_hw_unregister(hw); + kfree(fix); +} +EXPORT_SYMBOL_GPL(clk_hw_unregister_fixed_factor); + #ifdef CONFIG_OF /** * of_fixed_factor_clk_setup() - Setup function for simple fixed factor clock diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index d690d99b9c1c..79ad1a8a6831 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -525,6 +525,10 @@ struct clk *clk_register_fixed_factor(struct device *dev, const char *name, const char *parent_name, unsigned long flags, unsigned int mult, unsigned int div); void clk_unregister_fixed_factor(struct clk *clk); +struct clk_hw *clk_hw_register_fixed_factor(struct device 
*dev, + const char *name, const char *parent_name, unsigned long flags, + unsigned int mult, unsigned int div); +void clk_hw_unregister_fixed_factor(struct clk_hw *hw); /** * struct clk_fractional_divider - adjustable fractional divider clock -- cgit v1.2.3 From 39b44cff4ad4af6d7abd9dd2acb288b005c26503 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sun, 7 Feb 2016 00:15:09 -0800 Subject: clk: fractional-divider: Add hw based registration APIs Add registration APIs in the clk fractional divider code to return struct clk_hw pointers instead of struct clk pointers. This way we hide the struct clk pointer from providers unless they need to use consumer facing APIs. Signed-off-by: Stephen Boyd --- drivers/clk/clk-fractional-divider.c | 40 +++++++++++++++++++++++++++++++----- include/linux/clk-provider.h | 5 +++++ 2 files changed, 40 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-fractional-divider.c b/drivers/clk/clk-fractional-divider.c index 1abcd76b4993..aab904618eb6 100644 --- a/drivers/clk/clk-fractional-divider.c +++ b/drivers/clk/clk-fractional-divider.c @@ -116,14 +116,15 @@ const struct clk_ops clk_fractional_divider_ops = { }; EXPORT_SYMBOL_GPL(clk_fractional_divider_ops); -struct clk *clk_register_fractional_divider(struct device *dev, +struct clk_hw *clk_hw_register_fractional_divider(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 mshift, u8 mwidth, u8 nshift, u8 nwidth, u8 clk_divider_flags, spinlock_t *lock) { struct clk_fractional_divider *fd; struct clk_init_data init; - struct clk *clk; + struct clk_hw *hw; + int ret; fd = kzalloc(sizeof(*fd), GFP_KERNEL); if (!fd) @@ -146,10 +147,39 @@ struct clk *clk_register_fractional_divider(struct device *dev, fd->lock = lock; fd->hw.init = &init; - clk = clk_register(dev, &fd->hw); - if (IS_ERR(clk)) + hw = &fd->hw; + ret = clk_hw_register(dev, hw); + if (ret) { kfree(fd); + hw = ERR_PTR(ret); + } + + return hw; +} +EXPORT_SYMBOL_GPL(clk_hw_register_fractional_divider); - return clk; +struct clk *clk_register_fractional_divider(struct device *dev, + const char *name, const char *parent_name, unsigned long flags, + void __iomem *reg, u8 mshift, u8 mwidth, u8 nshift, u8 nwidth, + u8 clk_divider_flags, spinlock_t *lock) +{ + struct clk_hw *hw; + + hw = clk_hw_register_fractional_divider(dev, name, parent_name, flags, + reg, mshift, mwidth, nshift, nwidth, clk_divider_flags, + lock); + if (IS_ERR(hw)) + return ERR_CAST(hw); + return hw->clk; } EXPORT_SYMBOL_GPL(clk_register_fractional_divider); + +void clk_hw_unregister_fractional_divider(struct clk_hw *hw) +{ + struct clk_fractional_divider *fd; + + fd = to_clk_fd(hw); + + clk_hw_unregister(hw); + kfree(fd); +} diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 79ad1a8a6831..bcbaf6c95d52 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -563,6 +563,11 @@ struct clk *clk_register_fractional_divider(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 mshift, u8 mwidth, u8 nshift, u8 nwidth, u8 clk_divider_flags, spinlock_t *lock); +struct clk_hw *clk_hw_register_fractional_divider(struct device *dev, + const char *name, const char *parent_name, unsigned long flags, + void __iomem *reg, u8 mshift, u8 mwidth, u8 nshift, u8 nwidth, + u8 clk_divider_flags, spinlock_t *lock); +void clk_hw_unregister_fractional_divider(struct clk_hw *hw); /** * struct clk_multiplier - adjustable 
multiplier clock -- cgit v1.2.3 From 49cb392d36397a296dcd51ec57cf83585a89a94a Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sun, 7 Feb 2016 00:20:31 -0800 Subject: clk: composite: Add hw based registration APIs Add registration APIs in the clk composite code to return struct clk_hw pointers instead of struct clk pointers. This way we hide the struct clk pointer from providers unless they need to use consumer facing APIs. Signed-off-by: Stephen Boyd --- drivers/clk/clk-composite.c | 45 ++++++++++++++++++++++++++++++++------------ include/linux/clk-provider.h | 7 +++++++ 2 files changed, 40 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c index 1f903e1f86a2..463fadd5a68f 100644 --- a/drivers/clk/clk-composite.c +++ b/drivers/clk/clk-composite.c @@ -184,17 +184,18 @@ static void clk_composite_disable(struct clk_hw *hw) gate_ops->disable(gate_hw); } -struct clk *clk_register_composite(struct device *dev, const char *name, +struct clk_hw *clk_hw_register_composite(struct device *dev, const char *name, const char * const *parent_names, int num_parents, struct clk_hw *mux_hw, const struct clk_ops *mux_ops, struct clk_hw *rate_hw, const struct clk_ops *rate_ops, struct clk_hw *gate_hw, const struct clk_ops *gate_ops, unsigned long flags) { - struct clk *clk; + struct clk_hw *hw; struct clk_init_data init; struct clk_composite *composite; struct clk_ops *clk_composite_ops; + int ret; composite = kzalloc(sizeof(*composite), GFP_KERNEL); if (!composite) @@ -204,12 +205,13 @@ struct clk *clk_register_composite(struct device *dev, const char *name, init.flags = flags | CLK_IS_BASIC; init.parent_names = parent_names; init.num_parents = num_parents; + hw = &composite->hw; clk_composite_ops = &composite->ops; if (mux_hw && mux_ops) { if (!mux_ops->get_parent) { - clk = ERR_PTR(-EINVAL); + hw = ERR_PTR(-EINVAL); goto err; } @@ -224,7 +226,7 @@ struct clk *clk_register_composite(struct device *dev, const char *name, if (rate_hw && rate_ops) { if (!rate_ops->recalc_rate) { - clk = ERR_PTR(-EINVAL); + hw = ERR_PTR(-EINVAL); goto err; } clk_composite_ops->recalc_rate = clk_composite_recalc_rate; @@ -253,7 +255,7 @@ struct clk *clk_register_composite(struct device *dev, const char *name, if (gate_hw && gate_ops) { if (!gate_ops->is_enabled || !gate_ops->enable || !gate_ops->disable) { - clk = ERR_PTR(-EINVAL); + hw = ERR_PTR(-EINVAL); goto err; } @@ -267,22 +269,41 @@ struct clk *clk_register_composite(struct device *dev, const char *name, init.ops = clk_composite_ops; composite->hw.init = &init; - clk = clk_register(dev, &composite->hw); - if (IS_ERR(clk)) + ret = clk_hw_register(dev, hw); + if (ret) { + hw = ERR_PTR(ret); goto err; + } if (composite->mux_hw) - composite->mux_hw->clk = clk; + composite->mux_hw->clk = hw->clk; if (composite->rate_hw) - composite->rate_hw->clk = clk; + composite->rate_hw->clk = hw->clk; if (composite->gate_hw) - composite->gate_hw->clk = clk; + composite->gate_hw->clk = hw->clk; - return clk; + return hw; err: kfree(composite); - return clk; + return hw; +} + +struct clk *clk_register_composite(struct device *dev, const char *name, + const char * const *parent_names, int num_parents, + struct clk_hw *mux_hw, const struct clk_ops *mux_ops, + struct clk_hw *rate_hw, const struct clk_ops *rate_ops, + struct clk_hw *gate_hw, const struct clk_ops *gate_ops, + unsigned long flags) +{ + struct clk_hw *hw; + + hw = clk_hw_register_composite(dev, name, parent_names, num_parents, + mux_hw, mux_ops, 
rate_hw, rate_ops, gate_hw, gate_ops, + flags); + if (IS_ERR(hw)) + return ERR_CAST(hw); + return hw->clk; } diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index bcbaf6c95d52..456c3ced1ac9 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -638,6 +638,13 @@ struct clk *clk_register_composite(struct device *dev, const char *name, struct clk_hw *rate_hw, const struct clk_ops *rate_ops, struct clk_hw *gate_hw, const struct clk_ops *gate_ops, unsigned long flags); +struct clk_hw *clk_hw_register_composite(struct device *dev, const char *name, + const char * const *parent_names, int num_parents, + struct clk_hw *mux_hw, const struct clk_ops *mux_ops, + struct clk_hw *rate_hw, const struct clk_ops *rate_ops, + struct clk_hw *gate_hw, const struct clk_ops *gate_ops, + unsigned long flags); +void clk_hw_unregister_composite(struct clk_hw *hw); /*** * struct clk_gpio_gate - gpio gated clock -- cgit v1.2.3 From b120743a64a3ec68b8c5310a6009094329b4a33b Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sun, 7 Feb 2016 00:27:55 -0800 Subject: clk: gpio: Add hw based registration APIs Add registration APIs in the clk gpio code to return struct clk_hw pointers instead of struct clk pointers. This way we hide the struct clk pointer from providers unless they need to use consumer facing APIs. Signed-off-by: Stephen Boyd --- drivers/clk/clk-gpio.c | 52 ++++++++++++++++++++++++++++++++++---------- include/linux/clk-provider.h | 8 +++++++ 2 files changed, 49 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-gpio.c b/drivers/clk/clk-gpio.c index 08f65acc5d57..86b245746a6b 100644 --- a/drivers/clk/clk-gpio.c +++ b/drivers/clk/clk-gpio.c @@ -94,13 +94,13 @@ const struct clk_ops clk_gpio_mux_ops = { }; EXPORT_SYMBOL_GPL(clk_gpio_mux_ops); -static struct clk *clk_register_gpio(struct device *dev, const char *name, +static struct clk_hw *clk_register_gpio(struct device *dev, const char *name, const char * const *parent_names, u8 num_parents, unsigned gpio, bool active_low, unsigned long flags, const struct clk_ops *clk_gpio_ops) { struct clk_gpio *clk_gpio; - struct clk *clk; + struct clk_hw *hw; struct clk_init_data init = {}; unsigned long gpio_flags; int err; @@ -141,24 +141,26 @@ static struct clk *clk_register_gpio(struct device *dev, const char *name, clk_gpio->gpiod = gpio_to_desc(gpio); clk_gpio->hw.init = &init; + hw = &clk_gpio->hw; if (dev) - clk = devm_clk_register(dev, &clk_gpio->hw); + err = devm_clk_hw_register(dev, hw); else - clk = clk_register(NULL, &clk_gpio->hw); + err = clk_hw_register(NULL, hw); - if (!IS_ERR(clk)) - return clk; + if (!err) + return hw; if (!dev) { gpiod_put(clk_gpio->gpiod); kfree(clk_gpio); } - return clk; + return ERR_PTR(err); } /** - * clk_register_gpio_gate - register a gpio clock gate with the clock framework + * clk_hw_register_gpio_gate - register a gpio clock gate with the clock + * framework * @dev: device that is registering this clock * @name: name of this clock * @parent_name: name of this clock's parent @@ -166,7 +168,7 @@ static struct clk *clk_register_gpio(struct device *dev, const char *name, * @active_low: true if gpio should be set to 0 to enable clock * @flags: clock flags */ -struct clk *clk_register_gpio_gate(struct device *dev, const char *name, +struct clk_hw *clk_hw_register_gpio_gate(struct device *dev, const char *name, const char *parent_name, unsigned gpio, bool active_low, unsigned long flags) { @@ -175,10 +177,24 @@ struct clk 
*clk_register_gpio_gate(struct device *dev, const char *name, (parent_name ? 1 : 0), gpio, active_low, flags, &clk_gpio_gate_ops); } +EXPORT_SYMBOL_GPL(clk_hw_register_gpio_gate); + +struct clk *clk_register_gpio_gate(struct device *dev, const char *name, + const char *parent_name, unsigned gpio, bool active_low, + unsigned long flags) +{ + struct clk_hw *hw; + + hw = clk_hw_register_gpio_gate(dev, name, parent_name, gpio, active_low, + flags); + if (IS_ERR(hw)) + return ERR_CAST(hw); + return hw->clk; +} EXPORT_SYMBOL_GPL(clk_register_gpio_gate); /** - * clk_register_gpio_mux - register a gpio clock mux with the clock framework + * clk_hw_register_gpio_mux - register a gpio clock mux with the clock framework * @dev: device that is registering this clock * @name: name of this clock * @parent_names: names of this clock's parents @@ -187,7 +203,7 @@ EXPORT_SYMBOL_GPL(clk_register_gpio_gate); * @active_low: true if gpio should be set to 0 to enable clock * @flags: clock flags */ -struct clk *clk_register_gpio_mux(struct device *dev, const char *name, +struct clk_hw *clk_hw_register_gpio_mux(struct device *dev, const char *name, const char * const *parent_names, u8 num_parents, unsigned gpio, bool active_low, unsigned long flags) { @@ -199,6 +215,20 @@ struct clk *clk_register_gpio_mux(struct device *dev, const char *name, return clk_register_gpio(dev, name, parent_names, num_parents, gpio, active_low, flags, &clk_gpio_mux_ops); } +EXPORT_SYMBOL_GPL(clk_hw_register_gpio_mux); + +struct clk *clk_register_gpio_mux(struct device *dev, const char *name, + const char * const *parent_names, u8 num_parents, unsigned gpio, + bool active_low, unsigned long flags) +{ + struct clk_hw *hw; + + hw = clk_hw_register_gpio_mux(dev, name, parent_names, num_parents, + gpio, active_low, flags); + if (IS_ERR(hw)) + return ERR_CAST(hw); + return hw->clk; +} EXPORT_SYMBOL_GPL(clk_register_gpio_mux); static int gpio_clk_driver_probe(struct platform_device *pdev) diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 456c3ced1ac9..6c36c5e8ccbe 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -667,6 +667,10 @@ extern const struct clk_ops clk_gpio_gate_ops; struct clk *clk_register_gpio_gate(struct device *dev, const char *name, const char *parent_name, unsigned gpio, bool active_low, unsigned long flags); +struct clk_hw *clk_hw_register_gpio_gate(struct device *dev, const char *name, + const char *parent_name, unsigned gpio, bool active_low, + unsigned long flags); +void clk_hw_unregister_gpio_gate(struct clk_hw *hw); /** * struct clk_gpio_mux - gpio controlled clock multiplexer @@ -682,6 +686,10 @@ extern const struct clk_ops clk_gpio_mux_ops; struct clk *clk_register_gpio_mux(struct device *dev, const char *name, const char * const *parent_names, u8 num_parents, unsigned gpio, bool active_low, unsigned long flags); +struct clk_hw *clk_hw_register_gpio_mux(struct device *dev, const char *name, + const char * const *parent_names, u8 num_parents, unsigned gpio, + bool active_low, unsigned long flags); +void clk_hw_unregister_gpio_mux(struct clk_hw *hw); /** * clk_register - allocate a new clock, register it and return an opaque cookie -- cgit v1.2.3 From 26ef56be9e0944a9b136169eb47140f309ce745b Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sun, 7 Feb 2016 00:34:13 -0800 Subject: clk: fixed-rate: Add hw based registration APIs Add registration APIs in the clk fixed-rate code to return struct clk_hw pointers instead of struct clk pointers. 
This way we hide the struct clk pointer from providers unless they need to use consumer facing APIs. Signed-off-by: Stephen Boyd --- drivers/clk/clk-fixed-rate.c | 44 ++++++++++++++++++++++++++++++++++++-------- include/linux/clk-provider.h | 7 +++++++ 2 files changed, 43 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-fixed-rate.c b/drivers/clk/clk-fixed-rate.c index cd9dc925b3f8..8e4453eb54e8 100644 --- a/drivers/clk/clk-fixed-rate.c +++ b/drivers/clk/clk-fixed-rate.c @@ -45,8 +45,8 @@ const struct clk_ops clk_fixed_rate_ops = { EXPORT_SYMBOL_GPL(clk_fixed_rate_ops); /** - * clk_register_fixed_rate_with_accuracy - register fixed-rate clock with the - * clock framework + * clk_hw_register_fixed_rate_with_accuracy - register fixed-rate clock with + * the clock framework * @dev: device that is registering this clock * @name: name of this clock * @parent_name: name of clock's parent @@ -54,13 +54,14 @@ EXPORT_SYMBOL_GPL(clk_fixed_rate_ops); * @fixed_rate: non-adjustable clock rate * @fixed_accuracy: non-adjustable clock rate */ -struct clk *clk_register_fixed_rate_with_accuracy(struct device *dev, +struct clk_hw *clk_hw_register_fixed_rate_with_accuracy(struct device *dev, const char *name, const char *parent_name, unsigned long flags, unsigned long fixed_rate, unsigned long fixed_accuracy) { struct clk_fixed_rate *fixed; - struct clk *clk; + struct clk_hw *hw; struct clk_init_data init; + int ret; /* allocate fixed-rate clock */ fixed = kzalloc(sizeof(*fixed), GFP_KERNEL); @@ -79,22 +80,49 @@ struct clk *clk_register_fixed_rate_with_accuracy(struct device *dev, fixed->hw.init = &init; /* register the clock */ - clk = clk_register(dev, &fixed->hw); - if (IS_ERR(clk)) + hw = &fixed->hw; + ret = clk_hw_register(dev, hw); + if (ret) { kfree(fixed); + hw = ERR_PTR(ret); + } - return clk; + return hw; +} +EXPORT_SYMBOL_GPL(clk_hw_register_fixed_rate_with_accuracy); + +struct clk *clk_register_fixed_rate_with_accuracy(struct device *dev, + const char *name, const char *parent_name, unsigned long flags, + unsigned long fixed_rate, unsigned long fixed_accuracy) +{ + struct clk_hw *hw; + + hw = clk_hw_register_fixed_rate_with_accuracy(dev, name, parent_name, + flags, fixed_rate, fixed_accuracy); + if (IS_ERR(hw)) + return ERR_CAST(hw); + return hw->clk; } EXPORT_SYMBOL_GPL(clk_register_fixed_rate_with_accuracy); /** - * clk_register_fixed_rate - register fixed-rate clock with the clock framework + * clk_hw_register_fixed_rate - register fixed-rate clock with the clock + * framework * @dev: device that is registering this clock * @name: name of this clock * @parent_name: name of clock's parent * @flags: framework-specific flags * @fixed_rate: non-adjustable clock rate */ +struct clk_hw *clk_hw_register_fixed_rate(struct device *dev, const char *name, + const char *parent_name, unsigned long flags, + unsigned long fixed_rate) +{ + return clk_hw_register_fixed_rate_with_accuracy(dev, name, parent_name, + flags, fixed_rate, 0); +} +EXPORT_SYMBOL_GPL(clk_hw_register_fixed_rate); + struct clk *clk_register_fixed_rate(struct device *dev, const char *name, const char *parent_name, unsigned long flags, unsigned long fixed_rate) diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 6c36c5e8ccbe..c3fc042d517c 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -282,10 +282,17 @@ extern const struct clk_ops clk_fixed_rate_ops; struct clk *clk_register_fixed_rate(struct device *dev, const char *name, const char 
*parent_name, unsigned long flags, unsigned long fixed_rate); +struct clk_hw *clk_hw_register_fixed_rate(struct device *dev, const char *name, + const char *parent_name, unsigned long flags, + unsigned long fixed_rate); struct clk *clk_register_fixed_rate_with_accuracy(struct device *dev, const char *name, const char *parent_name, unsigned long flags, unsigned long fixed_rate, unsigned long fixed_accuracy); void clk_unregister_fixed_rate(struct clk *clk); +struct clk_hw *clk_hw_register_fixed_rate_with_accuracy(struct device *dev, + const char *name, const char *parent_name, unsigned long flags, + unsigned long fixed_rate, unsigned long fixed_accuracy); + void of_fixed_clk_setup(struct device_node *np); /** -- cgit v1.2.3 From 2e4682ba2ed79d8082b78d292b3b80f54d970b7a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 10 Mar 2016 16:30:22 +0100 Subject: KVM: add missing memory barrier in kvm_{make,check}_request kvm_make_request and kvm_check_request imply a producer-consumer relationship; add implicit memory barriers to them. There was indeed already a place that was adding an explicit smp_mb() to order between kvm_check_request and the processing of the request. That memory barrier can be removed (as an added benefit, kvm_check_request can use smp_mb__after_atomic which is free on x86). Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 3 --- include/linux/kvm_host.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 54ead79e444b..dfb4c6476877 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -382,9 +382,6 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u32 i, nr_ioapic_pins; int idx; - /* kvm->irq_routing must be read after clearing - * KVM_SCAN_IOAPIC. */ - smp_mb(); idx = srcu_read_lock(&kvm->irq_srcu); table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); nr_ioapic_pins = min_t(u32, table->nr_rt_entries, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5276fe0916fc..ad40d44784c7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1091,6 +1091,11 @@ static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; } static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) { + /* + * Ensure the rest of the request is published to kvm_check_request's + * caller. Paired with the smp_mb__after_atomic in kvm_check_request. + */ + smp_wmb(); set_bit(req, &vcpu->requests); } @@ -1098,6 +1103,12 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) { if (test_bit(req, &vcpu->requests)) { clear_bit(req, &vcpu->requests); + + /* + * Ensure the rest of the request is visible to kvm_check_request's + * caller. Paired with the smp_wmb in kvm_make_request. + */ + smp_mb__after_atomic(); return true; } else { return false; -- cgit v1.2.3 From 9257b4a206fc0229dd5f84b78e4d1ebf3f91d270 Mon Sep 17 00:00:00 2001 From: Omer Peleg Date: Wed, 20 Apr 2016 11:34:11 +0300 Subject: iommu/iova: introduce per-cpu caching to iova allocation IOVA allocation has two problems that impede high-throughput I/O. First, it can do a linear search over the allocated IOVA ranges. Second, the rbtree spinlock that serializes IOVA allocations becomes contended. Address these problems by creating an API for caching allocated IOVA ranges, so that the IOVA allocator isn't accessed frequently. 
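(For illustration only, not part of this patch: a hypothetical consumer of the new API. alloc_iova_fast() returns the base pfn of the allocated range, or 0 on failure; free_iova_fast() returns the range to the cache when it fits, falling back to the rbtree otherwise.)

#include <linux/iova.h>

/* Allocate 'nrpages' pages of IOVA space below the 32-bit boundary. */
static unsigned long foo_map_range(struct iova_domain *iovad,
				   unsigned long nrpages)
{
	/* Served from the per-CPU cache when possible; 0 means the
	 * rbtree allocator also failed, even after flushing caches. */
	return alloc_iova_fast(iovad, nrpages, iovad->dma_32bit_pfn);
}

static void foo_unmap_range(struct iova_domain *iovad, unsigned long pfn,
			    unsigned long nrpages)
{
	free_iova_fast(iovad, pfn, nrpages);
}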
This patch adds a per-CPU cache, from which CPUs can alloc/free IOVAs without taking the rbtree spinlock. The per-CPU caches are backed by a global cache, to avoid invoking the (linear-time) IOVA allocator without needing to make the per-CPU cache size excessive. This design is based on magazines, as described in "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and Arbitrary Resources" (currently available at https://www.usenix.org/legacy/event/usenix01/bonwick.html) Adding caching on top of the existing rbtree allocator maintains the property that IOVAs are densely packed in the IO virtual address space, which is important for keeping IOMMU page table usage low. To keep the cache size reasonable, we bound the IOVA space a CPU can cache by 32 MiB (we cache a bounded number of IOVA ranges, and only ranges of size <= 128 KiB). The shared global cache is bounded at 4 MiB of IOVA space. Signed-off-by: Omer Peleg [mad@cs.technion.ac.il: rebased, cleaned up and reworded the commit message] Signed-off-by: Adam Morrison Reviewed-by: Shaohua Li Reviewed-by: Ben Serebrin [dwmw2: split out VT-d part into a separate patch] Signed-off-by: David Woodhouse --- drivers/iommu/iova.c | 417 ++++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/iova.h | 23 ++- 2 files changed, 414 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index fa0adef32bd6..ba764a0835d3 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -20,6 +20,17 @@ #include #include #include +#include +#include + +static bool iova_rcache_insert(struct iova_domain *iovad, + unsigned long pfn, + unsigned long size); +static unsigned long iova_rcache_get(struct iova_domain *iovad, + unsigned long size, + unsigned long limit_pfn); +static void init_iova_rcaches(struct iova_domain *iovad); +static void free_iova_rcaches(struct iova_domain *iovad); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, @@ -38,6 +49,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, iovad->granule = granule; iovad->start_pfn = start_pfn; iovad->dma_32bit_pfn = pfn_32bit; + init_iova_rcaches(iovad); } EXPORT_SYMBOL_GPL(init_iova_domain); @@ -291,33 +303,18 @@ alloc_iova(struct iova_domain *iovad, unsigned long size, } EXPORT_SYMBOL_GPL(alloc_iova); -/** - * find_iova - find's an iova for a given pfn - * @iovad: - iova domain in question. - * @pfn: - page frame number - * This function finds and returns an iova belonging to the - * given doamin which matches the given pfn. - */ -struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn) +static struct iova * +private_find_iova(struct iova_domain *iovad, unsigned long pfn) { - unsigned long flags; - struct rb_node *node; + struct rb_node *node = iovad->rbroot.rb_node; + + assert_spin_locked(&iovad->iova_rbtree_lock); - /* Take the lock so that no other thread is manipulating the rbtree */ - spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); - node = iovad->rbroot.rb_node; while (node) { struct iova *iova = container_of(node, struct iova, node); /* If pfn falls within iova's range, return iova */ if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) { - spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); - /* We are not holding the lock while this iova - * is referenced by the caller as the same thread - * which called this function also calls __free_iova() - * and it is by design that only one thread can possibly - * reference a particular iova and hence no conflict. 
- */ return iova; } @@ -327,9 +324,35 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn) node = node->rb_right; } - spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); return NULL; } + +static void private_free_iova(struct iova_domain *iovad, struct iova *iova) +{ + assert_spin_locked(&iovad->iova_rbtree_lock); + __cached_rbnode_delete_update(iovad, iova); + rb_erase(&iova->node, &iovad->rbroot); + free_iova_mem(iova); +} + +/** + * find_iova - finds an iova for a given pfn + * @iovad: - iova domain in question. + * @pfn: - page frame number + * This function finds and returns an iova belonging to the + * given doamin which matches the given pfn. + */ +struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn) +{ + unsigned long flags; + struct iova *iova; + + /* Take the lock so that no other thread is manipulating the rbtree */ + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); + iova = private_find_iova(iovad, pfn); + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + return iova; +} EXPORT_SYMBOL_GPL(find_iova); /** @@ -344,10 +367,8 @@ __free_iova(struct iova_domain *iovad, struct iova *iova) unsigned long flags; spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); - __cached_rbnode_delete_update(iovad, iova); - rb_erase(&iova->node, &iovad->rbroot); + private_free_iova(iovad, iova); spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); - free_iova_mem(iova); } EXPORT_SYMBOL_GPL(__free_iova); @@ -369,6 +390,63 @@ free_iova(struct iova_domain *iovad, unsigned long pfn) } EXPORT_SYMBOL_GPL(free_iova); +/** + * alloc_iova_fast - allocates an iova from rcache + * @iovad: - iova domain in question + * @size: - size of page frames to allocate + * @limit_pfn: - max limit address + * This function tries to satisfy an iova allocation from the rcache, + * and falls back to regular allocation on failure. +*/ +unsigned long +alloc_iova_fast(struct iova_domain *iovad, unsigned long size, + unsigned long limit_pfn) +{ + bool flushed_rcache = false; + unsigned long iova_pfn; + struct iova *new_iova; + + iova_pfn = iova_rcache_get(iovad, size, limit_pfn); + if (iova_pfn) + return iova_pfn; + +retry: + new_iova = alloc_iova(iovad, size, limit_pfn, true); + if (!new_iova) { + unsigned int cpu; + + if (flushed_rcache) + return 0; + + /* Try replenishing IOVAs by flushing rcache. */ + flushed_rcache = true; + for_each_online_cpu(cpu) + free_cpu_cached_iovas(cpu, iovad); + goto retry; + } + + return new_iova->pfn_lo; +} +EXPORT_SYMBOL_GPL(alloc_iova_fast); + +/** + * free_iova_fast - free iova pfn range into rcache + * @iovad: - iova domain in question. + * @pfn: - pfn that is allocated previously + * @size: - # of pages in range + * This functions frees an iova range by trying to put it into the rcache, + * falling back to regular iova deallocation via free_iova() if this fails. + */ +void +free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size) +{ + if (iova_rcache_insert(iovad, pfn, size)) + return; + + free_iova(iovad, pfn); +} +EXPORT_SYMBOL_GPL(free_iova_fast); + /** * put_iova_domain - destroys the iova doamin * @iovad: - iova domain in question. @@ -379,6 +457,7 @@ void put_iova_domain(struct iova_domain *iovad) struct rb_node *node; unsigned long flags; + free_iova_rcaches(iovad); spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); node = rb_first(&iovad->rbroot); while (node) { @@ -550,5 +629,295 @@ error: return NULL; } +/* + * Magazine caches for IOVA ranges. 
For an introduction to magazines, + * see the USENIX 2001 paper "Magazines and Vmem: Extending the Slab + * Allocator to Many CPUs and Arbitrary Resources" by Bonwick and Adams. + * For simplicity, we use a static magazine size and don't implement the + * dynamic size tuning described in the paper. + */ + +#define IOVA_MAG_SIZE 128 + +struct iova_magazine { + unsigned long size; + unsigned long pfns[IOVA_MAG_SIZE]; +}; + +struct iova_cpu_rcache { + spinlock_t lock; + struct iova_magazine *loaded; + struct iova_magazine *prev; +}; + +static struct iova_magazine *iova_magazine_alloc(gfp_t flags) +{ + return kzalloc(sizeof(struct iova_magazine), flags); +} + +static void iova_magazine_free(struct iova_magazine *mag) +{ + kfree(mag); +} + +static void +iova_magazine_free_pfns(struct iova_magazine *mag, struct iova_domain *iovad) +{ + unsigned long flags; + int i; + + if (!mag) + return; + + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); + + for (i = 0 ; i < mag->size; ++i) { + struct iova *iova = private_find_iova(iovad, mag->pfns[i]); + + BUG_ON(!iova); + private_free_iova(iovad, iova); + } + + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + + mag->size = 0; +} + +static bool iova_magazine_full(struct iova_magazine *mag) +{ + return (mag && mag->size == IOVA_MAG_SIZE); +} + +static bool iova_magazine_empty(struct iova_magazine *mag) +{ + return (!mag || mag->size == 0); +} + +static unsigned long iova_magazine_pop(struct iova_magazine *mag, + unsigned long limit_pfn) +{ + BUG_ON(iova_magazine_empty(mag)); + + if (mag->pfns[mag->size - 1] >= limit_pfn) + return 0; + + return mag->pfns[--mag->size]; +} + +static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn) +{ + BUG_ON(iova_magazine_full(mag)); + + mag->pfns[mag->size++] = pfn; +} + +static void init_iova_rcaches(struct iova_domain *iovad) +{ + struct iova_cpu_rcache *cpu_rcache; + struct iova_rcache *rcache; + unsigned int cpu; + int i; + + for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { + rcache = &iovad->rcaches[i]; + spin_lock_init(&rcache->lock); + rcache->depot_size = 0; + rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size()); + if (WARN_ON(!rcache->cpu_rcaches)) + continue; + for_each_possible_cpu(cpu) { + cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu); + spin_lock_init(&cpu_rcache->lock); + cpu_rcache->loaded = iova_magazine_alloc(GFP_KERNEL); + cpu_rcache->prev = iova_magazine_alloc(GFP_KERNEL); + } + } +} + +/* + * Try inserting IOVA range starting with 'iova_pfn' into 'rcache', and + * return true on success. Can fail if rcache is full and we can't free + * space, and free_iova() (our only caller) will then return the IOVA + * range to the rbtree instead. 
+ */ +static bool __iova_rcache_insert(struct iova_domain *iovad, + struct iova_rcache *rcache, + unsigned long iova_pfn) +{ + struct iova_magazine *mag_to_free = NULL; + struct iova_cpu_rcache *cpu_rcache; + bool can_insert = false; + unsigned long flags; + + cpu_rcache = this_cpu_ptr(rcache->cpu_rcaches); + spin_lock_irqsave(&cpu_rcache->lock, flags); + + if (!iova_magazine_full(cpu_rcache->loaded)) { + can_insert = true; + } else if (!iova_magazine_full(cpu_rcache->prev)) { + swap(cpu_rcache->prev, cpu_rcache->loaded); + can_insert = true; + } else { + struct iova_magazine *new_mag = iova_magazine_alloc(GFP_ATOMIC); + + if (new_mag) { + spin_lock(&rcache->lock); + if (rcache->depot_size < MAX_GLOBAL_MAGS) { + rcache->depot[rcache->depot_size++] = + cpu_rcache->loaded; + } else { + mag_to_free = cpu_rcache->loaded; + } + spin_unlock(&rcache->lock); + + cpu_rcache->loaded = new_mag; + can_insert = true; + } + } + + if (can_insert) + iova_magazine_push(cpu_rcache->loaded, iova_pfn); + + spin_unlock_irqrestore(&cpu_rcache->lock, flags); + + if (mag_to_free) { + iova_magazine_free_pfns(mag_to_free, iovad); + iova_magazine_free(mag_to_free); + } + + return can_insert; +} + +static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn, + unsigned long size) +{ + unsigned int log_size = order_base_2(size); + + if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE) + return false; + + return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn); +} + +/* + * Caller wants to allocate a new IOVA range from 'rcache'. If we can + * satisfy the request, return a matching non-NULL range and remove + * it from the 'rcache'. + */ +static unsigned long __iova_rcache_get(struct iova_rcache *rcache, + unsigned long limit_pfn) +{ + struct iova_cpu_rcache *cpu_rcache; + unsigned long iova_pfn = 0; + bool has_pfn = false; + unsigned long flags; + + cpu_rcache = this_cpu_ptr(rcache->cpu_rcaches); + spin_lock_irqsave(&cpu_rcache->lock, flags); + + if (!iova_magazine_empty(cpu_rcache->loaded)) { + has_pfn = true; + } else if (!iova_magazine_empty(cpu_rcache->prev)) { + swap(cpu_rcache->prev, cpu_rcache->loaded); + has_pfn = true; + } else { + spin_lock(&rcache->lock); + if (rcache->depot_size > 0) { + iova_magazine_free(cpu_rcache->loaded); + cpu_rcache->loaded = rcache->depot[--rcache->depot_size]; + has_pfn = true; + } + spin_unlock(&rcache->lock); + } + + if (has_pfn) + iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn); + + spin_unlock_irqrestore(&cpu_rcache->lock, flags); + + return iova_pfn; +} + +/* + * Try to satisfy IOVA allocation range from rcache. Fail if requested + * size is too big or the DMA limit we are given isn't satisfied by the + * top element in the magazine. + */ +static unsigned long iova_rcache_get(struct iova_domain *iovad, + unsigned long size, + unsigned long limit_pfn) +{ + unsigned int log_size = order_base_2(size); + + if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE) + return 0; + + return __iova_rcache_get(&iovad->rcaches[log_size], limit_pfn); +} + +/* + * Free a cpu's rcache. 
+ */ +static void free_cpu_iova_rcache(unsigned int cpu, struct iova_domain *iovad, + struct iova_rcache *rcache) +{ + struct iova_cpu_rcache *cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu); + unsigned long flags; + + spin_lock_irqsave(&cpu_rcache->lock, flags); + + iova_magazine_free_pfns(cpu_rcache->loaded, iovad); + iova_magazine_free(cpu_rcache->loaded); + + iova_magazine_free_pfns(cpu_rcache->prev, iovad); + iova_magazine_free(cpu_rcache->prev); + + spin_unlock_irqrestore(&cpu_rcache->lock, flags); +} + +/* + * free rcache data structures. + */ +static void free_iova_rcaches(struct iova_domain *iovad) +{ + struct iova_rcache *rcache; + unsigned long flags; + unsigned int cpu; + int i, j; + + for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { + rcache = &iovad->rcaches[i]; + for_each_possible_cpu(cpu) + free_cpu_iova_rcache(cpu, iovad, rcache); + spin_lock_irqsave(&rcache->lock, flags); + free_percpu(rcache->cpu_rcaches); + for (j = 0; j < rcache->depot_size; ++j) { + iova_magazine_free_pfns(rcache->depot[j], iovad); + iova_magazine_free(rcache->depot[j]); + } + spin_unlock_irqrestore(&rcache->lock, flags); + } +} + +/* + * free all the IOVA ranges cached by a cpu (used when cpu is unplugged) + */ +void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad) +{ + struct iova_cpu_rcache *cpu_rcache; + struct iova_rcache *rcache; + unsigned long flags; + int i; + + for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { + rcache = &iovad->rcaches[i]; + cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu); + spin_lock_irqsave(&cpu_rcache->lock, flags); + iova_magazine_free_pfns(cpu_rcache->loaded, iovad); + iova_magazine_free_pfns(cpu_rcache->prev, iovad); + spin_unlock_irqrestore(&cpu_rcache->lock, flags); + } +} + MODULE_AUTHOR("Anil S Keshavamurthy "); MODULE_LICENSE("GPL"); diff --git a/include/linux/iova.h b/include/linux/iova.h index 92f7177db2ce..f27bb2c62fca 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -19,8 +19,21 @@ /* iova structure */ struct iova { struct rb_node node; - unsigned long pfn_hi; /* IOMMU dish out addr hi */ - unsigned long pfn_lo; /* IOMMU dish out addr lo */ + unsigned long pfn_hi; /* Highest allocated pfn */ + unsigned long pfn_lo; /* Lowest allocated pfn */ +}; + +struct iova_magazine; +struct iova_cpu_rcache; + +#define IOVA_RANGE_CACHE_MAX_SIZE 6 /* log of max cached IOVA range size (in pages) */ +#define MAX_GLOBAL_MAGS 32 /* magazines per bin */ + +struct iova_rcache { + spinlock_t lock; + unsigned long depot_size; + struct iova_magazine *depot[MAX_GLOBAL_MAGS]; + struct iova_cpu_rcache __percpu *cpu_rcaches; }; /* holds all the iova translations for a domain */ @@ -31,6 +44,7 @@ struct iova_domain { unsigned long granule; /* pfn granularity for this domain */ unsigned long start_pfn; /* Lower limit for this domain */ unsigned long dma_32bit_pfn; + struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE]; /* IOVA range caches */ }; static inline unsigned long iova_size(struct iova *iova) @@ -78,6 +92,10 @@ void __free_iova(struct iova_domain *iovad, struct iova *iova); struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool size_aligned); +void free_iova_fast(struct iova_domain *iovad, unsigned long pfn, + unsigned long size); +unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, + unsigned long limit_pfn); struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); void copy_reserved_iova(struct iova_domain *from, struct 
iova_domain *to); @@ -87,5 +105,6 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); struct iova *split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi); +void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); #endif -- cgit v1.2.3 From 80e0f8d94d3090f0f7bf3faf3e6180e920ee0d22 Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Wed, 24 Feb 2016 14:12:59 +0530 Subject: pinctrl: Add devm_ apis for pinctrl_{register, unregister} Add device managed APIs devm_pinctrl_register() and devm_pinctrl_unregister() for the APIs pinctrl_register() and pinctrl_unregister(). This helps reduce code in the error path and sometimes allows removal of the .remove callback for driver unbind. Signed-off-by: Laxman Dewangan Reviewed-by: Philipp Zabel Acked-by: Bjorn Andersson Signed-off-by: Linus Walleij --- drivers/pinctrl/core.c | 63 +++++++++++++++++++++++++++++++++++++++++ include/linux/pinctrl/pinctrl.h | 6 ++++ 2 files changed, 69 insertions(+) (limited to 'include/linux') diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index f67a8b7a4e18..21df52e8192a 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -1872,6 +1872,69 @@ void pinctrl_unregister(struct pinctrl_dev *pctldev) } EXPORT_SYMBOL_GPL(pinctrl_unregister); +static void devm_pinctrl_dev_release(struct device *dev, void *res) +{ + struct pinctrl_dev *pctldev = *(struct pinctrl_dev **)res; + + pinctrl_unregister(pctldev); +} + +static int devm_pinctrl_dev_match(struct device *dev, void *res, void *data) +{ + struct pinctrl_dev **r = res; + + if (WARN_ON(!r || !*r)) + return 0; + + return *r == data; +} + +/** + * devm_pinctrl_register() - Resource managed version of pinctrl_register(). + * @dev: parent device for this pin controller + * @pctldesc: descriptor for this pin controller + * @driver_data: private pin controller data for this pin controller + * + * Returns an error pointer if pin controller registration failed. Otherwise + * it returns a valid pinctrl handle. + * + * The pinctrl device will be automatically released when the device is unbound. + */ +struct pinctrl_dev *devm_pinctrl_register(struct device *dev, + struct pinctrl_desc *pctldesc, + void *driver_data) +{ + struct pinctrl_dev **ptr, *pctldev; + + ptr = devres_alloc(devm_pinctrl_dev_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + pctldev = pinctrl_register(pctldesc, dev, driver_data); + if (IS_ERR(pctldev)) { + devres_free(ptr); + return pctldev; + } + + *ptr = pctldev; + devres_add(dev, ptr); + + return pctldev; +} +EXPORT_SYMBOL_GPL(devm_pinctrl_register); + +/** + * devm_pinctrl_unregister() - Resource managed version of pinctrl_unregister(). + * @dev: device for which the resource was allocated + * @pctldev: the pinctrl device to unregister. 
+ */ +void devm_pinctrl_unregister(struct device *dev, struct pinctrl_dev *pctldev) +{ + WARN_ON(devres_release(dev, devm_pinctrl_dev_release, + devm_pinctrl_dev_match, pctldev)); +} +EXPORT_SYMBOL_GPL(devm_pinctrl_unregister); + static int __init pinctrl_init(void) { pr_info("initialized pinctrl subsystem\n"); diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 9ba59fcba549..a42e57da270d 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -144,6 +144,12 @@ struct pinctrl_desc { extern struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc, struct device *dev, void *driver_data); extern void pinctrl_unregister(struct pinctrl_dev *pctldev); +extern struct pinctrl_dev *devm_pinctrl_register(struct device *dev, + struct pinctrl_desc *pctldesc, + void *driver_data); +extern void devm_pinctrl_unregister(struct device *dev, + struct pinctrl_dev *pctldev); + extern bool pin_is_valid(struct pinctrl_dev *pctldev, int pin); extern void pinctrl_add_gpio_range(struct pinctrl_dev *pctldev, struct pinctrl_gpio_range *range); -- cgit v1.2.3 From b53f27e4fa0d0e72d897830cc4f3f83d2a25d952 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Apr 2016 15:46:23 -0700 Subject: string_helpers: add kstrdup_quotable Handle allocating and escaping a string safe for logging. Signed-off-by: Kees Cook Signed-off-by: James Morris --- include/linux/string_helpers.h | 2 ++ lib/string_helpers.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index dabe643eb5fa..9de228af00c1 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -68,4 +68,6 @@ static inline int string_escape_str_any_np(const char *src, char *dst, return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, only); } +char *kstrdup_quotable(const char *src, gfp_t gfp); + #endif diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 5c88204b6f1f..aa00c9f989ee 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -534,3 +535,30 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, return p - dst; } EXPORT_SYMBOL(string_escape_mem); + +/* + * Return an allocated string that has been escaped of special characters + * and double quotes, making it safe to log in quotes. + */ +char *kstrdup_quotable(const char *src, gfp_t gfp) +{ + size_t slen, dlen; + char *dst; + const int flags = ESCAPE_HEX; + const char esc[] = "\f\n\r\t\v\a\e\\\""; + + if (!src) + return NULL; + slen = strlen(src); + + dlen = string_escape_mem(src, slen, NULL, 0, flags, esc); + dst = kmalloc(dlen + 1, gfp); + if (!dst) + return NULL; + + WARN_ON(string_escape_mem(src, slen, dst, dlen, flags, esc) != dlen); + dst[dlen] = '\0'; + + return dst; +} +EXPORT_SYMBOL_GPL(kstrdup_quotable); -- cgit v1.2.3 From 0d0443288f2244d7054796086e481ddef6abdbba Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Apr 2016 15:46:24 -0700 Subject: string_helpers: add kstrdup_quotable_cmdline Provide an escaped (but readable: no inter-argument NULLs) commandline safe for logging. 
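For illustration only (not part of this patch): a hypothetical caller that logs a task's command line; only kstrdup_quotable_cmdline() comes from this series.

#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string_helpers.h>

static void foo_log_cmdline(struct task_struct *task)
{
	char *cmdline = kstrdup_quotable_cmdline(task, GFP_KERNEL);

	if (!cmdline)
		return;
	pr_info("exec: \"%s\"\n", cmdline);	/* escaped, no embedded NULs */
	kfree(cmdline);
}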
Signed-off-by: Kees Cook Signed-off-by: James Morris --- include/linux/string_helpers.h | 1 + lib/string_helpers.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 9de228af00c1..684d2695fc36 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -69,5 +69,6 @@ static inline int string_escape_str_any_np(const char *src, char *dst, } char *kstrdup_quotable(const char *src, gfp_t gfp); +char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp); #endif diff --git a/lib/string_helpers.c b/lib/string_helpers.c index aa00c9f989ee..b16ee85aaf87 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -562,3 +563,36 @@ char *kstrdup_quotable(const char *src, gfp_t gfp) return dst; } EXPORT_SYMBOL_GPL(kstrdup_quotable); + +/* + * Returns allocated NULL-terminated string containing process + * command line, with inter-argument NULLs replaced with spaces, + * and other special characters escaped. + */ +char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp) +{ + char *buffer, *quoted; + int i, res; + + buffer = kmalloc(PAGE_SIZE, GFP_TEMPORARY); + if (!buffer) + return NULL; + + res = get_cmdline(task, buffer, PAGE_SIZE - 1); + buffer[res] = '\0'; + + /* Collapse trailing NULLs, leave res pointing to last non-NULL. */ + while (--res >= 0 && buffer[res] == '\0') + ; + + /* Replace inter-argument NULLs. */ + for (i = 0; i <= res; i++) + if (buffer[i] == '\0') + buffer[i] = ' '; + + /* Make sure result is printable. */ + quoted = kstrdup_quotable(buffer, gfp); + kfree(buffer); + return quoted; +} +EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline); -- cgit v1.2.3 From 21985319add60b55fc27230d9421a3e5af7e998a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Apr 2016 15:46:25 -0700 Subject: string_helpers: add kstrdup_quotable_file Allocate a NULL-terminated file path with special characters escaped, safe for logging. Signed-off-by: Kees Cook Signed-off-by: James Morris --- include/linux/string_helpers.h | 3 +++ lib/string_helpers.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 684d2695fc36..5ce9538f290e 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -3,6 +3,8 @@ #include +struct file; + /* Descriptions of the types of units to * print in */ enum string_size_units { @@ -70,5 +72,6 @@ static inline int string_escape_str_any_np(const char *src, char *dst, char *kstrdup_quotable(const char *src, gfp_t gfp); char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp); +char *kstrdup_quotable_file(struct file *file, gfp_t gfp); #endif diff --git a/lib/string_helpers.c b/lib/string_helpers.c index b16ee85aaf87..ecaac2c0526f 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -596,3 +598,31 @@ char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp) return quoted; } EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline); + +/* + * Returns allocated NULL-terminated string containing pathname, + * with special characters escaped, able to be safely logged. If + * there is an error, the leading character will be "<". 
+ */ +char *kstrdup_quotable_file(struct file *file, gfp_t gfp) +{ + char *temp, *pathname; + + if (!file) + return kstrdup("", gfp); + + /* We add 11 spaces for ' (deleted)' to be appended */ + temp = kmalloc(PATH_MAX + 11, GFP_TEMPORARY); + if (!temp) + return kstrdup("", gfp); + + pathname = file_path(file, temp, PATH_MAX + 11); + if (IS_ERR(pathname)) + pathname = kstrdup("", gfp); + else + pathname = kstrdup_quotable(pathname, gfp); + + kfree(temp); + return pathname; +} +EXPORT_SYMBOL_GPL(kstrdup_quotable_file); -- cgit v1.2.3 From 1284ab5b2dcb927d38e4f3fbc2e307f3d1af9262 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 20 Apr 2016 15:46:27 -0700 Subject: fs: define a string representation of the kernel_read_file_id enumeration A string representation of the kernel_read_file_id enumeration is needed for displaying messages (eg. pr_info, auditing) that can be used by multiple LSMs and the integrity subsystem. To simplify keeping the list of strings up to date with the enumeration, this patch defines two new preprocessing macros named __fid_enumify and __fid_stringify to create the enumeration and an array of strings. kernel_read_file_id_str() returns a string based on the enumeration. Signed-off-by: Mimi Zohar [kees: removed removal of my old version, constified pointer values] Signed-off-by: Kees Cook Signed-off-by: James Morris --- include/linux/fs.h | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 14a97194b34b..90477550b935 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2580,15 +2580,34 @@ static inline void i_readcount_inc(struct inode *inode) #endif extern int do_pipe_flags(int *, int); +#define __kernel_read_file_id(id) \ + id(UNKNOWN, unknown) \ + id(FIRMWARE, firmware) \ + id(MODULE, kernel-module) \ + id(KEXEC_IMAGE, kexec-image) \ + id(KEXEC_INITRAMFS, kexec-initramfs) \ + id(POLICY, security-policy) \ + id(MAX_ID, ) + +#define __fid_enumify(ENUM, dummy) READING_ ## ENUM, +#define __fid_stringify(dummy, str) #str, + enum kernel_read_file_id { - READING_FIRMWARE = 1, - READING_MODULE, - READING_KEXEC_IMAGE, - READING_KEXEC_INITRAMFS, - READING_POLICY, - READING_MAX_ID + __kernel_read_file_id(__fid_enumify) +}; + +static const char * const kernel_read_file_str[] = { + __kernel_read_file_id(__fid_stringify) }; +static inline const char * const kernel_read_file_id_str(enum kernel_read_file_id id) +{ + if (id < 0 || id >= READING_MAX_ID) + return kernel_read_file_str[READING_UNKNOWN]; + + return kernel_read_file_str[id]; +} + extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern int kernel_read_file(struct file *, void **, loff_t *, loff_t, enum kernel_read_file_id); -- cgit v1.2.3 From 9b091556a073a9f5f93e2ad23d118f45c4796a84 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Apr 2016 15:46:28 -0700 Subject: LSM: LoadPin for kernel file loading restrictions This LSM enforces that kernel-loaded files (modules, firmware, etc) must all come from the same filesystem, with the expectation that such a filesystem is backed by a read-only device such as dm-verity or CDROM. This allows systems that have a verified and/or unchangeable filesystem to enforce module and firmware loading restrictions without needing to sign the files individually. 
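[editor's note: the __fid_enumify/__fid_stringify construction above is a classic X-macro. The following self-contained userspace sketch of the same technique is an illustration only, not from the patch; it compiles with any C99 compiler and shows why the enum values and string table cannot drift apart.]

#include <stdio.h>

#define __kernel_read_file_id(id) \
	id(UNKNOWN, unknown) \
	id(FIRMWARE, firmware) \
	id(MODULE, kernel-module) \
	id(KEXEC_IMAGE, kexec-image) \
	id(KEXEC_INITRAMFS, kexec-initramfs) \
	id(POLICY, security-policy) \
	id(MAX_ID, )

#define __fid_enumify(ENUM, dummy) READING_ ## ENUM,
#define __fid_stringify(dummy, str) #str,

/* The single list expands twice: once into enumerators... */
enum kernel_read_file_id {
	__kernel_read_file_id(__fid_enumify)
};

/* ...and once into the matching string table. */
static const char *const kernel_read_file_str[] = {
	__kernel_read_file_id(__fid_stringify)
};

int main(void)
{
	/* Prints "firmware"; index and string stay in sync by construction. */
	printf("%s\n", kernel_read_file_str[READING_FIRMWARE]);
	return 0;
}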
Signed-off-by: Kees Cook Acked-by: Serge Hallyn Signed-off-by: James Morris --- Documentation/security/LoadPin.txt | 17 ++++ MAINTAINERS | 6 ++ include/linux/lsm_hooks.h | 5 + security/Kconfig | 1 + security/Makefile | 2 + security/loadpin/Kconfig | 10 ++ security/loadpin/Makefile | 1 + security/loadpin/loadpin.c | 190 +++++++++++++++++++++++++++++++++++++ security/security.c | 1 + 9 files changed, 233 insertions(+) create mode 100644 Documentation/security/LoadPin.txt create mode 100644 security/loadpin/Kconfig create mode 100644 security/loadpin/Makefile create mode 100644 security/loadpin/loadpin.c (limited to 'include/linux') diff --git a/Documentation/security/LoadPin.txt b/Documentation/security/LoadPin.txt new file mode 100644 index 000000000000..e11877f5d3d4 --- /dev/null +++ b/Documentation/security/LoadPin.txt @@ -0,0 +1,17 @@ +LoadPin is a Linux Security Module that ensures all kernel-loaded files +(modules, firmware, etc) all originate from the same filesystem, with +the expectation that such a filesystem is backed by a read-only device +such as dm-verity or CDROM. This allows systems that have a verified +and/or unchangeable filesystem to enforce module and firmware loading +restrictions without needing to sign the files individually. + +The LSM is selectable at build-time with CONFIG_SECURITY_LOADPIN, and +can be controlled at boot-time with the kernel command line option +"loadpin.enabled". By default, it is enabled, but can be disabled at +boot ("loadpin.enabled=0"). + +LoadPin starts pinning when it sees the first file loaded. If the +block device backing the filesystem is not read-only, a sysctl is +created to toggle pinning: /proc/sys/kernel/loadpin/enabled. (Having +a mutable filesystem means pinning is mutable too, but having the +sysctl allows for easy testing on systems with a mutable filesystem.) diff --git a/MAINTAINERS b/MAINTAINERS index 1c32f8a3d6c4..b4b1e8179018 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9962,6 +9962,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jj/apparmor-dev.git S: Supported F: security/apparmor/ +LOADPIN SECURITY MODULE +M: Kees Cook +T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git lsm/loadpin +S: Supported +F: security/loadpin/ + YAMA SECURITY MODULE M: Kees Cook T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git yama/tip diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index ae2537886177..6e466fc0666c 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1892,5 +1892,10 @@ extern void __init yama_add_hooks(void); #else static inline void __init yama_add_hooks(void) { } #endif +#ifdef CONFIG_SECURITY_LOADPIN +void __init loadpin_add_hooks(void); +#else +static inline void loadpin_add_hooks(void) { }; +#endif #endif /* ! 
__LINUX_LSM_HOOKS_H */ diff --git a/security/Kconfig b/security/Kconfig index e45237897b43..176758cdfa57 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -122,6 +122,7 @@ source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig source security/apparmor/Kconfig +source security/loadpin/Kconfig source security/yama/Kconfig source security/integrity/Kconfig diff --git a/security/Makefile b/security/Makefile index c9bfbc84ff50..f2d71cdb8e19 100644 --- a/security/Makefile +++ b/security/Makefile @@ -8,6 +8,7 @@ subdir-$(CONFIG_SECURITY_SMACK) += smack subdir-$(CONFIG_SECURITY_TOMOYO) += tomoyo subdir-$(CONFIG_SECURITY_APPARMOR) += apparmor subdir-$(CONFIG_SECURITY_YAMA) += yama +subdir-$(CONFIG_SECURITY_LOADPIN) += loadpin # always enable default capabilities obj-y += commoncap.o @@ -22,6 +23,7 @@ obj-$(CONFIG_AUDIT) += lsm_audit.o obj-$(CONFIG_SECURITY_TOMOYO) += tomoyo/ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor/ obj-$(CONFIG_SECURITY_YAMA) += yama/ +obj-$(CONFIG_SECURITY_LOADPIN) += loadpin/ obj-$(CONFIG_CGROUP_DEVICE) += device_cgroup.o # Object integrity file lists diff --git a/security/loadpin/Kconfig b/security/loadpin/Kconfig new file mode 100644 index 000000000000..c668ac4eda65 --- /dev/null +++ b/security/loadpin/Kconfig @@ -0,0 +1,10 @@ +config SECURITY_LOADPIN + bool "Pin load of kernel files (modules, fw, etc) to one filesystem" + depends on SECURITY && BLOCK + help + Any files read through the kernel file reading interface + (kernel modules, firmware, kexec images, security policy) will + be pinned to the first filesystem used for loading. Any files + that come from other filesystems will be rejected. This is best + used on systems without an initrd that have a root filesystem + backed by a read-only device such as dm-verity or a CDROM. diff --git a/security/loadpin/Makefile b/security/loadpin/Makefile new file mode 100644 index 000000000000..c2d77f83037b --- /dev/null +++ b/security/loadpin/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_SECURITY_LOADPIN) += loadpin.o diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c new file mode 100644 index 000000000000..e4debae3c4d6 --- /dev/null +++ b/security/loadpin/loadpin.c @@ -0,0 +1,190 @@ +/* + * Module and Firmware Pinning Security Module + * + * Copyright 2011-2016 Google Inc. + * + * Author: Kees Cook + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) "LoadPin: " fmt + +#include +#include +#include +#include +#include +#include +#include /* current */ +#include + +static void report_load(const char *origin, struct file *file, char *operation) +{ + char *cmdline, *pathname; + + pathname = kstrdup_quotable_file(file, GFP_KERNEL); + cmdline = kstrdup_quotable_cmdline(current, GFP_KERNEL); + + pr_notice("%s %s obj=%s%s%s pid=%d cmdline=%s%s%s\n", + origin, operation, + (pathname && pathname[0] != '<') ? "\"" : "", + pathname, + (pathname && pathname[0] != '<') ? "\"" : "", + task_pid_nr(current), + cmdline ? "\"" : "", cmdline, cmdline ? 
"\"" : ""); + + kfree(cmdline); + kfree(pathname); +} + +static int enabled = 1; +static struct super_block *pinned_root; +static DEFINE_SPINLOCK(pinned_root_spinlock); + +#ifdef CONFIG_SYSCTL +static int zero; +static int one = 1; + +static struct ctl_path loadpin_sysctl_path[] = { + { .procname = "kernel", }, + { .procname = "loadpin", }, + { } +}; + +static struct ctl_table loadpin_sysctl_table[] = { + { + .procname = "enabled", + .data = &enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { } +}; + +/* + * This must be called after early kernel init, since then the rootdev + * is available. + */ +static void check_pinning_enforcement(struct super_block *mnt_sb) +{ + bool ro = false; + + /* + * If load pinning is not enforced via a read-only block + * device, allow sysctl to change modes for testing. + */ + if (mnt_sb->s_bdev) { + ro = bdev_read_only(mnt_sb->s_bdev); + pr_info("dev(%u,%u): %s\n", + MAJOR(mnt_sb->s_bdev->bd_dev), + MINOR(mnt_sb->s_bdev->bd_dev), + ro ? "read-only" : "writable"); + } else + pr_info("mnt_sb lacks block device, treating as: writable\n"); + + if (!ro) { + if (!register_sysctl_paths(loadpin_sysctl_path, + loadpin_sysctl_table)) + pr_notice("sysctl registration failed!\n"); + else + pr_info("load pinning can be disabled.\n"); + } else + pr_info("load pinning engaged.\n"); +} +#else +static void check_pinning_enforcement(struct super_block *mnt_sb) +{ + pr_info("load pinning engaged.\n"); +} +#endif + +static void loadpin_sb_free_security(struct super_block *mnt_sb) +{ + /* + * When unmounting the filesystem we were using for load + * pinning, we acknowledge the superblock release, but make sure + * no other modules or firmware can be loaded. + */ + if (!IS_ERR_OR_NULL(pinned_root) && mnt_sb == pinned_root) { + pinned_root = ERR_PTR(-EIO); + pr_info("umount pinned fs: refusing further loads\n"); + } +} + +static int loadpin_read_file(struct file *file, enum kernel_read_file_id id) +{ + struct super_block *load_root; + const char *origin = kernel_read_file_id_str(id); + + /* This handles the older init_module API that has a NULL file. */ + if (!file) { + if (!enabled) { + report_load(origin, NULL, "old-api-pinning-ignored"); + return 0; + } + + report_load(origin, NULL, "old-api-denied"); + return -EPERM; + } + + load_root = file->f_path.mnt->mnt_sb; + + /* First loaded module/firmware defines the root for all others. */ + spin_lock(&pinned_root_spinlock); + /* + * pinned_root is only NULL at startup. Otherwise, it is either + * a valid reference, or an ERR_PTR. + */ + if (!pinned_root) { + pinned_root = load_root; + /* + * Unlock now since it's only pinned_root we care about. + * In the worst case, we will (correctly) report pinning + * failures before we have announced that pinning is + * enabled. This would be purely cosmetic. 
+ */ + spin_unlock(&pinned_root_spinlock); + check_pinning_enforcement(pinned_root); + report_load(origin, file, "pinned"); + } else { + spin_unlock(&pinned_root_spinlock); + } + + if (IS_ERR_OR_NULL(pinned_root) || load_root != pinned_root) { + if (unlikely(!enabled)) { + report_load(origin, file, "pinning-ignored"); + return 0; + } + + report_load(origin, file, "denied"); + return -EPERM; + } + + return 0; +} + +static struct security_hook_list loadpin_hooks[] = { + LSM_HOOK_INIT(sb_free_security, loadpin_sb_free_security), + LSM_HOOK_INIT(kernel_read_file, loadpin_read_file), +}; + +void __init loadpin_add_hooks(void) +{ + pr_info("ready to pin (currently %sabled)", enabled ? "en" : "dis"); + security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks)); +} + +/* Should not be mutable after boot, so not listed in sysfs (perm == 0). */ +module_param(enabled, int, 0); +MODULE_PARM_DESC(enabled, "Pin module/firmware loading (default: true)"); diff --git a/security/security.c b/security/security.c index 554c3fb7d4a5..e42860899f23 100644 --- a/security/security.c +++ b/security/security.c @@ -60,6 +60,7 @@ int __init security_init(void) */ capability_add_hooks(); yama_add_hooks(); + loadpin_add_hooks(); /* * Load all the remaining security modules. -- cgit v1.2.3 From 92a39d9043ba5ff98adb1c31491f00c7bea5466e Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 23 Mar 2016 17:38:24 +0100 Subject: clk: composite: Add unregister function The composite clock didn't have any unregistration function, which forced us to use clk_unregister directly on it. While it was already not great from an API point of view, it also meant that we were leaking the clk_composite structure allocated in clk_register_composite. Add a clk_unregister_composite function to fix this. Signed-off-by: Maxime Ripard Signed-off-by: Stephen Boyd --- drivers/clk/clk-composite.c | 15 +++++++++++++++ include/linux/clk-provider.h | 1 + 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c index 1f903e1f86a2..b0f3b84ebd13 100644 --- a/drivers/clk/clk-composite.c +++ b/drivers/clk/clk-composite.c @@ -286,3 +286,18 @@ err: kfree(composite); return clk; } + +void clk_unregister_composite(struct clk *clk) +{ + struct clk_composite *composite; + struct clk_hw *hw; + + hw = __clk_get_hw(clk); + if (!hw) + return; + + composite = to_clk_composite(hw); + + clk_unregister(clk); + kfree(composite); +} diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index da95258127aa..26a8c9b7be71 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -603,6 +603,7 @@ struct clk *clk_register_composite(struct device *dev, const char *name, struct clk_hw *rate_hw, const struct clk_ops *rate_ops, struct clk_hw *gate_hw, const struct clk_ops *gate_ops, unsigned long flags); +void clk_unregister_composite(struct clk *clk); /*** * struct clk_gpio_gate - gpio gated clock -- cgit v1.2.3 From a7ab72390b77062420fb50e4451f71c9321aae05 Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Wed, 20 Apr 2016 08:38:50 +0200 Subject: i2c: mux: add common data for every i2c-mux instance All i2c-muxes have a parent adapter and one or many child adapters. A mux also has some means of selection. Previously, this was stored per child adapter, but it only needs to be tracked once per mux. Add an i2c mux core that keeps track of this consistently (see the usage sketch just below).
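[editor's note: the new API is easiest to see from a hypothetical mux driver. The sketch below is an illustration only, not from the patch; "mydev" and its members are invented, it assumes the two-argument i2c probe signature of this kernel era, and it relies only on i2c_mux_alloc(), i2c_mux_priv(), i2c_mux_add_adapter() and i2c_mux_del_adapters() as declared in the diff that follows. Assumed headers: <linux/i2c.h>, <linux/i2c-mux.h>.]

struct mydev {
	unsigned int last_chan;	/* invented private state */
};

static int mydev_select(struct i2c_mux_core *muxc, u32 chan)
{
	struct mydev *data = i2c_mux_priv(muxc);

	/* A real driver would program the mux hardware here (e.g. an
	 * SMBus write to the mux chip); this stub only records it. */
	data->last_chan = chan;
	return 0;
}

static int mydev_probe(struct i2c_client *client,
		       const struct i2c_device_id *id)
{
	struct i2c_mux_core *muxc;
	int i, ret;

	/* One core per mux, room for two child adapters, no deselect. */
	muxc = i2c_mux_alloc(client->adapter, &client->dev, 2,
			     sizeof(struct mydev), 0, mydev_select, NULL);
	if (!muxc)
		return -ENOMEM;
	i2c_set_clientdata(client, muxc);

	for (i = 0; i < 2; i++) {
		ret = i2c_mux_add_adapter(muxc, 0, i, 0);
		if (ret) {
			i2c_mux_del_adapters(muxc);
			return ret;
		}
	}
	return 0;
}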
Also add some glue for users of the old interface, which will create one implicit mux core per child adapter. Signed-off-by: Peter Rosin Tested-by: Antti Palosaari Tested-by: Crestez Dan Leonard Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-mux.c | 175 ++++++++++++++++++++++++++++++++++++------------ include/linux/i2c-mux.h | 34 ++++++++++ 2 files changed, 168 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c index d4022878b2f0..5ce1b0704cb5 100644 --- a/drivers/i2c/i2c-mux.c +++ b/drivers/i2c/i2c-mux.c @@ -28,33 +28,34 @@ #include /* multiplexer per channel data */ +struct i2c_mux_priv_old { + void *mux_priv; + int (*select)(struct i2c_adapter *, void *mux_priv, u32 chan_id); + int (*deselect)(struct i2c_adapter *, void *mux_priv, u32 chan_id); +}; + struct i2c_mux_priv { struct i2c_adapter adap; struct i2c_algorithm algo; - - struct i2c_adapter *parent; - struct device *mux_dev; - void *mux_priv; + struct i2c_mux_core *muxc; u32 chan_id; - - int (*select)(struct i2c_adapter *, void *mux_priv, u32 chan_id); - int (*deselect)(struct i2c_adapter *, void *mux_priv, u32 chan_id); }; static int i2c_mux_master_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num) { struct i2c_mux_priv *priv = adap->algo_data; - struct i2c_adapter *parent = priv->parent; + struct i2c_mux_core *muxc = priv->muxc; + struct i2c_adapter *parent = muxc->parent; int ret; /* Switch to the right mux port and perform the transfer. */ - ret = priv->select(parent, priv->mux_priv, priv->chan_id); + ret = muxc->select(muxc, priv->chan_id); if (ret >= 0) ret = __i2c_transfer(parent, msgs, num); - if (priv->deselect) - priv->deselect(parent, priv->mux_priv, priv->chan_id); + if (muxc->deselect) + muxc->deselect(muxc, priv->chan_id); return ret; } @@ -65,17 +66,18 @@ static int i2c_mux_smbus_xfer(struct i2c_adapter *adap, int size, union i2c_smbus_data *data) { struct i2c_mux_priv *priv = adap->algo_data; - struct i2c_adapter *parent = priv->parent; + struct i2c_mux_core *muxc = priv->muxc; + struct i2c_adapter *parent = muxc->parent; int ret; /* Select the right mux port and perform the transfer. 
*/ - ret = priv->select(parent, priv->mux_priv, priv->chan_id); + ret = muxc->select(muxc, priv->chan_id); if (ret >= 0) ret = parent->algo->smbus_xfer(parent, addr, flags, read_write, command, size, data); - if (priv->deselect) - priv->deselect(parent, priv->mux_priv, priv->chan_id); + if (muxc->deselect) + muxc->deselect(muxc, priv->chan_id); return ret; } @@ -84,7 +86,7 @@ static int i2c_mux_smbus_xfer(struct i2c_adapter *adap, static u32 i2c_mux_functionality(struct i2c_adapter *adap) { struct i2c_mux_priv *priv = adap->algo_data; - struct i2c_adapter *parent = priv->parent; + struct i2c_adapter *parent = priv->muxc->parent; return parent->algo->functionality(parent); } @@ -102,6 +104,20 @@ static unsigned int i2c_mux_parent_classes(struct i2c_adapter *parent) return class; } +static int i2c_mux_select(struct i2c_mux_core *muxc, u32 chan) +{ + struct i2c_mux_priv_old *priv = i2c_mux_priv(muxc); + + return priv->select(muxc->parent, priv->mux_priv, chan); +} + +static int i2c_mux_deselect(struct i2c_mux_core *muxc, u32 chan) +{ + struct i2c_mux_priv_old *priv = i2c_mux_priv(muxc); + + return priv->deselect(muxc->parent, priv->mux_priv, chan); +} + struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, struct device *mux_dev, void *mux_priv, u32 force_nr, u32 chan_id, @@ -111,21 +127,77 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, int (*deselect) (struct i2c_adapter *, void *, u32)) { + struct i2c_mux_core *muxc; + struct i2c_mux_priv_old *priv; + int ret; + + muxc = i2c_mux_alloc(parent, mux_dev, 1, sizeof(*priv), 0, + i2c_mux_select, i2c_mux_deselect); + if (!muxc) + return NULL; + + priv = i2c_mux_priv(muxc); + priv->select = select; + priv->deselect = deselect; + priv->mux_priv = mux_priv; + + ret = i2c_mux_add_adapter(muxc, force_nr, chan_id, class); + if (ret) { + devm_kfree(mux_dev, muxc); + return NULL; + } + + return muxc->adapter[0]; +} +EXPORT_SYMBOL_GPL(i2c_add_mux_adapter); + +struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent, + struct device *dev, int max_adapters, + int sizeof_priv, u32 flags, + int (*select)(struct i2c_mux_core *, u32), + int (*deselect)(struct i2c_mux_core *, u32)) +{ + struct i2c_mux_core *muxc; + + muxc = devm_kzalloc(dev, sizeof(*muxc) + + max_adapters * sizeof(muxc->adapter[0]) + + sizeof_priv, GFP_KERNEL); + if (!muxc) + return NULL; + if (sizeof_priv) + muxc->priv = &muxc->adapter[max_adapters]; + + muxc->parent = parent; + muxc->dev = dev; + muxc->select = select; + muxc->deselect = deselect; + muxc->max_adapters = max_adapters; + + return muxc; +} +EXPORT_SYMBOL_GPL(i2c_mux_alloc); + +int i2c_mux_add_adapter(struct i2c_mux_core *muxc, + u32 force_nr, u32 chan_id, + unsigned int class) +{ + struct i2c_adapter *parent = muxc->parent; struct i2c_mux_priv *priv; char symlink_name[20]; int ret; - priv = kzalloc(sizeof(struct i2c_mux_priv), GFP_KERNEL); + if (muxc->num_adapters >= muxc->max_adapters) { + dev_err(muxc->dev, "No room for more i2c-mux adapters\n"); + return -EINVAL; + } + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) - return NULL; + return -ENOMEM; /* Set up private adapter data */ - priv->parent = parent; - priv->mux_dev = mux_dev; - priv->mux_priv = mux_priv; + priv->muxc = muxc; priv->chan_id = chan_id; - priv->select = select; - priv->deselect = deselect; /* Need to do algo dynamically because we don't know ahead * of time what sort of physical adapter we'll be dealing with. 
@@ -159,11 +231,11 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, * Try to populate the mux adapter's of_node, expands to * nothing if !CONFIG_OF. */ - if (mux_dev->of_node) { + if (muxc->dev->of_node) { struct device_node *child; u32 reg; - for_each_child_of_node(mux_dev->of_node, child) { + for_each_child_of_node(muxc->dev->of_node, child) { ret = of_property_read_u32(child, "reg", ®); if (ret) continue; @@ -177,8 +249,9 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, /* * Associate the mux channel with an ACPI node. */ - if (has_acpi_companion(mux_dev)) - acpi_preset_companion(&priv->adap.dev, ACPI_COMPANION(mux_dev), + if (has_acpi_companion(muxc->dev)) + acpi_preset_companion(&priv->adap.dev, + ACPI_COMPANION(muxc->dev), chan_id); if (force_nr) { @@ -192,33 +265,53 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, "failed to add mux-adapter (error=%d)\n", ret); kfree(priv); - return NULL; + return ret; } - WARN(sysfs_create_link(&priv->adap.dev.kobj, &mux_dev->kobj, "mux_device"), - "can't create symlink to mux device\n"); + WARN(sysfs_create_link(&priv->adap.dev.kobj, &muxc->dev->kobj, + "mux_device"), + "can't create symlink to mux device\n"); snprintf(symlink_name, sizeof(symlink_name), "channel-%u", chan_id); - WARN(sysfs_create_link(&mux_dev->kobj, &priv->adap.dev.kobj, symlink_name), - "can't create symlink for channel %u\n", chan_id); + WARN(sysfs_create_link(&muxc->dev->kobj, &priv->adap.dev.kobj, + symlink_name), + "can't create symlink for channel %u\n", chan_id); dev_info(&parent->dev, "Added multiplexed i2c bus %d\n", i2c_adapter_id(&priv->adap)); - return &priv->adap; + muxc->adapter[muxc->num_adapters++] = &priv->adap; + return 0; } -EXPORT_SYMBOL_GPL(i2c_add_mux_adapter); +EXPORT_SYMBOL_GPL(i2c_mux_add_adapter); -void i2c_del_mux_adapter(struct i2c_adapter *adap) +void i2c_mux_del_adapters(struct i2c_mux_core *muxc) { - struct i2c_mux_priv *priv = adap->algo_data; char symlink_name[20]; - snprintf(symlink_name, sizeof(symlink_name), "channel-%u", priv->chan_id); - sysfs_remove_link(&priv->mux_dev->kobj, symlink_name); + while (muxc->num_adapters) { + struct i2c_adapter *adap = muxc->adapter[--muxc->num_adapters]; + struct i2c_mux_priv *priv = adap->algo_data; + + muxc->adapter[muxc->num_adapters] = NULL; + + snprintf(symlink_name, sizeof(symlink_name), + "channel-%u", priv->chan_id); + sysfs_remove_link(&muxc->dev->kobj, symlink_name); + + sysfs_remove_link(&priv->adap.dev.kobj, "mux_device"); + i2c_del_adapter(adap); + kfree(priv); + } +} +EXPORT_SYMBOL_GPL(i2c_mux_del_adapters); + +void i2c_del_mux_adapter(struct i2c_adapter *adap) +{ + struct i2c_mux_priv *priv = adap->algo_data; + struct i2c_mux_core *muxc = priv->muxc; - sysfs_remove_link(&priv->adap.dev.kobj, "mux_device"); - i2c_del_adapter(adap); - kfree(priv); + i2c_mux_del_adapters(muxc); + devm_kfree(muxc->dev, muxc); } EXPORT_SYMBOL_GPL(i2c_del_mux_adapter); diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h index b5f9a007a3ab..71ac1b3f4f68 100644 --- a/include/linux/i2c-mux.h +++ b/include/linux/i2c-mux.h @@ -27,6 +27,31 @@ #ifdef __KERNEL__ +struct i2c_mux_core { + struct i2c_adapter *parent; + struct device *dev; + + void *priv; + + int (*select)(struct i2c_mux_core *, u32 chan_id); + int (*deselect)(struct i2c_mux_core *, u32 chan_id); + + int num_adapters; + int max_adapters; + struct i2c_adapter *adapter[0]; +}; + +struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent, + struct device *dev, int max_adapters, + int 
sizeof_priv, u32 flags, + int (*select)(struct i2c_mux_core *, u32), + int (*deselect)(struct i2c_mux_core *, u32)); + +static inline void *i2c_mux_priv(struct i2c_mux_core *muxc) +{ + return muxc->priv; +} + /* * Called to create a i2c bus on a multiplexed bus segment. * The mux_dev and chan_id parameters are passed to the select @@ -41,8 +66,17 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, void *mux_dev, u32 chan_id), int (*deselect) (struct i2c_adapter *, void *mux_dev, u32 chan_id)); +/* + * Called to create an i2c bus on a multiplexed bus segment. + * The chan_id parameter is passed to the select and deselect + * callback functions to perform hardware-specific mux control. + */ +int i2c_mux_add_adapter(struct i2c_mux_core *muxc, + u32 force_nr, u32 chan_id, + unsigned int class); void i2c_del_mux_adapter(struct i2c_adapter *adap); +void i2c_mux_del_adapters(struct i2c_mux_core *muxc); #endif /* __KERNEL__ */ -- cgit v1.2.3 From 23fe440c59b9f08afe108e7ec7b6714cb2a3b955 Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Wed, 2 Mar 2016 15:14:22 +0100 Subject: i2c: mux: drop old unused i2c-mux api All i2c mux users now use an explicit i2c mux core, so drop support for implicit i2c mux cores. Signed-off-by: Peter Rosin Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-mux.c | 63 ------------------------------------------------- include/linux/i2c-mux.h | 15 ------------ 2 files changed, 78 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c index 5ce1b0704cb5..25e9336b0e6e 100644 --- a/drivers/i2c/i2c-mux.c +++ b/drivers/i2c/i2c-mux.c @@ -28,12 +28,6 @@ #include /* multiplexer per channel data */ -struct i2c_mux_priv_old { - void *mux_priv; - int (*select)(struct i2c_adapter *, void *mux_priv, u32 chan_id); - int (*deselect)(struct i2c_adapter *, void *mux_priv, u32 chan_id); -}; - struct i2c_mux_priv { struct i2c_adapter adap; struct i2c_algorithm algo; @@ -104,53 +98,6 @@ static unsigned int i2c_mux_parent_classes(struct i2c_adapter *parent) return class; } -static int i2c_mux_select(struct i2c_mux_core *muxc, u32 chan) -{ - struct i2c_mux_priv_old *priv = i2c_mux_priv(muxc); - - return priv->select(muxc->parent, priv->mux_priv, chan); -} - -static int i2c_mux_deselect(struct i2c_mux_core *muxc, u32 chan) -{ - struct i2c_mux_priv_old *priv = i2c_mux_priv(muxc); - - return priv->deselect(muxc->parent, priv->mux_priv, chan); -} - -struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, - struct device *mux_dev, - void *mux_priv, u32 force_nr, u32 chan_id, - unsigned int class, - int (*select) (struct i2c_adapter *, - void *, u32), - int (*deselect) (struct i2c_adapter *, - void *, u32)) -{ - struct i2c_mux_core *muxc; - struct i2c_mux_priv_old *priv; - int ret; - - muxc = i2c_mux_alloc(parent, mux_dev, 1, sizeof(*priv), 0, - i2c_mux_select, i2c_mux_deselect); - if (!muxc) - return NULL; - - priv = i2c_mux_priv(muxc); - priv->select = select; - priv->deselect = deselect; - priv->mux_priv = mux_priv; - - ret = i2c_mux_add_adapter(muxc, force_nr, chan_id, class); - if (ret) { - devm_kfree(mux_dev, muxc); - return NULL; - } - - return muxc->adapter[0]; -} -EXPORT_SYMBOL_GPL(i2c_add_mux_adapter); - struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent, struct device *dev, int max_adapters, int sizeof_priv, u32 flags, @@ -305,16 +252,6 @@ void i2c_mux_del_adapters(struct i2c_mux_core *muxc) } EXPORT_SYMBOL_GPL(i2c_mux_del_adapters); -void i2c_del_mux_adapter(struct i2c_adapter *adap) -{ - struct i2c_mux_priv
*priv = adap->algo_data; - struct i2c_mux_core *muxc = priv->muxc; - - i2c_mux_del_adapters(muxc); - devm_kfree(muxc->dev, muxc); -} -EXPORT_SYMBOL_GPL(i2c_del_mux_adapter); - MODULE_AUTHOR("Rodolfo Giometti "); MODULE_DESCRIPTION("I2C driver for multiplexed I2C busses"); MODULE_LICENSE("GPL v2"); diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h index 71ac1b3f4f68..2fa93fe1345e 100644 --- a/include/linux/i2c-mux.h +++ b/include/linux/i2c-mux.h @@ -52,20 +52,6 @@ static inline void *i2c_mux_priv(struct i2c_mux_core *muxc) return muxc->priv; } -/* - * Called to create a i2c bus on a multiplexed bus segment. - * The mux_dev and chan_id parameters are passed to the select - * and deselect callback functions to perform hardware-specific - * mux control. - */ -struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, - struct device *mux_dev, - void *mux_priv, u32 force_nr, u32 chan_id, - unsigned int class, - int (*select) (struct i2c_adapter *, - void *mux_dev, u32 chan_id), - int (*deselect) (struct i2c_adapter *, - void *mux_dev, u32 chan_id)); /* * Called to create an i2c bus on a multiplexed bus segment. * The chan_id parameter is passed to the select and deselect @@ -75,7 +61,6 @@ int i2c_mux_add_adapter(struct i2c_mux_core *muxc, u32 force_nr, u32 chan_id, unsigned int class); -void i2c_del_mux_adapter(struct i2c_adapter *adap); void i2c_mux_del_adapters(struct i2c_mux_core *muxc); #endif /* __KERNEL__ */ -- cgit v1.2.3 From 9d90725ddca347450c4ab177ad680ed76063afd4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 18 Mar 2016 11:27:36 -0700 Subject: libnvdimm, blk: move i/o infrastructure to nd_namespace_blk Consolidate the information for issuing i/o to a blk-namespace, and eliminate some pointer chasing. Reviewed-by: Johannes Thumshirn Signed-off-by: Dan Williams --- drivers/nvdimm/blk.c | 137 ++++++++++++++++++++++++++------------------------- include/linux/nd.h | 2 + 2 files changed, 71 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 26d039879ba2..4c14ecdc792b 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -21,17 +21,19 @@ #include #include "nd.h" -struct nd_blk_device { - struct nd_namespace_blk *nsblk; - struct nd_blk_region *ndbr; - size_t disk_size; - u32 sector_size; - u32 internal_lbasize; -}; +static u32 nsblk_meta_size(struct nd_namespace_blk *nsblk) +{ + return nsblk->lbasize - ((nsblk->lbasize >= 4096) ? 
4096 : 512); +} -static u32 nd_blk_meta_size(struct nd_blk_device *blk_dev) +static u32 nsblk_internal_lbasize(struct nd_namespace_blk *nsblk) { - return blk_dev->nsblk->lbasize - blk_dev->sector_size; + return roundup(nsblk->lbasize, INT_LBASIZE_ALIGNMENT); +} + +static u32 nsblk_sector_size(struct nd_namespace_blk *nsblk) +{ + return nsblk->lbasize - nsblk_meta_size(nsblk); } static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk, @@ -55,20 +57,29 @@ static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk, return SIZE_MAX; } +static struct nd_blk_region *to_ndbr(struct nd_namespace_blk *nsblk) +{ + struct nd_region *nd_region; + struct device *parent; + + parent = nsblk->common.dev.parent; + nd_region = container_of(parent, struct nd_region, dev); + return container_of(nd_region, struct nd_blk_region, nd_region); +} + #ifdef CONFIG_BLK_DEV_INTEGRITY -static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev, - struct bio_integrity_payload *bip, u64 lba, - int rw) +static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk, + struct bio_integrity_payload *bip, u64 lba, int rw) { - unsigned int len = nd_blk_meta_size(blk_dev); + struct nd_blk_region *ndbr = to_ndbr(nsblk); + unsigned int len = nsblk_meta_size(nsblk); resource_size_t dev_offset, ns_offset; - struct nd_namespace_blk *nsblk; - struct nd_blk_region *ndbr; + u32 internal_lbasize, sector_size; int err = 0; - nsblk = blk_dev->nsblk; - ndbr = blk_dev->ndbr; - ns_offset = lba * blk_dev->internal_lbasize + blk_dev->sector_size; + internal_lbasize = nsblk_internal_lbasize(nsblk); + sector_size = nsblk_sector_size(nsblk); + ns_offset = lba * internal_lbasize + sector_size; dev_offset = to_dev_offset(nsblk, ns_offset, len); if (dev_offset == SIZE_MAX) return -EIO; @@ -102,25 +113,26 @@ static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev, } #else /* CONFIG_BLK_DEV_INTEGRITY */ -static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev, - struct bio_integrity_payload *bip, u64 lba, - int rw) +static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk, + struct bio_integrity_payload *bip, u64 lba, int rw) { return 0; } #endif -static int nd_blk_do_bvec(struct nd_blk_device *blk_dev, - struct bio_integrity_payload *bip, struct page *page, - unsigned int len, unsigned int off, int rw, - sector_t sector) +static int nsblk_do_bvec(struct nd_namespace_blk *nsblk, + struct bio_integrity_payload *bip, struct page *page, + unsigned int len, unsigned int off, int rw, sector_t sector) { - struct nd_blk_region *ndbr = blk_dev->ndbr; + struct nd_blk_region *ndbr = to_ndbr(nsblk); resource_size_t dev_offset, ns_offset; + u32 internal_lbasize, sector_size; int err = 0; void *iobuf; u64 lba; + internal_lbasize = nsblk_internal_lbasize(nsblk); + sector_size = nsblk_sector_size(nsblk); while (len) { unsigned int cur_len; @@ -130,11 +142,11 @@ static int nd_blk_do_bvec(struct nd_blk_device *blk_dev, * Block Window setup/move steps. the do_io routine is capable * of handling len <= PAGE_SIZE. */ - cur_len = bip ? min(len, blk_dev->sector_size) : len; + cur_len = bip ? 
min(len, sector_size) : len; - lba = div_u64(sector << SECTOR_SHIFT, blk_dev->sector_size); - ns_offset = lba * blk_dev->internal_lbasize; - dev_offset = to_dev_offset(blk_dev->nsblk, ns_offset, cur_len); + lba = div_u64(sector << SECTOR_SHIFT, sector_size); + ns_offset = lba * internal_lbasize; + dev_offset = to_dev_offset(nsblk, ns_offset, cur_len); if (dev_offset == SIZE_MAX) return -EIO; @@ -145,13 +157,13 @@ static int nd_blk_do_bvec(struct nd_blk_device *blk_dev, return err; if (bip) { - err = nd_blk_rw_integrity(blk_dev, bip, lba, rw); + err = nd_blk_rw_integrity(nsblk, bip, lba, rw); if (err) return err; } len -= cur_len; off += cur_len; - sector += blk_dev->sector_size >> SECTOR_SHIFT; + sector += sector_size >> SECTOR_SHIFT; } return err; @@ -160,7 +172,7 @@ static int nd_blk_do_bvec(struct nd_blk_device *blk_dev, static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) { struct bio_integrity_payload *bip; - struct nd_blk_device *blk_dev; + struct nd_namespace_blk *nsblk; struct bvec_iter iter; unsigned long start; struct bio_vec bvec; @@ -179,17 +191,17 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) } bip = bio_integrity(bio); - blk_dev = q->queuedata; + nsblk = q->queuedata; rw = bio_data_dir(bio); do_acct = nd_iostat_start(bio, &start); bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; BUG_ON(len > PAGE_SIZE); - err = nd_blk_do_bvec(blk_dev, bip, bvec.bv_page, len, - bvec.bv_offset, rw, iter.bi_sector); + err = nsblk_do_bvec(nsblk, bip, bvec.bv_page, len, + bvec.bv_offset, rw, iter.bi_sector); if (err) { - dev_dbg(&blk_dev->nsblk->common.dev, + dev_dbg(&nsblk->common.dev, "io error in %s sector %lld, len %d,\n", (rw == READ) ? "READ" : "WRITE", (unsigned long long) iter.bi_sector, len); @@ -205,17 +217,16 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } -static int nd_blk_rw_bytes(struct nd_namespace_common *ndns, +static int nsblk_rw_bytes(struct nd_namespace_common *ndns, resource_size_t offset, void *iobuf, size_t n, int rw) { - struct nd_blk_device *blk_dev = dev_get_drvdata(ndns->claim); - struct nd_namespace_blk *nsblk = blk_dev->nsblk; - struct nd_blk_region *ndbr = blk_dev->ndbr; + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev); + struct nd_blk_region *ndbr = to_ndbr(nsblk); resource_size_t dev_offset; dev_offset = to_dev_offset(nsblk, offset, n); - if (unlikely(offset + n > blk_dev->disk_size)) { + if (unlikely(offset + n > nsblk->size)) { dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); return -EFAULT; } @@ -242,16 +253,16 @@ static void nd_blk_release_disk(void *disk) put_disk(disk); } -static int nd_blk_attach_disk(struct device *dev, - struct nd_namespace_common *ndns, struct nd_blk_device *blk_dev) +static int nsblk_attach_disk(struct nd_namespace_blk *nsblk) { + struct device *dev = &nsblk->common.dev; resource_size_t available_disk_size; struct request_queue *q; struct gendisk *disk; u64 internal_nlba; - internal_nlba = div_u64(blk_dev->disk_size, blk_dev->internal_lbasize); - available_disk_size = internal_nlba * blk_dev->sector_size; + internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk)); + available_disk_size = internal_nlba * nsblk_sector_size(nsblk); q = blk_alloc_queue(GFP_KERNEL); if (!q) @@ -264,9 +275,9 @@ static int nd_blk_attach_disk(struct device *dev, blk_queue_make_request(q, nd_blk_make_request); blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); 
- blk_queue_logical_block_size(q, blk_dev->sector_size); + blk_queue_logical_block_size(q, nsblk_sector_size(nsblk)); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); - q->queuedata = blk_dev; + q->queuedata = nsblk; disk = alloc_disk(0); if (!disk) @@ -276,17 +287,17 @@ static int nd_blk_attach_disk(struct device *dev, return -ENOMEM; } - disk->driverfs_dev = &ndns->dev; + disk->driverfs_dev = dev; disk->first_minor = 0; disk->fops = &nd_blk_fops; disk->queue = q; disk->flags = GENHD_FL_EXT_DEVT; - nvdimm_namespace_disk_name(ndns, disk->disk_name); + nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name); set_capacity(disk, 0); add_disk(disk); - if (nd_blk_meta_size(blk_dev)) { - int rc = nd_integrity_init(disk, nd_blk_meta_size(blk_dev)); + if (nsblk_meta_size(nsblk)) { + int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk)); if (rc) return rc; @@ -301,33 +312,23 @@ static int nd_blk_probe(struct device *dev) { struct nd_namespace_common *ndns; struct nd_namespace_blk *nsblk; - struct nd_blk_device *blk_dev; ndns = nvdimm_namespace_common_probe(dev); if (IS_ERR(ndns)) return PTR_ERR(ndns); - blk_dev = devm_kzalloc(dev, sizeof(*blk_dev), GFP_KERNEL); - if (!blk_dev) - return -ENOMEM; - nsblk = to_nd_namespace_blk(&ndns->dev); - blk_dev->disk_size = nvdimm_namespace_capacity(ndns); - blk_dev->ndbr = to_nd_blk_region(dev->parent); - blk_dev->nsblk = to_nd_namespace_blk(&ndns->dev); - blk_dev->internal_lbasize = roundup(nsblk->lbasize, - INT_LBASIZE_ALIGNMENT); - blk_dev->sector_size = ((nsblk->lbasize >= 4096) ? 4096 : 512); - dev_set_drvdata(dev, blk_dev); - - ndns->rw_bytes = nd_blk_rw_bytes; + nsblk->size = nvdimm_namespace_capacity(ndns); + dev_set_drvdata(dev, nsblk); + + ndns->rw_bytes = nsblk_rw_bytes; if (is_nd_btt(dev)) return nvdimm_namespace_attach_btt(ndns); - else if (nd_btt_probe(dev, ndns, blk_dev) == 0) { + else if (nd_btt_probe(dev, ndns, nsblk) == 0) { /* we'll come back as btt-blk */ return -ENXIO; } else - return nd_blk_attach_disk(dev, ndns, blk_dev); + return nsblk_attach_disk(nsblk); } static int nd_blk_remove(struct device *dev) diff --git a/include/linux/nd.h b/include/linux/nd.h index 5489ab756d1a..5ea4aec7fd63 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -82,6 +82,7 @@ struct nd_namespace_pmem { * @uuid: namespace name supplied in the dimm label * @id: ida allocated id * @lbasize: blk namespaces have a native sector size when btt not present + * @size: sum of all the resource ranges allocated to this namespace * @num_resources: number of dpa extents to claim * @res: discontiguous dpa extents for given dimm */ @@ -91,6 +92,7 @@ struct nd_namespace_blk { u8 *uuid; int id; unsigned long lbasize; + resource_size_t size; int num_resources; struct resource **res; }; -- cgit v1.2.3 From 200c79da824c978fcf6eec1dc9c0a1e521133267 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 22 Mar 2016 00:22:16 -0700 Subject: libnvdimm, pmem, pfn: make pmem_rw_bytes generic and refactor pfn setup In preparation for providing an alternative (to block device) access mechanism to persistent memory, convert pmem_rw_bytes() to nsio_rw_bytes(). This allows ->rw_bytes() functionality without requiring a 'struct pmem_device' to be instantiated. In other words, when ->rw_bytes() is in use i/o is driven through 'struct nd_namespace_io', otherwise it is driven through 'struct pmem_device' and the block layer. 
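[editor's note: as an illustration of the ->rw_bytes() indirection, and not part of the patch, a claiming personality such as BTT can read its metadata without knowing which namespace type backs it. "my_read_info_block" is an invented name and error handling is minimal.]

static int my_read_info_block(struct nd_namespace_common *ndns,
		void *buf, size_t len)
{
	/* offset 0 is the start of the namespace; READ selects the
	 * direction. The same call works whether rw_bytes is backed by
	 * nsblk_rw_bytes() or nsio_rw_bytes(). */
	return ndns->rw_bytes(ndns, 0, buf, len, READ);
}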
This consolidates the disjoint calls to devm_exit_badblocks() and devm_memunmap() into a common devm_nsio_disable() and cleans up the init path to use a unified pmem_attach_disk() implementation. Reviewed-by: Johannes Thumshirn Signed-off-by: Dan Williams --- drivers/nvdimm/blk.c | 2 +- drivers/nvdimm/btt_devs.c | 4 +- drivers/nvdimm/claim.c | 61 ++++++++++ drivers/nvdimm/nd.h | 40 +++++-- drivers/nvdimm/pfn_devs.c | 4 +- drivers/nvdimm/pmem.c | 236 ++++++++++++++------------------------ include/linux/nd.h | 9 +- tools/testing/nvdimm/Kbuild | 1 + tools/testing/nvdimm/test/iomap.c | 27 +++-- 9 files changed, 211 insertions(+), 173 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 4c14ecdc792b..495e06d9f7e7 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -324,7 +324,7 @@ static int nd_blk_probe(struct device *dev) ndns->rw_bytes = nsblk_rw_bytes; if (is_nd_btt(dev)) return nvdimm_namespace_attach_btt(ndns); - else if (nd_btt_probe(dev, ndns, nsblk) == 0) { + else if (nd_btt_probe(dev, ndns) == 0) { /* we'll come back as btt-blk */ return -ENXIO; } else diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 1886171af80e..816d0dae6398 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -273,8 +273,7 @@ static int __nd_btt_probe(struct nd_btt *nd_btt, return 0; } -int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns, - void *drvdata) +int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns) { int rc; struct device *btt_dev; @@ -289,7 +288,6 @@ int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns, nvdimm_bus_unlock(&ndns->dev); if (!btt_dev) return -ENOMEM; - dev_set_drvdata(btt_dev, drvdata); btt_sb = devm_kzalloc(dev, sizeof(*btt_sb), GFP_KERNEL); rc = __nd_btt_probe(to_nd_btt(btt_dev), ndns, btt_sb); dev_dbg(dev, "%s: btt: %s\n", __func__, diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index e8f03b0e95e4..6bbd0a36994a 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -12,6 +12,7 @@ */ #include #include +#include #include "nd-core.h" #include "pfn.h" #include "btt.h" @@ -199,3 +200,63 @@ u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb) return sum; } EXPORT_SYMBOL(nd_sb_checksum); + +static int nsio_rw_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size, int rw) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + + if (unlikely(offset + size > nsio->size)) { + dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); + return -EFAULT; + } + + if (rw == READ) { + unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); + + if (unlikely(is_bad_pmem(&nsio->bb, offset / 512, sz_align))) + return -EIO; + return memcpy_from_pmem(buf, nsio->addr + offset, size); + } else { + memcpy_to_pmem(nsio->addr + offset, buf, size); + wmb_pmem(); + } + + return 0; +} + +int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio) +{ + struct resource *res = &nsio->res; + struct nd_namespace_common *ndns = &nsio->common; + + nsio->size = resource_size(res); + if (!devm_request_mem_region(dev, res->start, resource_size(res), + dev_name(dev))) { + dev_warn(dev, "could not reserve region %pR\n", res); + return -EBUSY; + } + + ndns->rw_bytes = nsio_rw_bytes; + if (devm_init_badblocks(dev, &nsio->bb)) + return -ENOMEM; + nvdimm_badblocks_populate(to_nd_region(ndns->dev.parent), &nsio->bb, + &nsio->res); + + nsio->addr = devm_memremap(dev, res->start, 
resource_size(res), + ARCH_MEMREMAP_PMEM); + if (IS_ERR(nsio->addr)) + return PTR_ERR(nsio->addr); + return 0; +} +EXPORT_SYMBOL_GPL(devm_nsio_enable); + +void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio) +{ + struct resource *res = &nsio->res; + + devm_memunmap(dev, nsio->addr); + devm_exit_badblocks(dev, &nsio->bb); + devm_release_mem_region(dev, res->start, resource_size(res)); +} +EXPORT_SYMBOL_GPL(devm_nsio_disable); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 0fb14890ba26..10e23fe49012 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -13,6 +13,7 @@ #ifndef __ND_H__ #define __ND_H__ #include +#include #include #include #include @@ -197,13 +198,12 @@ struct nd_gen_sb { u64 nd_sb_checksum(struct nd_gen_sb *sb); #if IS_ENABLED(CONFIG_BTT) -int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns, - void *drvdata); +int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns); bool is_nd_btt(struct device *dev); struct device *nd_btt_create(struct nd_region *nd_region); #else static inline int nd_btt_probe(struct device *dev, - struct nd_namespace_common *ndns, void *drvdata) + struct nd_namespace_common *ndns) { return -ENODEV; } @@ -221,14 +221,13 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region) struct nd_pfn *to_nd_pfn(struct device *dev); #if IS_ENABLED(CONFIG_NVDIMM_PFN) -int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns, - void *drvdata); +int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns); bool is_nd_pfn(struct device *dev); struct device *nd_pfn_create(struct nd_region *nd_region); int nd_pfn_validate(struct nd_pfn *nd_pfn); #else -static inline int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns, - void *drvdata) +static inline int nd_pfn_probe(struct device *dev, + struct nd_namespace_common *ndns) { return -ENODEV; } @@ -272,6 +271,20 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); void nvdimm_badblocks_populate(struct nd_region *nd_region, struct badblocks *bb, const struct resource *res); +#if IS_ENABLED(CONFIG_ND_CLAIM) +int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio); +void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio); +#else +static inline int devm_nsio_enable(struct device *dev, + struct nd_namespace_io *nsio) +{ + return -ENXIO; +} +static inline void devm_nsio_disable(struct device *dev, + struct nd_namespace_io *nsio) +{ +} +#endif int nd_blk_region_init(struct nd_region *nd_region); void __nd_iostat_start(struct bio *bio, unsigned long *start); static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) @@ -285,6 +298,19 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) return true; } void nd_iostat_end(struct bio *bio, unsigned long start); +static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, + unsigned int len) +{ + if (bb->count) { + sector_t first_bad; + int num_bad; + + return !!badblocks_check(bb, sector, len / 512, &first_bad, + &num_bad); + } + + return false; +} resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk); const u8 *nd_dev_to_uuid(struct device *dev); bool pmem_should_map_pages(struct device *dev); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 96aa5490c279..9df081ae96e3 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -410,8 +410,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) } 
EXPORT_SYMBOL(nd_pfn_validate); -int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns, - void *drvdata) +int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns) { int rc; struct nd_pfn *nd_pfn; @@ -427,7 +426,6 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns, nvdimm_bus_unlock(&ndns->dev); if (!pfn_dev) return -ENOMEM; - dev_set_drvdata(pfn_dev, drvdata); pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); nd_pfn = to_nd_pfn(pfn_dev); nd_pfn->pfn_sb = pfn_sb; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 67d48e2e8ca2..b5f81b02205c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -49,19 +49,6 @@ struct pmem_device { struct badblocks bb; }; -static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len) -{ - if (bb->count) { - sector_t first_bad; - int num_bad; - - return !!badblocks_check(bb, sector, len / 512, &first_bad, - &num_bad); - } - - return false; -} - static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, unsigned int len) { @@ -209,16 +196,40 @@ void pmem_release_disk(void *disk) put_disk(disk); } -static struct pmem_device *pmem_alloc(struct device *dev, - struct resource *res, int id) +static struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn, + struct resource *res, struct vmem_altmap *altmap); + +static int pmem_attach_disk(struct device *dev, + struct nd_namespace_common *ndns) { + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + struct vmem_altmap __altmap, *altmap = NULL; + struct resource *res = &nsio->res; + struct nd_pfn *nd_pfn = NULL; + int nid = dev_to_node(dev); + struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; + struct resource pfn_res; struct request_queue *q; + struct gendisk *disk; + void *addr; + + /* while nsio_rw_bytes is active, parse a pfn info block if present */ + if (is_nd_pfn(dev)) { + nd_pfn = to_nd_pfn(dev); + altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap); + if (IS_ERR(altmap)) + return PTR_ERR(altmap); + } + + /* we're attaching a block device, disable raw namespace access */ + devm_nsio_disable(dev, nsio); pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); if (!pmem) - return ERR_PTR(-ENOMEM); + return -ENOMEM; + dev_set_drvdata(dev, pmem); pmem->phys_addr = res->start; pmem->size = resource_size(res); if (!arch_has_wmb_pmem()) @@ -227,22 +238,31 @@ static struct pmem_device *pmem_alloc(struct device *dev, if (!devm_request_mem_region(dev, res->start, resource_size(res), dev_name(dev))) { dev_warn(dev, "could not reserve region %pR\n", res); - return ERR_PTR(-EBUSY); + return -EBUSY; } q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev)); if (!q) - return ERR_PTR(-ENOMEM); + return -ENOMEM; + pmem->pmem_queue = q; pmem->pfn_flags = PFN_DEV; - if (pmem_should_map_pages(dev)) { - pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res, + if (is_nd_pfn(dev)) { + addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter, + altmap); + pfn_sb = nd_pfn->pfn_sb; + pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); + pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res); + pmem->pfn_flags |= PFN_MAP; + res = &pfn_res; /* for badblocks populate */ + res->start += pmem->data_offset; + } else if (pmem_should_map_pages(dev)) { + addr = devm_memremap_pages(dev, &nsio->res, &q->q_usage_counter, NULL); pmem->pfn_flags |= PFN_MAP; } else - pmem->virt_addr = (void __pmem *) devm_memremap(dev, - pmem->phys_addr, pmem->size, - ARCH_MEMREMAP_PMEM); + addr = devm_memremap(dev, 
pmem->phys_addr, + pmem->size, ARCH_MEMREMAP_PMEM); /* * At release time the queue must be dead before @@ -250,23 +270,12 @@ static struct pmem_device *pmem_alloc(struct device *dev, */ if (devm_add_action(dev, pmem_release_queue, q)) { blk_cleanup_queue(q); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } - if (IS_ERR(pmem->virt_addr)) - return (void __force *) pmem->virt_addr; - - pmem->pmem_queue = q; - return pmem; -} - -static int pmem_attach_disk(struct device *dev, - struct nd_namespace_common *ndns, struct pmem_device *pmem) -{ - struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); - int nid = dev_to_node(dev); - struct resource bb_res; - struct gendisk *disk; + if (IS_ERR(addr)) + return PTR_ERR(addr); + pmem->virt_addr = (void __pmem *) addr; blk_queue_make_request(pmem->pmem_queue, pmem_make_request); blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE); @@ -291,20 +300,9 @@ static int pmem_attach_disk(struct device *dev, set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) / 512); pmem->pmem_disk = disk; - devm_exit_badblocks(dev, &pmem->bb); if (devm_init_badblocks(dev, &pmem->bb)) return -ENOMEM; - bb_res.start = nsio->res.start + pmem->data_offset; - bb_res.end = nsio->res.end; - if (is_nd_pfn(dev)) { - struct nd_pfn *nd_pfn = to_nd_pfn(dev); - struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; - - bb_res.start += __le32_to_cpu(pfn_sb->start_pad); - bb_res.end -= __le32_to_cpu(pfn_sb->end_trunc); - } - nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, - &bb_res); + nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res); disk->bb = &pmem->bb; add_disk(disk); revalidate_disk(disk); @@ -312,33 +310,8 @@ static int pmem_attach_disk(struct device *dev, return 0; } -static int pmem_rw_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *buf, size_t size, int rw) -{ - struct pmem_device *pmem = dev_get_drvdata(ndns->claim); - - if (unlikely(offset + size > pmem->size)) { - dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); - return -EFAULT; - } - - if (rw == READ) { - unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); - - if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align))) - return -EIO; - return memcpy_from_pmem(buf, pmem->virt_addr + offset, size); - } else { - memcpy_to_pmem(pmem->virt_addr + offset, buf, size); - wmb_pmem(); - } - - return 0; -} - static int nd_pfn_init(struct nd_pfn *nd_pfn) { - struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev); struct nd_namespace_common *ndns = nd_pfn->ndns; u32 start_pad = 0, end_trunc = 0; resource_size_t start, size; @@ -404,7 +377,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) * ->direct_access() to those that are included in the memmap. 
*/ start += start_pad; - npfns = (pmem->size - start_pad - end_trunc - SZ_8K) / SZ_4K; + size = resource_size(&nsio->res); + npfns = (size - start_pad - end_trunc - SZ_8K) / SZ_4K; if (nd_pfn->mode == PFN_MODE_PMEM) offset = ALIGN(start + SZ_8K + 64 * npfns, nd_pfn->align) - start; @@ -413,13 +387,13 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) else return -ENXIO; - if (offset + start_pad + end_trunc >= pmem->size) { + if (offset + start_pad + end_trunc >= size) { dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n", dev_name(&ndns->dev)); return -ENXIO; } - npfns = (pmem->size - offset - start_pad - end_trunc) / SZ_4K; + npfns = (size - offset - start_pad - end_trunc) / SZ_4K; pfn_sb->mode = cpu_to_le32(nd_pfn->mode); pfn_sb->dataoff = cpu_to_le64(offset); pfn_sb->npfns = cpu_to_le64(npfns); @@ -456,17 +430,14 @@ static unsigned long init_altmap_reserve(resource_size_t base) return reserve; } -static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn) +static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn, + struct resource *res, struct vmem_altmap *altmap) { - struct resource res; - struct request_queue *q; - struct pmem_device *pmem; - struct vmem_altmap *altmap; - struct device *dev = &nd_pfn->dev; struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; - struct nd_namespace_common *ndns = nd_pfn->ndns; + u64 offset = le64_to_cpu(pfn_sb->dataoff); u32 start_pad = __le32_to_cpu(pfn_sb->start_pad); u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc); + struct nd_namespace_common *ndns = nd_pfn->ndns; struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); resource_size_t base = nsio->res.start + start_pad; struct vmem_altmap __altmap = { @@ -474,112 +445,75 @@ static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn) .reserve = init_altmap_reserve(base), }; - pmem = dev_get_drvdata(dev); - pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); - pmem->pfn_pad = start_pad + end_trunc; + memcpy(res, &nsio->res, sizeof(*res)); + res->start += start_pad; + res->end -= end_trunc; + nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode); if (nd_pfn->mode == PFN_MODE_RAM) { - if (pmem->data_offset < SZ_8K) - return -EINVAL; + if (offset < SZ_8K) + return ERR_PTR(-EINVAL); nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); altmap = NULL; } else if (nd_pfn->mode == PFN_MODE_PMEM) { - nd_pfn->npfns = (pmem->size - pmem->pfn_pad - pmem->data_offset) - / PAGE_SIZE; + nd_pfn->npfns = (resource_size(res) - offset) / PAGE_SIZE; if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) dev_info(&nd_pfn->dev, "number of pfns truncated from %lld to %ld\n", le64_to_cpu(nd_pfn->pfn_sb->npfns), nd_pfn->npfns); - altmap = & __altmap; - altmap->free = PHYS_PFN(pmem->data_offset - SZ_8K); + memcpy(altmap, &__altmap, sizeof(*altmap)); + altmap->free = PHYS_PFN(offset - SZ_8K); altmap->alloc = 0; } else - return -ENXIO; + return ERR_PTR(-ENXIO); - /* establish pfn range for lookup, and switch to direct map */ - q = pmem->pmem_queue; - memcpy(&res, &nsio->res, sizeof(res)); - res.start += start_pad; - res.end -= end_trunc; - devm_remove_action(dev, pmem_release_queue, q); - devm_memunmap(dev, (void __force *) pmem->virt_addr); - pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &res, - &q->q_usage_counter, altmap); - pmem->pfn_flags |= PFN_MAP; - - /* - * At release time the queue must be dead before - * devm_memremap_pages is unwound - */ - if (devm_add_action(dev, pmem_release_queue, q)) { - blk_cleanup_queue(q); - return -ENOMEM; - } - if (IS_ERR(pmem->virt_addr)) - return 
PTR_ERR(pmem->virt_addr); - - /* attach pmem disk in "pfn-mode" */ - return pmem_attach_disk(dev, ndns, pmem); + return altmap; } -static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) +/* + * Determine the effective resource range and vmem_altmap from an nd_pfn + * instance. + */ +static struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn, + struct resource *res, struct vmem_altmap *altmap) { - struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim); int rc; if (!nd_pfn->uuid || !nd_pfn->ndns) - return -ENODEV; + return ERR_PTR(-ENODEV); rc = nd_pfn_init(nd_pfn); if (rc) - return rc; + return ERR_PTR(rc); + /* we need a valid pfn_sb before we can init a vmem_altmap */ - return __nvdimm_namespace_attach_pfn(nd_pfn); + return __nvdimm_setup_pfn(nd_pfn, res, altmap); } static int nd_pmem_probe(struct device *dev) { - struct nd_region *nd_region = to_nd_region(dev->parent); struct nd_namespace_common *ndns; - struct nd_namespace_io *nsio; - struct pmem_device *pmem; ndns = nvdimm_namespace_common_probe(dev); if (IS_ERR(ndns)) return PTR_ERR(ndns); - nsio = to_nd_namespace_io(&ndns->dev); - pmem = pmem_alloc(dev, &nsio->res, nd_region->id); - if (IS_ERR(pmem)) - return PTR_ERR(pmem); - - dev_set_drvdata(dev, pmem); - ndns->rw_bytes = pmem_rw_bytes; - if (devm_init_badblocks(dev, &pmem->bb)) - return -ENOMEM; - nvdimm_badblocks_populate(nd_region, &pmem->bb, &nsio->res); + if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev))) + return -ENXIO; - if (is_nd_btt(dev)) { - /* btt allocates its own request_queue */ - devm_remove_action(dev, pmem_release_queue, pmem->pmem_queue); - blk_cleanup_queue(pmem->pmem_queue); + if (is_nd_btt(dev)) return nvdimm_namespace_attach_btt(ndns); - } if (is_nd_pfn(dev)) - return nvdimm_namespace_attach_pfn(ndns); + return pmem_attach_disk(dev, ndns); - if (nd_btt_probe(dev, ndns, pmem) == 0 - || nd_pfn_probe(dev, ndns, pmem) == 0) { - /* - * We'll come back as either btt-pmem, or pfn-pmem, so - * drop the queue allocation for now. 
- */ + /* if we find a valid info-block we'll come back as that personality */ + if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0) return -ENXIO; - } - return pmem_attach_disk(dev, ndns, pmem); + /* ...otherwise we're just a raw pmem device */ + return pmem_attach_disk(dev, ndns); } static int nd_pmem_remove(struct device *dev) diff --git a/include/linux/nd.h b/include/linux/nd.h index 5ea4aec7fd63..aee2761d294c 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -15,6 +15,7 @@ #include #include #include +#include enum nvdimm_event { NVDIMM_REVALIDATE_POISON, @@ -55,13 +56,19 @@ static inline struct nd_namespace_common *to_ndns(struct device *dev) } /** - * struct nd_namespace_io - infrastructure for loading an nd_pmem instance + * struct nd_namespace_io - device representation of a persistent memory range * @dev: namespace device created by the nd region driver * @res: struct resource conversion of a NFIT SPA table + * @size: cached resource_size(@res) for fast path size checks + * @addr: virtual address to access the namespace range + * @bb: badblocks list for the namespace range */ struct nd_namespace_io { struct nd_namespace_common common; struct resource res; + resource_size_t size; + void __pmem *addr; + struct badblocks bb; }; /** diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index a34bfd0c8928..d5bc8c080b44 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -7,6 +7,7 @@ ldflags-y += --wrap=ioremap_nocache ldflags-y += --wrap=iounmap ldflags-y += --wrap=memunmap ldflags-y += --wrap=__devm_request_region +ldflags-y += --wrap=__devm_release_region ldflags-y += --wrap=__request_region ldflags-y += --wrap=__release_region ldflags-y += --wrap=devm_memremap_pages diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 0c1a7e65bb81..c842095f2801 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -239,13 +239,11 @@ struct resource *__wrap___devm_request_region(struct device *dev, } EXPORT_SYMBOL(__wrap___devm_request_region); -void __wrap___release_region(struct resource *parent, resource_size_t start, - resource_size_t n) +static bool nfit_test_release_region(struct resource *parent, + resource_size_t start, resource_size_t n) { - struct nfit_test_resource *nfit_res; - if (parent == &iomem_resource) { - nfit_res = get_nfit_res(start); + struct nfit_test_resource *nfit_res = get_nfit_res(start); if (nfit_res) { struct resource *res = nfit_res->res + 1; @@ -254,11 +252,26 @@ void __wrap___release_region(struct resource *parent, resource_size_t start, __func__, start, n, res); else memset(res, 0, sizeof(*res)); - return; + return true; } } - __release_region(parent, start, n); + return false; +} + +void __wrap___release_region(struct resource *parent, resource_size_t start, + resource_size_t n) +{ + if (!nfit_test_release_region(parent, start, n)) + __release_region(parent, start, n); } EXPORT_SYMBOL(__wrap___release_region); +void __wrap___devm_release_region(struct device *dev, struct resource *parent, + resource_size_t start, resource_size_t n) +{ + if (!nfit_test_release_region(parent, start, n)) + __devm_release_region(dev, parent, start, n); +} +EXPORT_SYMBOL(__wrap___devm_release_region); + MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From dfc57732ad38f93ae6232a3b4e64fd077383a0f1 Mon Sep 17 00:00:00 2001 From: Gregor Boirie Date: Wed, 20 Apr 2016 19:23:43 +0200 Subject: iio:core: mounting matrix support Expose a rotation matrix to indicate 
to userspace the chip placement with respect to the overall hardware system. This is needed to adjust coordinates sampled from a sensor chip when its position deviates from the main hardware system. Final coordinate computation is delegated to userspace since: * computation may involve floating point arithmetic; * it allows an application to combine adjustments with arbitrary transformations. This three-dimensional space rotation matrix is expressed as a 3x3 array of strings to support floating point numbers. It may be retrieved from a "[_][_]mount_matrix" sysfs attribute file. It is declared in a device- or driver-specific DTS property or platform data. Signed-off-by: Gregor Boirie Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 51 ++++++++++++++++++++ drivers/iio/industrialio-core.c | 82 +++++++++++++++++++++++++++++++++ include/linux/iio/iio.h | 31 +++++++++++++ 3 files changed, 164 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index f155eff910f9..ba8df69d40b0 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -1512,3 +1512,54 @@ Contact: linux-iio@vger.kernel.org Description: Raw (unscaled no offset etc.) pH reading of a substance as a negative base-10 logarithm of hydrodium ions in a litre of water. + +What: /sys/bus/iio/devices/iio:deviceX/mount_matrix +What: /sys/bus/iio/devices/iio:deviceX/in_mount_matrix +What: /sys/bus/iio/devices/iio:deviceX/out_mount_matrix +KernelVersion: 4.6 +Contact: linux-iio@vger.kernel.org +Description: + Mounting matrix for IIO sensors. This is a rotation matrix which + informs userspace about sensor chip's placement relative to the + main hardware it is mounted on. + Main hardware placement is defined according to the local + reference frame related to the physical quantity the sensor + measures. + Given that the rotation matrix is defined in a board specific + way (platform data and / or device-tree), the main hardware + reference frame definition is left to the implementor's choice + (see below for a magnetometer example). + Applications should apply this rotation matrix to samples so + that when main hardware reference frame is aligned onto local + reference frame, then sensor chip reference frame is also + perfectly aligned with it. + Matrix is a 3x3 unitary matrix and typically looks like + [0, 1, 0; 1, 0, 0; 0, 0, -1]. Identity matrix + [1, 0, 0; 0, 1, 0; 0, 0, 1] means sensor chip and main hardware + are perfectly aligned with each other. + + For example, a mounting matrix for a magnetometer sensor informs + userspace about sensor chip's ORIENTATION relative to the main + hardware. + More specifically, main hardware orientation is defined with + respect to the LOCAL EARTH GEOMAGNETIC REFERENCE FRAME where : + * Y is in the ground plane and positive towards magnetic North ; + * X is in the ground plane, perpendicular to the North axis and + positive towards the East ; + * Z is perpendicular to the ground plane and positive upwards. + + An implementor might consider that for a hand-held device, a + 'natural' orientation would be 'front facing camera at the top'.
+ The main hardware reference frame could then be described as : + * Y is in the plane of the screen and is positive towards the + top of the screen ; + * X is in the plane of the screen, perpendicular to Y axis, and + positive towards the right hand side of the screen ; + * Z is perpendicular to the screen plane and positive out of the + screen. + Another example for a quadrotor UAV might be : + * Y is in the plane of the propellers and positive towards the + front-view camera; + * X is in the plane of the propellers, perpendicular to Y axis, + and positive towards the starboard side of the UAV ; + * Z is perpendicular to propellers plane and positive upwards. diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 190a5939fd8c..e6319a9346b2 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -412,6 +412,88 @@ ssize_t iio_enum_write(struct iio_dev *indio_dev, } EXPORT_SYMBOL_GPL(iio_enum_write); +static const struct iio_mount_matrix iio_mount_idmatrix = { + .rotation = { + "1", "0", "0", + "0", "1", "0", + "0", "0", "1" + } +}; + +static int iio_setup_mount_idmatrix(const struct device *dev, + struct iio_mount_matrix *matrix) +{ + *matrix = iio_mount_idmatrix; + dev_info(dev, "mounting matrix not found: using identity...\n"); + return 0; +} + +ssize_t iio_show_mount_matrix(struct iio_dev *indio_dev, uintptr_t priv, + const struct iio_chan_spec *chan, char *buf) +{ + const struct iio_mount_matrix *mtx = ((iio_get_mount_matrix_t *) + priv)(indio_dev, chan); + + if (IS_ERR(mtx)) + return PTR_ERR(mtx); + + if (!mtx) + mtx = &iio_mount_idmatrix; + + return snprintf(buf, PAGE_SIZE, "%s, %s, %s; %s, %s, %s; %s, %s, %s\n", + mtx->rotation[0], mtx->rotation[1], mtx->rotation[2], + mtx->rotation[3], mtx->rotation[4], mtx->rotation[5], + mtx->rotation[6], mtx->rotation[7], mtx->rotation[8]); +} +EXPORT_SYMBOL_GPL(iio_show_mount_matrix); + +/** + * of_iio_read_mount_matrix() - retrieve iio device mounting matrix from + * device-tree "mount-matrix" property + * @dev: device the mounting matrix property is assigned to + * @propname: device specific mounting matrix property name + * @matrix: where to store retrieved matrix + * + * If device is assigned no mounting matrix property, a default 3x3 identity + * matrix will be filled in. + * + * Return: 0 if success, or a negative error code on failure. + */ +#ifdef CONFIG_OF +int of_iio_read_mount_matrix(const struct device *dev, + const char *propname, + struct iio_mount_matrix *matrix) +{ + if (dev->of_node) { + int err = of_property_read_string_array(dev->of_node, + propname, matrix->rotation, + ARRAY_SIZE(iio_mount_idmatrix.rotation)); + + if (err == ARRAY_SIZE(iio_mount_idmatrix.rotation)) + return 0; + + if (err >= 0) + /* Invalid number of matrix entries. */ + return -EINVAL; + + if (err != -EINVAL) + /* Invalid matrix declaration format. */ + return err; + } + + /* Matrix was not declared at all: fallback to identity. 
*/ + return iio_setup_mount_idmatrix(dev, matrix); } +#else +int of_iio_read_mount_matrix(const struct device *dev, + const char *propname, + struct iio_mount_matrix *matrix) +{ + return iio_setup_mount_idmatrix(dev, matrix); +} +#endif +EXPORT_SYMBOL(of_iio_read_mount_matrix); + /** * iio_format_value() - Formats a IIO value into its string representation * @buf: The buffer to which the formatted value gets written diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 0b2773ada0ba..7c29cb0124ae 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -147,6 +147,37 @@ ssize_t iio_enum_write(struct iio_dev *indio_dev, .private = (uintptr_t)(_e), \ } +/** + * struct iio_mount_matrix - iio mounting matrix + * @rotation: 3 dimensional space rotation matrix defining sensor alignment with + * main hardware + */ +struct iio_mount_matrix { + const char *rotation[9]; +}; + +ssize_t iio_show_mount_matrix(struct iio_dev *indio_dev, uintptr_t priv, + const struct iio_chan_spec *chan, char *buf); +int of_iio_read_mount_matrix(const struct device *dev, const char *propname, + struct iio_mount_matrix *matrix); + +typedef const struct iio_mount_matrix * + (iio_get_mount_matrix_t)(const struct iio_dev *indio_dev, + const struct iio_chan_spec *chan); + +/** + * IIO_MOUNT_MATRIX() - Initialize mount matrix extended channel attribute + * @_shared: Whether the attribute is shared between all channels + * @_get: Pointer to an iio_get_mount_matrix_t accessor + */ +#define IIO_MOUNT_MATRIX(_shared, _get) \ +{ \ + .name = "mount_matrix", \ + .shared = (_shared), \ + .read = iio_show_mount_matrix, \ + .private = (uintptr_t)(_get), \ +} + /** * struct iio_event_spec - specification for a channel event * @type: Type of the event -- cgit v1.2.3 From 97eacb9166f4810368e180073dcbceeff0de34df Mon Sep 17 00:00:00 2001 From: Gregor Boirie Date: Wed, 20 Apr 2016 19:23:44 +0200 Subject: iio:ak8975: add mounting matrix support Expose a rotation matrix to indicate to userspace the chip orientation with respect to the overall hardware system. The matrix is retrieved from "in_mount_matrix". It is declared in the ak8975 DTS entry as a "mount-matrix" property.
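
For illustration, a minimal userspace sketch of consuming this attribute (the device path iio:device0, the raw sample values and the stripped-down error handling are assumptions for the example):

    #include <stdio.h>

    /* Apply a row-major 3x3 mounting matrix m to a sample vector v. */
    static void apply_mount_matrix(const double m[9], const double v[3],
                                   double out[3])
    {
            for (int i = 0; i < 3; i++)
                    out[i] = m[3 * i + 0] * v[0] +
                             m[3 * i + 1] * v[1] +
                             m[3 * i + 2] * v[2];
    }

    int main(void)
    {
            double m[9], raw[3] = { 1.0, 0.0, 0.0 }, adj[3];
            FILE *f = fopen("/sys/bus/iio/devices/iio:device0/in_mount_matrix", "r");

            if (!f || fscanf(f, "%lf, %lf, %lf; %lf, %lf, %lf; %lf, %lf, %lf",
                             &m[0], &m[1], &m[2], &m[3], &m[4], &m[5],
                             &m[6], &m[7], &m[8]) != 9)
                    return 1;
            fclose(f);

            apply_mount_matrix(m, raw, adj);
            printf("adjusted: %f %f %f\n", adj[0], adj[1], adj[2]);
            return 0;
    }

This also shows why the matrix is exported as strings: the rotation coefficients are parsed and applied in floating point entirely in userspace.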
Signed-off-by: Gregor Boirie Acked-by: Rob Herring Signed-off-by: Jonathan Cameron --- .../bindings/iio/magnetometer/ak8975.txt | 10 +++++++ drivers/iio/magnetometer/ak8975.c | 34 +++++++++++++++++++--- include/linux/iio/magnetometer/ak8975.h | 16 ++++++++++ 3 files changed, 56 insertions(+), 4 deletions(-) create mode 100644 include/linux/iio/magnetometer/ak8975.h (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt b/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt index 34a3206eefdf..e1e7dd3259f6 100644 --- a/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt +++ b/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt @@ -9,6 +9,7 @@ Optional properties: - gpios : should be device tree identifier of the magnetometer DRDY pin - vdd-supply: an optional regulator that needs to be on to provide VDD + - mount-matrix: an optional 3x3 mounting rotation matrix Example: @@ -17,4 +18,13 @@ ak8975@0c { reg = <0x0c>; gpios = <&gpj0 7 0>; vdd-supply = <&ldo_3v3_gnss>; + mount-matrix = "-0.984807753012208", /* x0 */ + "0", /* y0 */ + "-0.173648177666930", /* z0 */ + "0", /* x1 */ + "-1", /* y1 */ + "0", /* z1 */ + "-0.173648177666930", /* x2 */ + "0", /* y2 */ + "0.984807753012208"; /* z2 */ }; diff --git a/drivers/iio/magnetometer/ak8975.c b/drivers/iio/magnetometer/ak8975.c index a2aac50a0149..dbf066129a04 100644 --- a/drivers/iio/magnetometer/ak8975.c +++ b/drivers/iio/magnetometer/ak8975.c @@ -40,7 +40,8 @@ #include #include #include -#include + +#include /* * Register definitions, as well as various shifts and masks to get at the @@ -376,6 +377,7 @@ struct ak8975_data { wait_queue_head_t data_ready_queue; unsigned long flags; u8 cntl_cache; + struct iio_mount_matrix orientation; struct regulator *vdd; }; @@ -726,6 +728,18 @@ static int ak8975_read_raw(struct iio_dev *indio_dev, return -EINVAL; } +static const struct iio_mount_matrix * +ak8975_get_mount_matrix(const struct iio_dev *indio_dev, + const struct iio_chan_spec *chan) +{ + return &((struct ak8975_data *)iio_priv(indio_dev))->orientation; +} + +static const struct iio_chan_spec_ext_info ak8975_ext_info[] = { + IIO_MOUNT_MATRIX(IIO_SHARED_BY_DIR, ak8975_get_mount_matrix), + { }, +}; + #define AK8975_CHANNEL(axis, index) \ { \ .type = IIO_MAGN, \ @@ -740,7 +754,8 @@ static int ak8975_read_raw(struct iio_dev *indio_dev, .realbits = 16, \ .storagebits = 16, \ .endianness = IIO_CPU \ - } \ + }, \ + .ext_info = ak8975_ext_info, \ } static const struct iio_chan_spec ak8975_channels[] = { @@ -837,10 +852,12 @@ static int ak8975_probe(struct i2c_client *client, int err; const char *name = NULL; enum asahi_compass_chipset chipset; + const struct ak8975_platform_data *pdata = + dev_get_platdata(&client->dev); /* Grab and set up the supplied GPIO. 
*/ - if (client->dev.platform_data) - eoc_gpio = *(int *)(client->dev.platform_data); + if (pdata) + eoc_gpio = pdata->eoc_gpio; else if (client->dev.of_node) eoc_gpio = of_get_gpio(client->dev.of_node, 0); else @@ -874,6 +891,15 @@ static int ak8975_probe(struct i2c_client *client, data->eoc_gpio = eoc_gpio; data->eoc_irq = 0; + if (!pdata) { + err = of_iio_read_mount_matrix(&client->dev, + "mount-matrix", + &data->orientation); + if (err) + return err; + } else + data->orientation = pdata->orientation; + /* id will be NULL when enumerated via ACPI */ if (id) { chipset = (enum asahi_compass_chipset)(id->driver_data); diff --git a/include/linux/iio/magnetometer/ak8975.h b/include/linux/iio/magnetometer/ak8975.h new file mode 100644 index 000000000000..c8400959d197 --- /dev/null +++ b/include/linux/iio/magnetometer/ak8975.h @@ -0,0 +1,16 @@ +#ifndef __IIO_MAGNETOMETER_AK8975_H__ +#define __IIO_MAGNETOMETER_AK8975_H__ + +#include + +/** + * struct ak8975_platform_data - AK8975 magnetometer driver platform data + * @eoc_gpio: data ready event gpio + * @orientation: mounting matrix relative to main hardware + */ +struct ak8975_platform_data { + int eoc_gpio; + struct iio_mount_matrix orientation; +}; + +#endif -- cgit v1.2.3 From eb3798463f71afc77abd25b2f62708be06f7173b Mon Sep 17 00:00:00 2001 From: Gregor Boirie Date: Wed, 20 Apr 2016 19:23:45 +0200 Subject: iio:imu:mpu6050: enhance mounting matrix support Add a new rotation matrix sysfs attribute compliant with the IIO core mounting matrix API. The matrix is retrieved from the "in_anglvel_mount_matrix" and "in_accel_mount_matrix" sysfs attributes. It is declared in the mpu6050 DTS entry as a "mount-matrix" property. The old interface is kept for backward userspace compatibility and may be populated through the legacy platform_data mechanism only.
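
As an illustration of the legacy path this patch keeps alive, a hypothetical board file (names invented for the example) could still hand the driver a fixed orientation through platform data; each entry must be 0, 1 or -1 with exactly one non-zero value per row and column:

    #include <linux/platform_data/invensense_mpu6050.h>

    /* Hypothetical board file: legacy orientation via platform data,
     * swapping the X/Y axes and inverting Z. */
    static struct inv_mpu6050_platform_data board_mpu_pdata = {
            .orientation = { 0,  1,  0,
                             1,  0,  0,
                             0,  0, -1 },
    };

The structure would then be attached through the usual i2c_board_info .platform_data pointer; DT-based systems use the "mount-matrix" property instead.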
Signed-off-by: Gregor Boirie Acked-by: Rob Herring Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 2 ++ .../devicetree/bindings/iio/imu/inv_mpu6050.txt | 13 ++++++++ drivers/iio/imu/inv_mpu6050/inv_mpu_core.c | 36 ++++++++++++++++++++-- drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h | 4 ++- include/linux/platform_data/invensense_mpu6050.h | 5 ++- 5 files changed, 55 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index ba8df69d40b0..df44998e7506 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -1516,6 +1516,8 @@ Description: What: /sys/bus/iio/devices/iio:deviceX/mount_matrix What: /sys/bus/iio/devices/iio:deviceX/in_mount_matrix What: /sys/bus/iio/devices/iio:deviceX/out_mount_matrix +What: /sys/bus/iio/devices/iio:deviceX/in_anglvel_mount_matrix +What: /sys/bus/iio/devices/iio:deviceX/in_accel_mount_matrix KernelVersion: 4.6 Contact: linux-iio@vger.kernel.org Description: diff --git a/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt b/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt index e4d8f1c52f4a..a9fc11e43b45 100644 --- a/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt +++ b/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt @@ -8,10 +8,23 @@ Required properties: - interrupt-parent : should be the phandle for the interrupt controller - interrupts : interrupt mapping for GPIO IRQ +Optional properties: + - mount-matrix: an optional 3x3 mounting rotation matrix + + Example: mpu6050@68 { compatible = "invensense,mpu6050"; reg = <0x68>; interrupt-parent = <&gpio1>; interrupts = <18 1>; + mount-matrix = "-0.984807753012208", /* x0 */ + "0", /* y0 */ + "-0.173648177666930", /* z0 */ + "0", /* x1 */ + "-1", /* y1 */ + "0", /* z1 */ + "-0.173648177666930", /* x2 */ + "0", /* y2 */ + "0.984807753012208"; /* z2 */ }; diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c index d192953e9a38..482a2490c53a 100644 --- a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c +++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c @@ -600,6 +600,10 @@ inv_fifo_rate_show(struct device *dev, struct device_attribute *attr, /** * inv_attr_show() - calling this function will show current * parameters. + * + * Deprecated in favor of IIO mounting matrix API. 
+ * + * See inv_get_mount_matrix() */ static ssize_t inv_attr_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -644,6 +648,18 @@ static int inv_mpu6050_validate_trigger(struct iio_dev *indio_dev, return 0; } +static const struct iio_mount_matrix * +inv_get_mount_matrix(const struct iio_dev *indio_dev, + const struct iio_chan_spec *chan) +{ + return &((struct inv_mpu6050_state *)iio_priv(indio_dev))->orientation; +} + +static const struct iio_chan_spec_ext_info inv_ext_info[] = { + IIO_MOUNT_MATRIX(IIO_SHARED_BY_TYPE, inv_get_mount_matrix), + { }, +}; + #define INV_MPU6050_CHAN(_type, _channel2, _index) \ { \ .type = _type, \ @@ -660,6 +676,7 @@ static int inv_mpu6050_validate_trigger(struct iio_dev *indio_dev, .shift = 0, \ .endianness = IIO_BE, \ }, \ + .ext_info = inv_ext_info, \ } static const struct iio_chan_spec inv_mpu_channels[] = { @@ -692,14 +709,16 @@ static IIO_CONST_ATTR(in_accel_scale_available, "0.000598 0.001196 0.002392 0.004785"); static IIO_DEV_ATTR_SAMP_FREQ(S_IRUGO | S_IWUSR, inv_fifo_rate_show, inv_mpu6050_fifo_rate_store); + +/* Deprecated: kept for userspace backward compatibility. */ static IIO_DEVICE_ATTR(in_gyro_matrix, S_IRUGO, inv_attr_show, NULL, ATTR_GYRO_MATRIX); static IIO_DEVICE_ATTR(in_accel_matrix, S_IRUGO, inv_attr_show, NULL, ATTR_ACCL_MATRIX); static struct attribute *inv_attributes[] = { - &iio_dev_attr_in_gyro_matrix.dev_attr.attr, - &iio_dev_attr_in_accel_matrix.dev_attr.attr, + &iio_dev_attr_in_gyro_matrix.dev_attr.attr, /* deprecated */ + &iio_dev_attr_in_accel_matrix.dev_attr.attr, /* deprecated */ &iio_dev_attr_sampling_frequency.dev_attr.attr, &iio_const_attr_sampling_frequency_available.dev_attr.attr, &iio_const_attr_in_accel_scale_available.dev_attr.attr, @@ -779,9 +798,20 @@ int inv_mpu_core_probe(struct regmap *regmap, int irq, const char *name, st->powerup_count = 0; st->irq = irq; st->map = regmap; + pdata = dev_get_platdata(dev); - if (pdata) + if (!pdata) { + result = of_iio_read_mount_matrix(dev, "mount-matrix", + &st->orientation); + if (result) { + dev_err(dev, "Failed to retrieve mounting matrix %d\n", + result); + return result; + } + } else { st->plat_data = *pdata; + } + /* power is turned on inside check chip type*/ result = inv_check_and_setup_chip(st); if (result) diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h index e302a49703bf..52d60cdc9f16 100644 --- a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h +++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h @@ -114,7 +114,8 @@ struct inv_mpu6050_hw { * @hw: Other hardware-specific information. * @chip_type: chip type. * @time_stamp_lock: spin lock to time stamp. - * @plat_data: platform data. + * @plat_data: platform data (deprecated in favor of @orientation). + * @orientation: sensor chip orientation relative to main hardware. * @timestamps: kfifo queue to store time stamp. * @map regmap pointer. * @irq interrupt number. 
@@ -131,6 +132,7 @@ struct inv_mpu6050_state { struct i2c_client *mux_client; unsigned int powerup_count; struct inv_mpu6050_platform_data plat_data; + struct iio_mount_matrix orientation; DECLARE_KFIFO(timestamps, long long, TIMESTAMP_FIFO_SIZE); struct regmap *map; int irq; diff --git a/include/linux/platform_data/invensense_mpu6050.h b/include/linux/platform_data/invensense_mpu6050.h index ad3aa7b95f35..554b59801aa8 100644 --- a/include/linux/platform_data/invensense_mpu6050.h +++ b/include/linux/platform_data/invensense_mpu6050.h @@ -16,13 +16,16 @@ /** * struct inv_mpu6050_platform_data - Platform data for the mpu driver - * @orientation: Orientation matrix of the chip + * @orientation: Orientation matrix of the chip (deprecated in favor of + * mounting matrix retrieved from device-tree) * * Contains platform specific information on how to configure the MPU6050 to * work on this platform. The orientation matricies are 3x3 rotation matricies * that are applied to the data to rotate from the mounting orientation to the * platform orientation. The values must be one of 0, 1, or -1 and each row and * column should have exactly 1 non-zero value. + + * Deprecated in favor of mounting matrix retrieved from device-tree. + */ struct inv_mpu6050_platform_data { __s8 orientation[9]; -- cgit v1.2.3 From 41617e1a8dec9fe082ba5dec26bacb154eb55482 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 24 Apr 2016 00:56:07 -0400 Subject: jbd2: add support for avoiding data writes during transaction commits Currently, when a filesystem needs to make sure data is on permanent storage before committing a transaction, it adds the inode to the transaction's inode list. During transaction commit, jbd2 writes back all dirty buffers that have allocated underlying blocks and waits for the IO to finish. However, when doing writeback for delayed-allocated data, we allocate blocks and immediately submit the data, so asking jbd2 to write dirty pages just adds unnecessary work, possibly writing back other redirtied blocks. Add support to jbd2 that allows a filesystem to ask jbd2 to only wait for outstanding data writes before committing a transaction, and thus avoid unnecessary writes.
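
A hedged sketch of how a filesystem might choose between the two variants added below (the helper and its calling context are hypothetical; only the two jbd2 calls come from this patch):

    /* Hypothetical filesystem helper wrapping the new jbd2 calls. */
    static int fs_file_inode(handle_t *handle, struct jbd2_inode *jinode,
                             bool data_already_submitted)
    {
            if (data_already_submitted)
                    /* delayed allocation path: the pages were submitted
                     * right after block allocation, so only wait for the
                     * outstanding IO at commit time */
                    return jbd2_journal_inode_add_wait(handle, jinode);

            /* otherwise have the commit code write and wait as before */
            return jbd2_journal_inode_add_write(handle, jinode);
    }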
Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.h | 3 ++- fs/jbd2/commit.c | 4 ++++ fs/jbd2/journal.c | 3 ++- fs/jbd2/transaction.c | 22 ++++++++++++++++++---- fs/ocfs2/journal.h | 2 +- include/linux/jbd2.h | 13 +++++++++++-- 6 files changed, 38 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5f5846211095..f1c940b38b30 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -362,7 +362,8 @@ static inline int ext4_journal_force_commit(journal_t *journal) static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) { if (ext4_handle_valid(handle)) - return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); + return jbd2_journal_inode_add_write(handle, + EXT4_I(inode)->jinode); return 0; } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2ad98d6e19f4..70078096117d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -219,6 +219,8 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + if (!(jinode->i_flags & JI_WRITE_DATA)) + continue; mapping = jinode->i_vfs_inode->i_mapping; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); @@ -256,6 +258,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal, /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + if (!(jinode->i_flags & JI_WAIT_DATA)) + continue; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 435f0b26ac20..b31852f76f46 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -94,7 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); -EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_inode_add_write); +EXPORT_SYMBOL(jbd2_journal_inode_add_wait); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 67c103867bf8..be56c8ca34c2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2462,7 +2462,8 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) /* * File inode in the inode list of the handle's transaction */ -int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, + unsigned long flags) { transaction_t *transaction = handle->h_transaction; journal_t *journal; @@ -2487,12 +2488,14 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) * and if jinode->i_next_transaction == transaction, commit code * will only file the inode where we want it. */ - if (jinode->i_transaction == transaction || - jinode->i_next_transaction == transaction) + if ((jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) && + (jinode->i_flags & flags) == flags) return 0; spin_lock(&journal->j_list_lock); - + jinode->i_flags |= flags; + /* Is inode already attached where we need it? 
*/ if (jinode->i_transaction == transaction || jinode->i_next_transaction == transaction) goto done; @@ -2523,6 +2526,17 @@ done: return 0; } +int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) +{ + return jbd2_journal_file_inode(handle, jinode, + JI_WRITE_DATA | JI_WAIT_DATA); +} + +int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) +{ + return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA); +} + /* * File truncate and transaction commit interact with each other in a * non-trivial way. If a transaction writing data block A is diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index f4cd3c3e9fb7..497a4171ef61 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -619,7 +619,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) { - return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); + return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode); } static inline int ocfs2_begin_ordered_truncate(struct inode *inode, diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index fd1083c46c61..39511484ad10 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -403,11 +403,19 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) /* Flags in jbd_inode->i_flags */ #define __JI_COMMIT_RUNNING 0 -/* Commit of the inode data in progress. We use this flag to protect us from +#define __JI_WRITE_DATA 1 +#define __JI_WAIT_DATA 2 + +/* + * Commit of the inode data in progress. We use this flag to protect us from * concurrent deletion of inode. We cannot use reference to inode for this * since we cannot afford doing last iput() on behalf of kjournald */ #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) +/* Write allocated dirty buffers in this inode before commit */ +#define JI_WRITE_DATA (1 << __JI_WRITE_DATA) +/* Wait for outstanding data writes for this inode before commit */ +#define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** * struct jbd_inode is the structure linking inodes in ordered mode @@ -1270,7 +1278,8 @@ extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); -extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); +extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode); +extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); -- cgit v1.2.3 From 7ba2f2757d84eae533679306f03c93c118437a87 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 23 Apr 2016 22:47:08 +0200 Subject: spi: core: add hook flash_read_supported to spi_master If the spi_flash_read hook is implemented, the fast flash read feature is enabled for all devices attached to the respective master. In most cases there is just one flash chip; however, there are also devices with more than one flash chip, namely some WiFi routers. Then the fast flash read feature can be used for the first chip only.
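
To illustrate the per-device check this patch enables (a hypothetical controller driver is assumed; see the spi.h diff below for the actual hook), a master whose accelerated read path is wired to the first chip select only could advertise:

    /* Hypothetical controller driver: the accelerated flash read path
     * only reaches chip select 0, so restrict the feature to it. */
    static bool my_master_flash_read_supported(struct spi_device *spi)
    {
            return spi->chip_select == 0;
    }

    /* during controller setup */
    master->spi_flash_read = my_master_spi_flash_read;
    master->flash_read_supported = my_master_flash_read_supported;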
OpenWRT implemented its own handling of this case, using the controller_data element of spi_device to hold the information on whether fast flash read can be used for a device. This patch adds a flash_read_supported hook to spi_master which is used to extend spi_flash_read_supported() by checking whether the fast flash read feature can be used for a specific spi_device. If the hook is not implemented, the default behavior is to allow fast flash read for all devices (if spi_flash_read is implemented). Signed-off-by: Heiner Kallweit Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 857a9a1d82b5..1f03483f61e5 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -372,6 +372,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * @unprepare_message: undo any work done by prepare_message(). * @spi_flash_read: to support spi-controller hardwares that provide * accelerated interface to read from flash devices. + * @flash_read_supported: spi device supports flash read * @cs_gpios: Array of GPIOs to use as chip select lines; one per CS * number. Any individual value may be -ENOENT for CS lines that * are not GPIOs (driven by the SPI controller itself). @@ -529,6 +530,7 @@ struct spi_master { struct spi_message *message); int (*spi_flash_read)(struct spi_device *spi, struct spi_flash_read_message *msg); + bool (*flash_read_supported)(struct spi_device *spi); /* * These hooks are for drivers that use a generic implementation @@ -1158,7 +1160,9 @@ struct spi_flash_read_message { /* SPI core interface for flash read support */ static inline bool spi_flash_read_supported(struct spi_device *spi) { - return spi->master->spi_flash_read ? true : false; + return spi->master->spi_flash_read && + (!spi->master->flash_read_supported || + spi->master->flash_read_supported(spi)); } int spi_flash_read(struct spi_device *spi, -- cgit v1.2.3 From 4899f78a3dccda41ffdaa1a2cbf78209753e0f70 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 6 Apr 2016 12:37:37 -0500 Subject: mailbox/omap: drop legacy platform device support OMAP mailbox devices can no longer be created in legacy non-DT mode; all the relevant code has been cleaned up. The OMAP mailbox driver will only support devices created from DT going forward, so drop the legacy platform device support from the driver. Signed-off-by: Suman Anna Signed-off-by: Jassi Brar --- drivers/mailbox/omap-mailbox.c | 101 ++++++++++++----------------- include/linux/platform_data/mailbox-omap.h | 58 ----------------- 2 files changed, 41 insertions(+), 118 deletions(-) delete mode 100644 include/linux/platform_data/mailbox-omap.h (limited to 'include/linux') diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c index b7f636f15cac..8754d810ef05 100644 --- a/drivers/mailbox/omap-mailbox.c +++ b/drivers/mailbox/omap-mailbox.c @@ -2,7 +2,7 @@ * OMAP mailbox driver * * Copyright (C) 2006-2009 Nokia Corporation. All rights reserved. - * Copyright (C) 2013-2014 Texas Instruments Inc.
+ * Copyright (C) 2013-2016 Texas Instruments Incorporated - http://www.ti.com * * Contact: Hiroshi DOYU * Suman Anna @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -69,6 +68,10 @@ #define MBOX_NR_REGS (MBOX_REG_SIZE / sizeof(u32)) #define OMAP4_MBOX_NR_REGS (OMAP4_MBOX_REG_SIZE / sizeof(u32)) +/* Interrupt register configuration types */ +#define MBOX_INTR_CFG_TYPE1 0 +#define MBOX_INTR_CFG_TYPE2 1 + struct omap_mbox_fifo { unsigned long msg; unsigned long fifo_stat; @@ -696,8 +699,6 @@ static int omap_mbox_probe(struct platform_device *pdev) int ret; struct mbox_chan *chnls; struct omap_mbox **list, *mbox, *mboxblk; - struct omap_mbox_pdata *pdata = pdev->dev.platform_data; - struct omap_mbox_dev_info *info = NULL; struct omap_mbox_fifo_info *finfo, *finfoblk; struct omap_mbox_device *mdev; struct omap_mbox_fifo *fifo; @@ -710,36 +711,26 @@ static int omap_mbox_probe(struct platform_device *pdev) u32 l; int i; - if (!node && (!pdata || !pdata->info_cnt || !pdata->info)) { - pr_err("%s: platform not supported\n", __func__); + if (!node) { + pr_err("%s: only DT-based devices are supported\n", __func__); return -ENODEV; } - if (node) { - match = of_match_device(omap_mailbox_of_match, &pdev->dev); - if (!match) - return -ENODEV; - intr_type = (u32)match->data; + match = of_match_device(omap_mailbox_of_match, &pdev->dev); + if (!match) + return -ENODEV; + intr_type = (u32)match->data; - if (of_property_read_u32(node, "ti,mbox-num-users", - &num_users)) - return -ENODEV; + if (of_property_read_u32(node, "ti,mbox-num-users", &num_users)) + return -ENODEV; - if (of_property_read_u32(node, "ti,mbox-num-fifos", - &num_fifos)) - return -ENODEV; + if (of_property_read_u32(node, "ti,mbox-num-fifos", &num_fifos)) + return -ENODEV; - info_count = of_get_available_child_count(node); - if (!info_count) { - dev_err(&pdev->dev, "no available mbox devices found\n"); - return -ENODEV; - } - } else { /* non-DT device creation */ - info_count = pdata->info_cnt; - info = pdata->info; - intr_type = pdata->intr_type; - num_users = pdata->num_users; - num_fifos = pdata->num_fifos; + info_count = of_get_available_child_count(node); + if (!info_count) { + dev_err(&pdev->dev, "no available mbox devices found\n"); + return -ENODEV; } finfoblk = devm_kzalloc(&pdev->dev, info_count * sizeof(*finfoblk), @@ -750,38 +741,28 @@ static int omap_mbox_probe(struct platform_device *pdev) finfo = finfoblk; child = NULL; for (i = 0; i < info_count; i++, finfo++) { - if (node) { - child = of_get_next_available_child(node, child); - ret = of_property_read_u32_array(child, "ti,mbox-tx", - tmp, ARRAY_SIZE(tmp)); - if (ret) - return ret; - finfo->tx_id = tmp[0]; - finfo->tx_irq = tmp[1]; - finfo->tx_usr = tmp[2]; - - ret = of_property_read_u32_array(child, "ti,mbox-rx", - tmp, ARRAY_SIZE(tmp)); - if (ret) - return ret; - finfo->rx_id = tmp[0]; - finfo->rx_irq = tmp[1]; - finfo->rx_usr = tmp[2]; - - finfo->name = child->name; - - if (of_find_property(child, "ti,mbox-send-noirq", NULL)) - finfo->send_no_irq = true; - } else { - finfo->tx_id = info->tx_id; - finfo->rx_id = info->rx_id; - finfo->tx_usr = info->usr_id; - finfo->tx_irq = info->irq_id; - finfo->rx_usr = info->usr_id; - finfo->rx_irq = info->irq_id; - finfo->name = info->name; - info++; - } + child = of_get_next_available_child(node, child); + ret = of_property_read_u32_array(child, "ti,mbox-tx", tmp, + ARRAY_SIZE(tmp)); + if (ret) + return ret; + finfo->tx_id = tmp[0]; + finfo->tx_irq = tmp[1]; + finfo->tx_usr = tmp[2]; + + ret = 
of_property_read_u32_array(child, "ti,mbox-rx", tmp, + ARRAY_SIZE(tmp)); + if (ret) + return ret; + finfo->rx_id = tmp[0]; + finfo->rx_irq = tmp[1]; + finfo->rx_usr = tmp[2]; + + finfo->name = child->name; + + if (of_find_property(child, "ti,mbox-send-noirq", NULL)) + finfo->send_no_irq = true; + if (finfo->tx_id >= num_fifos || finfo->rx_id >= num_fifos || finfo->tx_usr >= num_users || finfo->rx_usr >= num_users) return -EINVAL; diff --git a/include/linux/platform_data/mailbox-omap.h b/include/linux/platform_data/mailbox-omap.h deleted file mode 100644 index 4631dbb4255e..000000000000 --- a/include/linux/platform_data/mailbox-omap.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * mailbox-omap.h - * - * Copyright (C) 2013 Texas Instruments, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef _PLAT_MAILBOX_H -#define _PLAT_MAILBOX_H - -/* Interrupt register configuration types */ -#define MBOX_INTR_CFG_TYPE1 (0) -#define MBOX_INTR_CFG_TYPE2 (1) - -/** - * struct omap_mbox_dev_info - OMAP mailbox device attribute info - * @name: name of the mailbox device - * @tx_id: mailbox queue id used for transmitting messages - * @rx_id: mailbox queue id on which messages are received - * @irq_id: irq identifier number to use from the hwmod data - * @usr_id: mailbox user id for identifying the interrupt into - * the MPU interrupt controller. - */ -struct omap_mbox_dev_info { - const char *name; - u32 tx_id; - u32 rx_id; - u32 irq_id; - u32 usr_id; -}; - -/** - * struct omap_mbox_pdata - OMAP mailbox platform data - * @intr_type: type of interrupt configuration registers used - while programming mailbox queue interrupts - * @num_users: number of users (processor devices) that the mailbox - * h/w block can interrupt - * @num_fifos: number of h/w fifos within the mailbox h/w block - * @info_cnt: number of mailbox devices for the platform - * @info: array of mailbox device attributes - */ -struct omap_mbox_pdata { - u32 intr_type; - u32 num_users; - u32 num_fifos; - u32 info_cnt; - struct omap_mbox_dev_info *info; -}; - -#endif /* _PLAT_MAILBOX_H */ -- cgit v1.2.3 From dd28216528cf67339cd4f5854166fbd4eefd7fa8 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 6 Apr 2016 18:37:20 -0500 Subject: mailbox/omap: kill omap_mbox_{save/restore}_ctx() functions The omap_mbox_save_ctx() and omap_mbox_restore_ctx() API were previously provided to OMAP mailbox clients to save and restore the mailbox context during system suspend/resume. The save and restore functionality is now implemented through System PM driver callbacks, and there is no need for these functions, so kill these API. 
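
For reference, the replacement mechanism takes the shape of ordinary driver system PM callbacks rather than an exported client API; a rough sketch of that shape (callback names and bodies are assumptions, not the literal driver code):

    /* Sketch only: mailbox register context handled inside the driver. */
    static int omap_mbox_suspend(struct device *dev)
    {
            /* ... save mailbox registers to driver-private storage ... */
            return 0;
    }

    static int omap_mbox_resume(struct device *dev)
    {
            /* ... restore mailbox registers ... */
            return 0;
    }

    static SIMPLE_DEV_PM_OPS(omap_mbox_pm_ops, omap_mbox_suspend,
                             omap_mbox_resume);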
Signed-off-by: Suman Anna Signed-off-by: Jassi Brar --- drivers/mailbox/omap-mailbox.c | 51 ------------------------------------------ include/linux/omap-mailbox.h | 2 -- 2 files changed, 53 deletions(-) (limited to 'include/linux') diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c index d8d3a4bc5262..c5e8b9cb170d 100644 --- a/drivers/mailbox/omap-mailbox.c +++ b/drivers/mailbox/omap-mailbox.c @@ -55,13 +55,6 @@ #define MAILBOX_IRQ_NEWMSG(m) (1 << (2 * (m))) #define MAILBOX_IRQ_NOTFULL(m) (1 << (2 * (m) + 1)) -#define MBOX_REG_SIZE 0x120 - -#define OMAP4_MBOX_REG_SIZE 0x130 - -#define MBOX_NR_REGS (MBOX_REG_SIZE / sizeof(u32)) -#define OMAP4_MBOX_NR_REGS (OMAP4_MBOX_REG_SIZE / sizeof(u32)) - /* Interrupt register configuration types */ #define MBOX_INTR_CFG_TYPE1 0 #define MBOX_INTR_CFG_TYPE2 1 @@ -118,7 +111,6 @@ struct omap_mbox { struct omap_mbox_device *parent; struct omap_mbox_fifo tx_fifo; struct omap_mbox_fifo rx_fifo; - u32 ctx[OMAP4_MBOX_NR_REGS]; u32 intr_type; struct mbox_chan *chan; bool send_no_irq; @@ -209,49 +201,6 @@ static int is_mbox_irq(struct omap_mbox *mbox, omap_mbox_irq_t irq) return (int)(enable & status & bit); } -void omap_mbox_save_ctx(struct mbox_chan *chan) -{ - int i; - int nr_regs; - struct omap_mbox *mbox = mbox_chan_to_omap_mbox(chan); - - if (WARN_ON(!mbox)) - return; - - if (mbox->intr_type) - nr_regs = OMAP4_MBOX_NR_REGS; - else - nr_regs = MBOX_NR_REGS; - for (i = 0; i < nr_regs; i++) { - mbox->ctx[i] = mbox_read_reg(mbox->parent, i * sizeof(u32)); - - dev_dbg(mbox->dev, "%s: [%02x] %08x\n", __func__, - i, mbox->ctx[i]); - } -} -EXPORT_SYMBOL(omap_mbox_save_ctx); - -void omap_mbox_restore_ctx(struct mbox_chan *chan) -{ - int i; - int nr_regs; - struct omap_mbox *mbox = mbox_chan_to_omap_mbox(chan); - - if (WARN_ON(!mbox)) - return; - - if (mbox->intr_type) - nr_regs = OMAP4_MBOX_NR_REGS; - else - nr_regs = MBOX_NR_REGS; - for (i = 0; i < nr_regs; i++) { - mbox_write_reg(mbox->parent, mbox->ctx[i], i * sizeof(u32)); - dev_dbg(mbox->dev, "%s: [%02x] %08x\n", __func__, - i, mbox->ctx[i]); - } -} -EXPORT_SYMBOL(omap_mbox_restore_ctx); - static void _omap_mbox_enable_irq(struct omap_mbox *mbox, omap_mbox_irq_t irq) { u32 l; diff --git a/include/linux/omap-mailbox.h b/include/linux/omap-mailbox.h index 587bbdd31f5a..c726bd833761 100644 --- a/include/linux/omap-mailbox.h +++ b/include/linux/omap-mailbox.h @@ -21,8 +21,6 @@ struct mbox_client; struct mbox_chan *omap_mbox_request_channel(struct mbox_client *cl, const char *chan_name); -void omap_mbox_save_ctx(struct mbox_chan *chan); -void omap_mbox_restore_ctx(struct mbox_chan *chan); void omap_mbox_enable_irq(struct mbox_chan *chan, omap_mbox_irq_t irq); void omap_mbox_disable_irq(struct mbox_chan *chan, omap_mbox_irq_t irq); -- cgit v1.2.3 From db0a6fb5d97afe01fd9c47d37c6daa82d4d4001d Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 21 Apr 2016 14:14:01 -0400 Subject: audit: add tty field to LOGIN event The tty field was missing from AUDIT_LOGIN events. Refactor code to create a new function audit_get_tty(), using it to replace the call in audit_log_task_info() and to add it to audit_log_set_loginuid(). Lock and bump the kref to protect it, adding audit_put_tty() alias to decrement it. 
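
The resulting usage pattern, as the diff below applies it at both call sites, is:

    struct tty_struct *tty = audit_get_tty(current);  /* takes a kref */

    /* ab is an audit_buffer obtained from audit_log_start() */
    audit_log_format(ab, " tty=%s", tty ? tty_name(tty) : "(none)");
    audit_put_tty(tty);  /* drops the reference; NULL-safe */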
Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 24 ++++++++++++++++++++++++ kernel/audit.c | 18 +++++------------- kernel/auditsc.c | 8 ++++++-- 3 files changed, 35 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index b40ed5df5542..32cdafb312d8 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -26,6 +26,7 @@ #include #include #include +#include #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) @@ -343,6 +344,23 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk) return tsk->sessionid; } +static inline struct tty_struct *audit_get_tty(struct task_struct *tsk) +{ + struct tty_struct *tty = NULL; + unsigned long flags; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + if (tsk->signal) + tty = tty_kref_get(tsk->signal->tty); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + return tty; +} + +static inline void audit_put_tty(struct tty_struct *tty) +{ + tty_kref_put(tty); +} + extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm); @@ -500,6 +518,12 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return -1; } +static inline struct tty_struct *audit_get_tty(struct task_struct *tsk) +{ + return NULL; +} +static inline void audit_put_tty(struct tty_struct *tty) +{ } static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, diff --git a/kernel/audit.c b/kernel/audit.c index f52fbefede09..384374a1d232 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -64,7 +64,6 @@ #include #endif #include -#include #include #include @@ -1871,21 +1870,14 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; char comm[sizeof(tsk->comm)]; - char *tty; + struct tty_struct *tty; if (!ab) return; /* tsk == current */ cred = current_cred(); - - spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - spin_unlock_irq(&tsk->sighand->siglock); - + tty = audit_get_tty(tsk); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" @@ -1901,11 +1893,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), - tty, audit_get_sessionid(tsk)); - + tty ? 
tty_name(tty) : "(none)", + audit_get_sessionid(tsk)); + audit_put_tty(tty); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); - audit_log_d_path_exe(ab, tsk->mm); audit_log_task_context(ab); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 195ffaee50b9..71e14d836e69 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1980,6 +1980,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, { struct audit_buffer *ab; uid_t uid, oldloginuid, loginuid; + struct tty_struct *tty; if (!audit_enabled) return; @@ -1987,14 +1988,17 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid), + tty = audit_get_tty(current); ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); audit_log_task_context(ab); - audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", - oldloginuid, loginuid, oldsessionid, sessionid, !rc); + audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", + oldsessionid, sessionid, !rc); + audit_put_tty(tty); audit_log_end(ab); } -- cgit v1.2.3 From dd80b54b18db3d76a43558daaa6ea3aa67f5aacd Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 20 Apr 2016 15:39:11 +0200 Subject: USB: LTM also for USB 3.1 LTM is also defined for SS+. The correct test is to check for anything slower than SS, not exactly SS. Signed-off-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 6a9a0c28415d..29aba76017ee 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -720,7 +720,7 @@ extern void usb_enable_ltm(struct usb_device *udev); static inline bool usb_device_supports_ltm(struct usb_device *udev) { - if (udev->speed != USB_SPEED_SUPER || !udev->bos || !udev->bos->ss_cap) + if (udev->speed < USB_SPEED_SUPER || !udev->bos || !udev->bos->ss_cap) return false; return udev->bos->ss_cap->bmAttributes & USB_LTM_SUPPORT; } -- cgit v1.2.3 From fca504f6054f2a62d966afde23a0cfbf9e3dc32f Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 20 Apr 2016 15:39:12 +0200 Subject: USB: correct intervals for SS+ SS+ also expresses intervals in units of 125 microseconds. Testing must be for SS or faster, not SS exactly. Signed-off-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 29aba76017ee..7824f4557d50 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1569,7 +1569,7 @@ static inline void usb_fill_bulk_urb(struct urb *urb, * Initializes a interrupt urb with the proper information needed to submit * it to a device. * - * Note that High Speed and SuperSpeed interrupt endpoints use a logarithmic + * Note that High Speed and SuperSpeed(+) interrupt endpoints use a logarithmic * encoding of the endpoint interval, and express polling intervals in * microframes (eight per millisecond) rather than in frames (one per * millisecond).
@@ -1595,7 +1595,7 @@ static inline void usb_fill_int_urb(struct urb *urb, urb->complete = complete_fn; urb->context = context; - if (dev->speed == USB_SPEED_HIGH || dev->speed == USB_SPEED_SUPER) { + if (dev->speed == USB_SPEED_HIGH || dev->speed >= USB_SPEED_SUPER) { /* make sure interval is within allowed range */ interval = clamp(interval, 1, 16); -- cgit v1.2.3 From dad56ee742a3abbb5d9e8108f8537d412bff3f57 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 26 Apr 2016 21:22:22 -0400 Subject: tracing: Move event_trigger_unlock_commit{_regs}() to local header The functions event_trigger_unlock_commit() and event_trigger_unlock_commit_regs() are no longer used outside the tracing system. Move them out of the generic headers and into the local one. Along with __event_trigger_test_discard() that is only used by them. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 94 -------------------------------------------- kernel/trace/trace.h | 94 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 5f89a5b0c7e6..70a181cb3585 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -452,100 +452,6 @@ trace_trigger_soft_disabled(struct trace_event_file *file) return false; } -/* - * Helper function for event_trigger_unlock_commit{_regs}(). - * If there are event triggers attached to this event that requires - * filtering against its fields, then they wil be called as the - * entry already holds the field information of the current event. - * - * It also checks if the event should be discarded or not. - * It is to be discarded if the event is soft disabled and the - * event was only recorded to process triggers, or if the event - * filter is active and this event did not match the filters. - * - * Returns true if the event is discarded, false otherwise. - */ -static inline bool -__event_trigger_test_discard(struct trace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, - enum event_trigger_type *tt) -{ - unsigned long eflags = file->flags; - - if (eflags & EVENT_FILE_FL_TRIGGER_COND) - *tt = event_triggers_call(file, entry); - - if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags)) - ring_buffer_discard_commit(buffer, event); - else if (!filter_check_discard(file, entry, buffer, event)) - return false; - - return true; -} - -/** - * event_trigger_unlock_commit - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event - * @buffer: The ring buffer that the event is being written to - * @event: The event meta data in the ring buffer - * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. - * - * This is a helper function to handle triggers that require data - * from the event itself. It also tests the event against filters and - * if the event is soft disabled and should be discarded. 
- */ -static inline void -event_trigger_unlock_commit(struct trace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc) -{ - enum event_trigger_type tt = ETT_NONE; - - if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); - - if (tt) - event_triggers_post_call(file, tt, entry); -} - -/** - * event_trigger_unlock_commit_regs - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event - * @buffer: The ring buffer that the event is being written to - * @event: The event meta data in the ring buffer - * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. - * - * This is a helper function to handle triggers that require data - * from the event itself. It also tests the event against filters and - * if the event is soft disabled and should be discarded. - * - * Same as event_trigger_unlock_commit() but calls - * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). - */ -static inline void -event_trigger_unlock_commit_regs(struct trace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc, - struct pt_regs *regs) -{ - enum event_trigger_type tt = ETT_NONE; - - if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit_regs(file->tr, buffer, event, - irq_flags, pc, regs); - - if (tt) - event_triggers_post_call(file, tt, entry); -} - #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); #else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 727a3d28bce5..c0eac7b1e5a6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1065,6 +1065,100 @@ struct trace_subsystem_dir { int nr_events; }; +/* + * Helper function for event_trigger_unlock_commit{_regs}(). + * If there are event triggers attached to this event that requires + * filtering against its fields, then they wil be called as the + * entry already holds the field information of the current event. + * + * It also checks if the event should be discarded or not. + * It is to be discarded if the event is soft disabled and the + * event was only recorded to process triggers, or if the event + * filter is active and this event did not match the filters. + * + * Returns true if the event is discarded, false otherwise. + */ +static inline bool +__event_trigger_test_discard(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, + enum event_trigger_type *tt) +{ + unsigned long eflags = file->flags; + + if (eflags & EVENT_FILE_FL_TRIGGER_COND) + *tt = event_triggers_call(file, entry); + + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags)) + ring_buffer_discard_commit(buffer, event); + else if (!filter_check_discard(file, entry, buffer, event)) + return false; + + return true; +} + +/** + * event_trigger_unlock_commit - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. 
+ * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + */ +static inline void +event_trigger_unlock_commit(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); + + if (tt) + event_triggers_post_call(file, tt, entry); +} + +/** + * event_trigger_unlock_commit_regs - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + * + * Same as event_trigger_unlock_commit() but calls + * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). + */ +static inline void +event_trigger_unlock_commit_regs(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc, + struct pt_regs *regs) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit_regs(file->tr, buffer, event, + irq_flags, pc, regs); + + if (tt) + event_triggers_post_call(file, tt, entry); +} + #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) -- cgit v1.2.3 From da20dfe6b50ea4c1a82797b7ee8655a370535d73 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 21 Apr 2016 12:53:29 -0700 Subject: fs: fix over-zealous use of "const" When I was fixing up const recommendations from checkpatch.pl, I went overboard. This fixes the warning (during a W=1 build): include/linux/fs.h:2627:74: warning: type qualifiers ignored on function return type [-Wignored-qualifiers] static inline const char * const kernel_read_file_id_str(enum kernel_read_file_id id) Reported-by: Andy Shevchenko Signed-off-by: Kees Cook Signed-off-by: James Morris --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 90477550b935..9847d5c49a0e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2600,7 +2600,7 @@ static const char * const kernel_read_file_str[] = { __kernel_read_file_id(__fid_stringify) }; -static inline const char * const kernel_read_file_id_str(enum kernel_read_file_id id) +static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id) { if (id < 0 || id >= READING_MAX_ID) return kernel_read_file_str[READING_UNKNOWN]; -- cgit v1.2.3 From 65da9a0a3bf2202c2432f42d41eb908f2fa30579 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 27 Apr 2016 10:13:46 -0400 Subject: tracing: Make filter_check_discard() local Nothing outside of the tracing directory calls filter_check_discard() or call_filter_check_discard(). 
They should not be called by modules. Move their prototypes into the local tracing header and remove their EXPORT_SYMBOL() macros. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 6 ------ kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 6 ++++++ 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 70a181cb3585..bb383af35cc7 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -413,12 +413,6 @@ enum event_trigger_type { extern int filter_match_preds(struct event_filter *filter, void *rec); -extern int filter_check_discard(struct trace_event_file *file, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event); -extern int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event); extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, void *rec); extern void event_triggers_post_call(struct trace_event_file *file, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 46028d47d252..02f5a5f51d49 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -318,7 +318,6 @@ int filter_check_discard(struct trace_event_file *file, void *rec, return 0; } -EXPORT_SYMBOL_GPL(filter_check_discard); int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, @@ -332,7 +331,6 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, return 0; } -EXPORT_SYMBOL_GPL(call_filter_check_discard); static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c0eac7b1e5a6..ee8691c66bfe 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1065,6 +1065,12 @@ struct trace_subsystem_dir { int nr_events; }; +extern int filter_check_discard(struct trace_event_file *file, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); +extern int call_filter_check_discard(struct trace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires -- cgit v1.2.3 From 188e3c5cd2b672620291e64a21f1598fe91e40b6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 27 Apr 2016 11:56:04 +0200 Subject: tty: provide tty_name() even without CONFIG_TTY The audit subsystem just started printing the name of the tty, but that causes a build failure when CONFIG_TTY is disabled: kernel/built-in.o: In function `audit_log_task_info': memremap.c:(.text+0x5e34c): undefined reference to `tty_name' kernel/built-in.o: In function `audit_set_loginuid': memremap.c:(.text+0x63b34): undefined reference to `tty_name' This adds tty_name() to the list of functions that are provided as trivial stubs in that configuration. 
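For readers new to this idiom, here is a minimal sketch of the CONFIG-gated stub pattern the patch extends; CONFIG_FOO, struct foo and foo_name() are hypothetical names used only for illustration, not part of the patch:

struct foo;

#ifdef CONFIG_FOO
/* real implementation, provided when the subsystem is built in */
extern const char *foo_name(const struct foo *f);
#else
/* trivial inline stub: callers still compile and link when the
 * subsystem is configured out, and get a safe placeholder value */
static inline const char *foo_name(const struct foo *f)
{ return "(none)"; }
#endif

The key property is that callers such as the audit code need no #ifdefs of their own; the header picks the real symbol or the stub at compile time.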
Signed-off-by: Arnd Bergmann Fixes: db0a6fb5d97a ("audit: add tty field to LOGIN event") Signed-off-by: Paul Moore --- include/linux/tty.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index d9fb4b043f56..a93cce297832 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -371,6 +371,7 @@ extern void proc_clear_tty(struct task_struct *p); extern struct tty_struct *get_current_tty(void); /* tty_io.c */ extern int __init tty_init(void); +extern const char *tty_name(const struct tty_struct *tty); #else static inline void console_init(void) { } @@ -391,6 +392,8 @@ static inline struct tty_struct *get_current_tty(void) /* tty_io.c */ static inline int __init tty_init(void) { return 0; } +static inline const char *tty_name(const struct tty_struct *tty) +{ return "(none)"; } #endif extern struct ktermios tty_std_termios; @@ -415,7 +418,6 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) return tty; } -extern const char *tty_name(const struct tty_struct *tty); extern const char *tty_driver_name(const struct tty_struct *tty); extern void tty_wait_until_sent(struct tty_struct *tty, long timeout); extern int __tty_check_change(struct tty_struct *tty, int sig); -- cgit v1.2.3 From feb26ac31a2a5cb88d86680d9a94916a6343e9e6 Mon Sep 17 00:00:00 2001 From: Chris Bainbridge Date: Mon, 25 Apr 2016 13:48:38 +0100 Subject: usb: core: hub: hub_port_init lock controller instead of bus The XHCI controller presents two USB buses to the system - one for USB2 and one for USB3. The hub init code (hub_port_init) is reentrant but only locks one bus per thread, leading to a race condition failure when two threads attempt to simultaneously initialise a USB2 and USB3 device: [ 8.034843] xhci_hcd 0000:00:14.0: Timeout while waiting for setup device command [ 13.183701] usb 3-3: device descriptor read/all, error -110 On a test system this failure occurred on 6% of all boots. The call traces at the point of failure are: Call Trace: [] schedule+0x37/0x90 [] usb_kill_urb+0x8d/0xd0 [] ? wake_up_atomic_t+0x30/0x30 [] usb_start_wait_urb+0xbe/0x150 [] usb_control_msg+0xbc/0xf0 [] hub_port_init+0x51e/0xb70 [] hub_event+0x817/0x1570 [] process_one_work+0x1ff/0x620 [] ? process_one_work+0x15f/0x620 [] worker_thread+0x64/0x4b0 [] ? rescuer_thread+0x390/0x390 [] kthread+0x105/0x120 [] ? kthread_create_on_node+0x200/0x200 [] ret_from_fork+0x3f/0x70 [] ? kthread_create_on_node+0x200/0x200 Call Trace: [] xhci_setup_device+0x53d/0xa40 [] xhci_address_device+0xe/0x10 [] hub_port_init+0x1bf/0xb70 [] ? trace_hardirqs_on+0xd/0x10 [] hub_event+0x817/0x1570 [] process_one_work+0x1ff/0x620 [] ? process_one_work+0x15f/0x620 [] worker_thread+0x64/0x4b0 [] ? rescuer_thread+0x390/0x390 [] kthread+0x105/0x120 [] ? kthread_create_on_node+0x200/0x200 [] ret_from_fork+0x3f/0x70 [] ? kthread_create_on_node+0x200/0x200 Which results from the two call chains: hub_port_init usb_get_device_descriptor usb_get_descriptor usb_control_msg usb_internal_control_msg usb_start_wait_urb usb_submit_urb / wait_for_completion_timeout / usb_kill_urb hub_port_init hub_set_address xhci_address_device xhci_setup_device Mathias Nyman explains the current behaviour violates the XHCI spec: hub_port_reset() will end up moving the corresponding xhci device slot to default state. 
As hub_port_reset() is called several times in hub_port_init() it sounds reasonable that we could end up with two threads having their xhci device slots in default state at the same time, which according to xhci 4.5.3 specs still is a big no no: "Note: Software shall not transition more than one Device Slot to the Default State at a time" So both threads fail at their next task after this. One fails to read the descriptor, and the other fails addressing the device. Fix this in hub_port_init by locking the USB controller (instead of an individual bus) to prevent simultaneous initialisation of both buses. Fixes: 638139eb95d2 ("usb: hub: allow to process more usb hub events in parallel") Link: https://lkml.org/lkml/2016/2/8/312 Link: https://lkml.org/lkml/2016/2/4/748 Signed-off-by: Chris Bainbridge Cc: stable Acked-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 15 +++++++++++++-- drivers/usb/core/hub.c | 8 ++++---- include/linux/usb.h | 3 +-- include/linux/usb/hcd.h | 1 + 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 2ca2cef7f681..980fc5774151 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -994,7 +994,7 @@ static void usb_bus_init (struct usb_bus *bus) bus->bandwidth_allocated = 0; bus->bandwidth_int_reqs = 0; bus->bandwidth_isoc_reqs = 0; - mutex_init(&bus->usb_address0_mutex); + mutex_init(&bus->devnum_next_mutex); } /*-------------------------------------------------------------------------*/ @@ -2521,6 +2521,14 @@ struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, return NULL; } if (primary_hcd == NULL) { + hcd->address0_mutex = kmalloc(sizeof(*hcd->address0_mutex), + GFP_KERNEL); + if (!hcd->address0_mutex) { + kfree(hcd); + dev_dbg(dev, "hcd address0 mutex alloc failed\n"); + return NULL; + } + mutex_init(hcd->address0_mutex); hcd->bandwidth_mutex = kmalloc(sizeof(*hcd->bandwidth_mutex), GFP_KERNEL); if (!hcd->bandwidth_mutex) { @@ -2532,6 +2540,7 @@ struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, dev_set_drvdata(dev, hcd); } else { mutex_lock(&usb_port_peer_mutex); + hcd->address0_mutex = primary_hcd->address0_mutex; hcd->bandwidth_mutex = primary_hcd->bandwidth_mutex; hcd->primary_hcd = primary_hcd; primary_hcd->primary_hcd = primary_hcd; @@ -2598,8 +2607,10 @@ static void hcd_release(struct kref *kref) struct usb_hcd *hcd = container_of (kref, struct usb_hcd, kref); mutex_lock(&usb_port_peer_mutex); - if (usb_hcd_is_primary_hcd(hcd)) + if (usb_hcd_is_primary_hcd(hcd)) { + kfree(hcd->address0_mutex); kfree(hcd->bandwidth_mutex); + } if (hcd->shared_hcd) { struct usb_hcd *peer = hcd->shared_hcd; diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index c2270d8fac12..bee13517676f 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -2082,7 +2082,7 @@ static void choose_devnum(struct usb_device *udev) struct usb_bus *bus = udev->bus; /* be safe when more hub events are proceed in parallel */ - mutex_lock(&bus->usb_address0_mutex); + mutex_lock(&bus->devnum_next_mutex); if (udev->wusb) { devnum = udev->portnum + 1; BUG_ON(test_bit(devnum, bus->devmap.devicemap)); @@ -2100,7 +2100,7 @@ static void choose_devnum(struct usb_device *udev) set_bit(devnum, bus->devmap.devicemap); udev->devnum = devnum; } - mutex_unlock(&bus->usb_address0_mutex); + mutex_unlock(&bus->devnum_next_mutex); } static void release_devnum(struct usb_device *udev) @@ -4366,7 +4366,7 @@ 
hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, if (oldspeed == USB_SPEED_LOW) delay = HUB_LONG_RESET_TIME; - mutex_lock(&hdev->bus->usb_address0_mutex); + mutex_lock(hcd->address0_mutex); /* Reset the device; full speed may morph to high speed */ /* FIXME a USB 2.0 device may morph into SuperSpeed on reset. */ @@ -4652,7 +4652,7 @@ fail: hub_port_disable(hub, port1, 0); update_devnum(udev, devnum); /* for disconnect processing */ } - mutex_unlock(&hdev->bus->usb_address0_mutex); + mutex_unlock(hcd->address0_mutex); return retval; } diff --git a/include/linux/usb.h b/include/linux/usb.h index 7824f4557d50..01b6c61cf9bb 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -374,13 +374,12 @@ struct usb_bus { int devnum_next; /* Next open device number in * round-robin allocation */ + struct mutex devnum_next_mutex; /* devnum_next mutex */ struct usb_devmap devmap; /* device address allocation map */ struct usb_device *root_hub; /* Root hub */ struct usb_bus *hs_companion; /* Companion EHCI bus, if any */ - struct mutex usb_address0_mutex; /* unaddressed device mutex */ - int bandwidth_allocated; /* on this bus: how much of the time * reserved for periodic (intr/iso) * requests is used, on average? diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index b98f831dcda3..66fc13705ab7 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -181,6 +181,7 @@ struct usb_hcd { * bandwidth_mutex should be dropped after a successful control message * to the device, or resetting the bandwidth after a failed attempt. */ + struct mutex *address0_mutex; struct mutex *bandwidth_mutex; struct usb_hcd *shared_hcd; struct usb_hcd *primary_hcd; -- cgit v1.2.3 From 40abf9be8f52d440e442206182916e3dcc68f722 Mon Sep 17 00:00:00 2001 From: Jerry Hoemann Date: Mon, 11 Apr 2016 15:02:28 -0700 Subject: libnvdimm: increase max envelope size for ioctl nd_ioctl() must first read in the fixed-size portion of an ioctl so that it can then determine the size of the variable part. Prepare for ND_CMD_CALL calls, which have a larger fixed-size envelope. Signed-off-by: Jerry Hoemann Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 833867b9ddc2..af31d1c6fdd7 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -27,7 +27,7 @@ enum { /* need to set a limit somewhere, but yes, this is likely overkill */ ND_IOCTL_MAX_BUFLEN = SZ_4M, ND_CMD_MAX_ELEM = 5, - ND_CMD_MAX_ENVELOPE = 16, + ND_CMD_MAX_ENVELOPE = 256, ND_MAX_MAPPINGS = 32, /* region flag indicating to direct-map persistent memory by default */ -- cgit v1.2.3 From e3654eca70d63704c94a60a2aafc0b3c7b46a00b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Apr 2016 16:17:07 -0700 Subject: nfit, libnvdimm: clarify "commands" vs "_DSMs" Clarify the distinction between "commands", the ioctls userspace calls to request the kernel take some action on a given DIMM device, and "_DSMs", the actual function numbers used in the firmware interface to the DIMM. _DSMs are ACPI-specific, whereas commands are generic to the Linux kernel. This is in preparation for breaking the implicit 1:1 relationship between the kernel ioctl number space and the firmware-specific function numbers. 
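To make the distinction concrete, here is a minimal sketch (not driver code; gate_cmd() is an illustrative helper, not a real kernel symbol) of how a generic command mask gates an ioctl with set_bit()/test_bit(), which is exactly the mechanism the diff below converts from dsm_mask to cmd_mask:

#include <linux/bitops.h>
#include <linux/errno.h>

/* A "command" is a generic ioctl number advertised in cmd_mask; the
 * ACPI _DSM function numbers would live in a separate dsm_mask. */
static int gate_cmd(unsigned long *cmd_mask, unsigned int cmd)
{
	if (!test_bit(cmd, cmd_mask))
		return -ENOTTY;	/* command not supported by this device */
	return 0;
}

/* Provider side: advertise support at init time, as the patch does,
 * e.g. set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask); */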
Cc: Jerry Hoemann Cc: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 21 +++++++++++++-------- drivers/acpi/nfit.h | 4 ++-- drivers/nvdimm/bus.c | 8 ++++---- drivers/nvdimm/core.c | 2 +- drivers/nvdimm/dimm_devs.c | 18 ++++++++++++------ drivers/nvdimm/nd-core.h | 2 +- include/linux/libnvdimm.h | 5 +++-- tools/testing/nvdimm/test/nfit.c | 27 ++++++++++++++------------- 8 files changed, 50 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index d0f35e63640b..1b98e9dc6138 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -175,7 +175,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, union acpi_object in_obj, in_buf, *out_obj; struct device *dev = acpi_desc->dev; const char *cmd_name, *dimm_name; - unsigned long dsm_mask; + unsigned long cmd_mask; acpi_handle handle; const u8 *uuid; u32 offset; @@ -189,7 +189,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, return -ENOTTY; dimm_name = nvdimm_name(nvdimm); cmd_name = nvdimm_cmd_name(cmd); - dsm_mask = nfit_mem->dsm_mask; + cmd_mask = nvdimm_cmd_mask(nvdimm); desc = nd_cmd_dimm_desc(cmd); uuid = to_nfit_uuid(NFIT_DEV_DIMM); handle = adev->handle; @@ -197,7 +197,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct acpi_device *adev = to_acpi_dev(acpi_desc); cmd_name = nvdimm_bus_cmd_name(cmd); - dsm_mask = nd_desc->dsm_mask; + cmd_mask = nd_desc->cmd_mask; desc = nd_cmd_bus_desc(cmd); uuid = to_nfit_uuid(NFIT_DEV_BUS); handle = adev->handle; @@ -207,7 +207,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, if (!desc || (cmd && (desc->out_num + desc->in_num == 0))) return -ENOTTY; - if (!test_bit(cmd, &dsm_mask)) + if (!test_bit(cmd, &cmd_mask)) return -ENOTTY; in_obj.type = ACPI_TYPE_PACKAGE; @@ -926,7 +926,8 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM); int i; - nfit_mem->dsm_mask = acpi_desc->dimm_dsm_force_en; + /* nfit test assumes 1:1 relationship between commands and dsms */ + nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en; adev = to_acpi_dev(acpi_desc); if (!adev) return 0; @@ -976,9 +977,13 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) if (rc) continue; + /* + * For now there is 1:1 relationship between cmd_mask and + * dsm_mask. 
+ */ nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem, acpi_nfit_dimm_attribute_groups, - flags, &nfit_mem->dsm_mask); + flags, nfit_mem->dsm_mask); if (!nvdimm) return -ENOMEM; @@ -1007,14 +1012,14 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) struct acpi_device *adev; int i; - nd_desc->dsm_mask = acpi_desc->bus_dsm_force_en; + nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; adev = to_acpi_dev(acpi_desc); if (!adev) return; for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++) if (acpi_check_dsm(adev->handle, uuid, 1, 1ULL << i)) - set_bit(i, &nd_desc->dsm_mask); + set_bit(i, &nd_desc->cmd_mask); } static ssize_t range_index_show(struct device *dev, diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index c75576b2d50e..332ee6f01662 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -132,8 +132,8 @@ struct acpi_nfit_desc { size_t ars_status_size; struct work_struct work; unsigned int cancel:1; - unsigned long dimm_dsm_force_en; - unsigned long bus_dsm_force_en; + unsigned long dimm_cmd_force_en; + unsigned long bus_cmd_force_en; int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, void *iobuf, u64 len, int rw); }; diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 19f822d7f652..cb2042a12b76 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -589,24 +589,24 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, void __user *p = (void __user *) arg; struct device *dev = &nvdimm_bus->dev; const char *cmd_name, *dimm_name; - unsigned long dsm_mask; + unsigned long cmd_mask; void *buf; int rc, i; if (nvdimm) { desc = nd_cmd_dimm_desc(cmd); cmd_name = nvdimm_cmd_name(cmd); - dsm_mask = nvdimm->dsm_mask ? *(nvdimm->dsm_mask) : 0; + cmd_mask = nvdimm->cmd_mask; dimm_name = dev_name(&nvdimm->dev); } else { desc = nd_cmd_bus_desc(cmd); cmd_name = nvdimm_bus_cmd_name(cmd); - dsm_mask = nd_desc->dsm_mask; + cmd_mask = nd_desc->cmd_mask; dimm_name = "bus"; } if (!desc || (desc->out_num + desc->in_num == 0) || - !test_bit(cmd, &dsm_mask)) + !test_bit(cmd, &cmd_mask)) return -ENOTTY; /* fail write commands (when read-only) */ diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 182a93fe3712..e8688a13cf4f 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -251,7 +251,7 @@ static ssize_t commands_show(struct device *dev, struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; - for_each_set_bit(cmd, &nd_desc->dsm_mask, BITS_PER_LONG) + for_each_set_bit(cmd, &nd_desc->cmd_mask, BITS_PER_LONG) len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd)); len += sprintf(buf + len, "\n"); return len; diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index c56f88217924..79a35a02053c 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -37,9 +37,9 @@ static int __validate_dimm(struct nvdimm_drvdata *ndd) nvdimm = to_nvdimm(ndd->dev); - if (!nvdimm->dsm_mask) + if (!nvdimm->cmd_mask) return -ENXIO; - if (!test_bit(ND_CMD_GET_CONFIG_DATA, nvdimm->dsm_mask)) + if (!test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) return -ENXIO; return 0; @@ -263,6 +263,12 @@ const char *nvdimm_name(struct nvdimm *nvdimm) } EXPORT_SYMBOL_GPL(nvdimm_name); +unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm) +{ + return nvdimm->cmd_mask; +} +EXPORT_SYMBOL_GPL(nvdimm_cmd_mask); + void *nvdimm_provider_data(struct nvdimm *nvdimm) { if (nvdimm) @@ -277,10 +283,10 @@ static ssize_t commands_show(struct 
device *dev, struct nvdimm *nvdimm = to_nvdimm(dev); int cmd, len = 0; - if (!nvdimm->dsm_mask) + if (!nvdimm->cmd_mask) return sprintf(buf, "\n"); - for_each_set_bit(cmd, nvdimm->dsm_mask, BITS_PER_LONG) + for_each_set_bit(cmd, &nvdimm->cmd_mask, BITS_PER_LONG) len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd)); len += sprintf(buf + len, "\n"); return len; @@ -340,7 +346,7 @@ EXPORT_SYMBOL_GPL(nvdimm_attribute_group); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, const struct attribute_group **groups, unsigned long flags, - unsigned long *dsm_mask) + unsigned long cmd_mask) { struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL); struct device *dev; @@ -355,7 +361,7 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, } nvdimm->provider_data = provider_data; nvdimm->flags = flags; - nvdimm->dsm_mask = dsm_mask; + nvdimm->cmd_mask = cmd_mask; atomic_set(&nvdimm->busy, 0); dev = &nvdimm->dev; dev_set_name(dev, "nmem%d", nvdimm->id); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 1d1500f3d8b5..da0d322ed7cb 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -37,7 +37,7 @@ struct nvdimm_bus { struct nvdimm { unsigned long flags; void *provider_data; - unsigned long *dsm_mask; + unsigned long cmd_mask; struct device dev; atomic_t busy; int id; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index af31d1c6fdd7..0c3c30cbbea5 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -68,7 +68,7 @@ struct nd_mapping { struct nvdimm_bus_descriptor { const struct attribute_group **attr_groups; - unsigned long dsm_mask; + unsigned long cmd_mask; char *provider_name; ndctl_fn ndctl; int (*flush_probe)(struct nvdimm_bus_descriptor *nd_desc); @@ -130,10 +130,11 @@ struct nd_region *to_nd_region(struct device *dev); struct nd_blk_region *to_nd_blk_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); +unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, const struct attribute_group **groups, unsigned long flags, - unsigned long *dsm_mask); + unsigned long cmd_mask); const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd); const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd); u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 3187322eeed7..ed899a411c22 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -344,8 +344,9 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, if (nvdimm) { struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + unsigned long cmd_mask = nvdimm_cmd_mask(nvdimm); - if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask)) + if (!nfit_mem || !test_bit(cmd, &cmd_mask)) return -ENOTTY; /* lookup label space for the given dimm */ @@ -374,7 +375,7 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, } else { struct ars_state *ars_state = &t->ars_state; - if (!nd_desc || !test_bit(cmd, &nd_desc->dsm_mask)) + if (!nd_desc || !test_bit(cmd, &nd_desc->cmd_mask)) return -ENOTTY; switch (cmd) { @@ -1251,13 +1252,13 @@ static void nfit_test0_setup(struct nfit_test *t) post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA0_SIZE); acpi_desc = &t->acpi_desc; - 
set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en); - set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); - set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); - set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en); - set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en); - set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en); - set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en); + set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en); + set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en); + set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en); + set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en); + set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en); + set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en); } static void nfit_test1_setup(struct nfit_test *t) @@ -1315,10 +1316,10 @@ static void nfit_test1_setup(struct nfit_test *t) post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE); acpi_desc = &t->acpi_desc; - set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en); - set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en); - set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en); - set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en); + set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en); + set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en); + set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en); } static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, -- cgit v1.2.3 From fa66ddb870ca022342fe6d1312ef76d2f7233a1d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 Apr 2016 12:04:13 -0400 Subject: tracing: Move trace_buffer_unlock_commit{_regs}() to local header The functions trace_buffer_unlock_commit() and the _regs() version are only used within the kernel/trace directory. Move them to the local header and remove the export as well. 
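This commit and the next few apply the same refactoring pattern, so a rough sketch may help; the subsys_* names below are hypothetical, not real kernel symbols:

/* include/linux/subsys.h: the cross-kernel API; anything left here is
 * callable (and potentially exported) outside the subsystem */
int subsys_public_op(struct subsys_dev *dev);

/* kernel/subsys/subsys_local.h: seen only by files in kernel/subsys/,
 * so internal helpers can change freely without touching any user */
void subsys_internal_helper(struct subsys_dev *dev);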
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 9 --------- kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 10 ++++++++++ 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index bb383af35cc7..48cc5e19c5f5 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -158,15 +158,6 @@ struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, int type, unsigned long len, unsigned long flags, int pc); -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_buffer_unlock_commit_regs(struct trace_array *tr, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc, - struct pt_regs *regs); void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1ba54e241c8d..94e7e4d11b79 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1696,7 +1696,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr, ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); ftrace_trace_userstack(buffer, flags, pc); } -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); static struct ring_buffer *temp_buffer; @@ -1748,7 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0862e7559548..bd5ae56dec7a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1068,6 +1068,16 @@ struct trace_subsystem_dir { extern int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); + +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs); /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires -- cgit v1.2.3 From a9fe48dcde88fd48e210e4280f19cb9300ec9112 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 Apr 2016 16:12:39 -0400 Subject: tracing: Remove unused function trace_current_buffer_discard_commit() The function trace_current_buffer_discard_commit() has no callers, remove it. 
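The deleted helper was a one-line wrapper, so any future in-tree user would simply call the ring-buffer API directly. A hedged sketch of that, with reserve_then_drop() and should_drop purely illustrative and the normal commit path elided:

static void reserve_then_drop(struct ring_buffer *buffer, int type,
			      unsigned long len, unsigned long irq_flags,
			      int pc, bool should_drop)
{
	struct ring_buffer_event *event;

	event = trace_buffer_lock_reserve(buffer, type, len, irq_flags, pc);
	if (event && should_drop)
		/* what trace_current_buffer_discard_commit() used to do */
		ring_buffer_discard_commit(buffer, event);
}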
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 2 -- kernel/trace/trace.c | 8 -------- 2 files changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 48cc5e19c5f5..356c39b3abbb 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -158,8 +158,6 @@ struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, int type, unsigned long len, unsigned long flags, int pc); -void trace_current_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event); void tracing_record_cmdline(struct task_struct *tsk); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 94e7e4d11b79..e5bdb9accf52 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1734,7 +1734,6 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } -EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, @@ -1748,13 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_userstack(buffer, flags, pc); } -void trace_current_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - ring_buffer_discard_commit(buffer, event); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); - void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, -- cgit v1.2.3 From 904d1857ad09b43f514897dd42daffe200d1ca50 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 Apr 2016 18:11:54 -0400 Subject: tracing: Remove unused function trace_current_buffer_lock_reserve() trace_current_buffer_lock_reserve() has no more users. Remove it. 
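After this removal, the per-trace_event_file reserve helper kept in the hunk below remains the entry point. A hedged sketch of the reserve/commit pairing, where trace_file, type, len, irq_flags and pc stand for values a real caller already has, and error handling is elided:

struct ring_buffer *buffer;
struct ring_buffer_event *event;

event = trace_event_buffer_lock_reserve(&buffer, trace_file, type,
					len, irq_flags, pc);
if (event)
	event_trigger_unlock_commit(trace_file, buffer, event,
				    ring_buffer_event_data(event),
				    irq_flags, pc);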
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 4 ---- kernel/trace/trace.c | 10 ---------- 2 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 356c39b3abbb..3111a1efdad6 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -154,10 +154,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, struct trace_event_file *trace_file, int type, unsigned long len, unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, - int type, unsigned long len, - unsigned long flags, int pc); void tracing_record_cmdline(struct task_struct *tsk); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 41bf14412666..c09e8ffadc73 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1714,16 +1714,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, - int type, unsigned long len, - unsigned long flags, int pc) -{ - *current_rb = global_trace.trace_buffer.buffer; - return trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); -} - void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, -- cgit v1.2.3 From 3ed605bc8a0a688d8750a1e2eff39c854418c5cf Mon Sep 17 00:00:00 2001 From: Gustavo Padovan Date: Tue, 26 Apr 2016 12:32:27 -0300 Subject: kernel.h: add u64_to_user_ptr() This function had copies in 3 different files. Unify them in kernel.h. Cc: Joe Perches Cc: Andrew Morton Cc: David Airlie Cc: Daniel Vetter Cc: Rob Clark Signed-off-by: Gustavo Padovan Acked-by: Daniel Vetter [drm/i915/] Acked-by: Rob Clark [drm/msm/] Acked-by: Lucas Stach [drm/etinav/] Acked-by: Maarten Lankhorst Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 11 +++-------- drivers/gpu/drm/i915/i915_drv.h | 5 ----- drivers/gpu/drm/i915/i915_gem.c | 14 +++++++------- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 14 +++++++------- drivers/gpu/drm/msm/msm_gem_submit.c | 11 +++-------- include/linux/kernel.h | 7 +++++++ 6 files changed, 27 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c index 236ada93df53..afdd55ddf821 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c @@ -28,11 +28,6 @@ #define BO_LOCKED 0x4000 #define BO_PINNED 0x2000 -static inline void __user *to_user_ptr(u64 address) -{ - return (void __user *)(uintptr_t)address; -} - static struct etnaviv_gem_submit *submit_create(struct drm_device *dev, struct etnaviv_gpu *gpu, size_t nr) { @@ -347,21 +342,21 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, cmdbuf->exec_state = args->exec_state; cmdbuf->ctx = file->driver_priv; - ret = copy_from_user(bos, to_user_ptr(args->bos), + ret = copy_from_user(bos, u64_to_user_ptr(args->bos), args->nr_bos * sizeof(*bos)); if (ret) { ret = -EFAULT; goto err_submit_cmds; } - ret = copy_from_user(relocs, to_user_ptr(args->relocs), + ret = copy_from_user(relocs, u64_to_user_ptr(args->relocs), args->nr_relocs * sizeof(*relocs)); if (ret) { ret = -EFAULT; goto err_submit_cmds; } - ret = copy_from_user(stream, to_user_ptr(args->stream), + ret = 
copy_from_user(stream, u64_to_user_ptr(args->stream), args->stream_size); if (ret) { ret = -EFAULT; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 10480939159c..bb624ccabcb4 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3576,11 +3576,6 @@ static inline i915_reg_t i915_vgacntrl_reg(struct drm_device *dev) return VGACNTRL; } -static inline void __user *to_user_ptr(u64 address) -{ - return (void __user *)(uintptr_t)address; -} - static inline unsigned long msecs_to_jiffies_timeout(const unsigned int m) { unsigned long j = msecs_to_jiffies(m); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index dabc08987b5e..288971670759 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -324,7 +324,7 @@ i915_gem_phys_pwrite(struct drm_i915_gem_object *obj, { struct drm_device *dev = obj->base.dev; void *vaddr = obj->phys_handle->vaddr + args->offset; - char __user *user_data = to_user_ptr(args->data_ptr); + char __user *user_data = u64_to_user_ptr(args->data_ptr); int ret = 0; /* We manually control the domain here and pretend that it @@ -605,7 +605,7 @@ i915_gem_shmem_pread(struct drm_device *dev, int needs_clflush = 0; struct sg_page_iter sg_iter; - user_data = to_user_ptr(args->data_ptr); + user_data = u64_to_user_ptr(args->data_ptr); remain = args->size; obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj); @@ -692,7 +692,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data, return 0; if (!access_ok(VERIFY_WRITE, - to_user_ptr(args->data_ptr), + u64_to_user_ptr(args->data_ptr), args->size)) return -EFAULT; @@ -783,7 +783,7 @@ i915_gem_gtt_pwrite_fast(struct drm_device *dev, if (ret) goto out_unpin; - user_data = to_user_ptr(args->data_ptr); + user_data = u64_to_user_ptr(args->data_ptr); remain = args->size; offset = i915_gem_obj_ggtt_offset(obj) + args->offset; @@ -907,7 +907,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, int needs_clflush_before = 0; struct sg_page_iter sg_iter; - user_data = to_user_ptr(args->data_ptr); + user_data = u64_to_user_ptr(args->data_ptr); remain = args->size; obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj); @@ -1036,12 +1036,12 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, return 0; if (!access_ok(VERIFY_READ, - to_user_ptr(args->data_ptr), + u64_to_user_ptr(args->data_ptr), args->size)) return -EFAULT; if (likely(!i915.prefault_disable)) { - ret = fault_in_multipages_readable(to_user_ptr(args->data_ptr), + ret = fault_in_multipages_readable(u64_to_user_ptr(args->data_ptr), args->size); if (ret) return -EFAULT; diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 1328bc5021b4..e60b4e72a5e4 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -514,7 +514,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma, struct drm_i915_gem_exec_object2 *entry = vma->exec_entry; int remain, ret; - user_relocs = to_user_ptr(entry->relocs_ptr); + user_relocs = u64_to_user_ptr(entry->relocs_ptr); remain = entry->relocation_count; while (remain) { @@ -865,7 +865,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev, u64 invalid_offset = (u64)-1; int j; - user_relocs = to_user_ptr(exec[i].relocs_ptr); + user_relocs = u64_to_user_ptr(exec[i].relocs_ptr); if (copy_from_user(reloc+total, user_relocs, exec[i].relocation_count * sizeof(*reloc))) { @@ -1009,7 +1009,7 @@ 
validate_exec_list(struct drm_device *dev, invalid_flags |= EXEC_OBJECT_NEEDS_GTT; for (i = 0; i < count; i++) { - char __user *ptr = to_user_ptr(exec[i].relocs_ptr); + char __user *ptr = u64_to_user_ptr(exec[i].relocs_ptr); int length; /* limited by fault_in_pages_readable() */ if (exec[i].flags & invalid_flags) @@ -1696,7 +1696,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data, return -ENOMEM; } ret = copy_from_user(exec_list, - to_user_ptr(args->buffers_ptr), + u64_to_user_ptr(args->buffers_ptr), sizeof(*exec_list) * args->buffer_count); if (ret != 0) { DRM_DEBUG("copy %d exec entries failed %d\n", @@ -1732,7 +1732,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data, ret = i915_gem_do_execbuffer(dev, data, file, &exec2, exec2_list); if (!ret) { struct drm_i915_gem_exec_object __user *user_exec_list = - to_user_ptr(args->buffers_ptr); + u64_to_user_ptr(args->buffers_ptr); /* Copy the new buffer offsets back to the user's exec list. */ for (i = 0; i < args->buffer_count; i++) { @@ -1786,7 +1786,7 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data, return -ENOMEM; } ret = copy_from_user(exec2_list, - to_user_ptr(args->buffers_ptr), + u64_to_user_ptr(args->buffers_ptr), sizeof(*exec2_list) * args->buffer_count); if (ret != 0) { DRM_DEBUG("copy %d exec entries failed %d\n", @@ -1799,7 +1799,7 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data, if (!ret) { /* Copy the new buffer offsets back to the user's exec list. */ struct drm_i915_gem_exec_object2 __user *user_exec_list = - to_user_ptr(args->buffers_ptr); + u64_to_user_ptr(args->buffers_ptr); int i; for (i = 0; i < args->buffer_count; i++) { diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 43d2181231c0..23d25283616c 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -28,11 +28,6 @@ #define BO_LOCKED 0x4000 #define BO_PINNED 0x2000 -static inline void __user *to_user_ptr(u64 address) -{ - return (void __user *)(uintptr_t)address; -} - static struct msm_gem_submit *submit_create(struct drm_device *dev, struct msm_gpu *gpu, int nr) { @@ -68,7 +63,7 @@ static int submit_lookup_objects(struct msm_gem_submit *submit, struct drm_gem_object *obj; struct msm_gem_object *msm_obj; void __user *userptr = - to_user_ptr(args->bos + (i * sizeof(submit_bo))); + u64_to_user_ptr(args->bos + (i * sizeof(submit_bo))); ret = copy_from_user(&submit_bo, userptr, sizeof(submit_bo)); if (ret) { @@ -257,7 +252,7 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob for (i = 0; i < nr_relocs; i++) { struct drm_msm_gem_submit_reloc submit_reloc; void __user *userptr = - to_user_ptr(relocs + (i * sizeof(submit_reloc))); + u64_to_user_ptr(relocs + (i * sizeof(submit_reloc))); uint32_t iova, off; bool valid; @@ -356,7 +351,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, for (i = 0; i < args->nr_cmds; i++) { struct drm_msm_gem_submit_cmd submit_cmd; void __user *userptr = - to_user_ptr(args->cmds + (i * sizeof(submit_cmd))); + u64_to_user_ptr(args->cmds + (i * sizeof(submit_cmd))); struct msm_gem_object *msm_obj; uint32_t iova; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2f7775e229b0..f3e45cb0b6a2 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -53,6 +53,13 @@ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) +#define u64_to_user_ptr(x) ( \ +{ \ + typecheck(u64, x); \ + (void __user *)(uintptr_t)x; \ +} \ +) + /* * This looks 
more complex than it should be. But we need to * get the type for the ~ right in round_down (it needs to be -- cgit v1.2.3 From 460bfc41fd52959311ed0328163f785e023857af Mon Sep 17 00:00:00 2001 From: Gustavo Padovan Date: Thu, 28 Apr 2016 10:46:57 -0300 Subject: dma-buf/sync_file: de-stage sync_file headers Move the sync_file header files to the include/ dir. Signed-off-by: Gustavo Padovan Reviewed-by: Daniel Vetter Signed-off-by: Greg Kroah-Hartman --- drivers/staging/android/sync.h | 4 +- drivers/staging/android/sync_debug.c | 2 +- drivers/staging/android/sync_file.c | 4 +- drivers/staging/android/sync_file.h | 57 ------------------ drivers/staging/android/uapi/sync_file.h | 100 ------------------------------- include/linux/sync_file.h | 57 ++++++++++++++++++ include/uapi/linux/sync_file.h | 100 +++++++++++++++++++++++++++++++ 7 files changed, 162 insertions(+), 162 deletions(-) delete mode 100644 drivers/staging/android/sync_file.h delete mode 100644 drivers/staging/android/uapi/sync_file.h create mode 100644 include/linux/sync_file.h create mode 100644 include/uapi/linux/sync_file.h (limited to 'include/linux') diff --git a/drivers/staging/android/sync.h b/drivers/staging/android/sync.h index df44abb95963..b56885c14839 100644 --- a/drivers/staging/android/sync.h +++ b/drivers/staging/android/sync.h @@ -20,8 +20,8 @@ #include #include -#include "sync_file.h" -#include "uapi/sync_file.h" +#include +#include struct sync_timeline; diff --git a/drivers/staging/android/sync_debug.c b/drivers/staging/android/sync_debug.c index 8b55218f5535..5f57499c98bf 100644 --- a/drivers/staging/android/sync_debug.c +++ b/drivers/staging/android/sync_debug.c @@ -26,7 +26,7 @@ #include #include #include -#include "sync_file.h" +#include #include "sw_sync.h" #ifdef CONFIG_DEBUG_FS diff --git a/drivers/staging/android/sync_file.c b/drivers/staging/android/sync_file.c index eabf90dd63b3..f08cf2d8309e 100644 --- a/drivers/staging/android/sync_file.c +++ b/drivers/staging/android/sync_file.c @@ -23,8 +23,8 @@ #include #include #include -#include "sync_file.h" -#include "uapi/sync_file.h" +#include +#include static const struct file_operations sync_file_fops; diff --git a/drivers/staging/android/sync_file.h b/drivers/staging/android/sync_file.h deleted file mode 100644 index c6ffe8b0725c..000000000000 --- a/drivers/staging/android/sync_file.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * include/linux/sync_file.h - * - * Copyright (C) 2012 Google, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef _LINUX_SYNC_FILE_H -#define _LINUX_SYNC_FILE_H - -#include -#include -#include -#include -#include -#include - -struct sync_file_cb { - struct fence_cb cb; - struct fence *fence; - struct sync_file *sync_file; -}; - -/** - * struct sync_file - sync file to export to the userspace - * @file: file representing this fence - * @kref: reference count on fence. - * @name: name of sync_file. 
Useful for debugging - * @sync_file_list: membership in global file list - * @num_fences: number of sync_pts in the fence - * @wq: wait queue for fence signaling - * @status: 0: signaled, >0:active, <0: error - * @cbs: sync_pts callback information - */ -struct sync_file { - struct file *file; - struct kref kref; - char name[32]; -#ifdef CONFIG_DEBUG_FS - struct list_head sync_file_list; -#endif - int num_fences; - - wait_queue_head_t wq; - atomic_t status; - - struct sync_file_cb cbs[]; -}; - -struct sync_file *sync_file_create(struct fence *fence); - -#endif /* _LINUX_SYNC_H */ diff --git a/drivers/staging/android/uapi/sync_file.h b/drivers/staging/android/uapi/sync_file.h deleted file mode 100644 index 413303d37b56..000000000000 --- a/drivers/staging/android/uapi/sync_file.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (C) 2012 Google, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef _UAPI_LINUX_SYNC_H -#define _UAPI_LINUX_SYNC_H - -#include -#include - -/** - * struct sync_merge_data - data passed to merge ioctl - * @name: name of new fence - * @fd2: file descriptor of second fence - * @fence: returns the fd of the new fence to userspace - * @flags: merge_data flags - * @pad: padding for 64-bit alignment, should always be zero - */ -struct sync_merge_data { - char name[32]; - __s32 fd2; - __s32 fence; - __u32 flags; - __u32 pad; -}; - -/** - * struct sync_fence_info - detailed fence information - * @obj_name: name of parent sync_timeline -* @driver_name: name of driver implementing the parent -* @status: status of the fence 0:active 1:signaled <0:error - * @flags: fence_info flags - * @timestamp_ns: timestamp of status change in nanoseconds - */ -struct sync_fence_info { - char obj_name[32]; - char driver_name[32]; - __s32 status; - __u32 flags; - __u64 timestamp_ns; -}; - -/** - * struct sync_file_info - data returned from fence info ioctl - * @name: name of fence - * @status: status of fence. 1: signaled 0:active <0:error - * @flags: sync_file_info flags - * @num_fences number of fences in the sync_file - * @pad: padding for 64-bit alignment, should always be zero - * @sync_fence_info: pointer to array of structs sync_fence_info with all - * fences in the sync_file - */ -struct sync_file_info { - char name[32]; - __s32 status; - __u32 flags; - __u32 num_fences; - __u32 pad; - - __u64 sync_fence_info; -}; - -#define SYNC_IOC_MAGIC '>' - -/** - * Opcodes 0, 1 and 2 were burned during a API change to avoid users of the - * old API to get weird errors when trying to handling sync_files. The API - * change happened during the de-stage of the Sync Framework when there was - * no upstream users available. - */ - -/** - * DOC: SYNC_IOC_MERGE - merge two fences - * - * Takes a struct sync_merge_data. Creates a new fence containing copies of - * the sync_pts in both the calling fd and sync_merge_data.fd2. Returns the - * new fence's fd in sync_merge_data.fence - */ -#define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data) - -/** - * DOC: SYNC_IOC_FENCE_INFO - get detailed information on a fence - * - * Takes a struct sync_file_info_data with extra space allocated for pt_info. - * Caller should write the size of the buffer into len. On return, len is - * updated to reflect the total size of the sync_file_info_data including - * pt_info. 
- * - * pt_info is a buffer containing sync_pt_infos for every sync_pt in the fence. - * To iterate over the sync_pt_infos, use the sync_pt_info.len field. - */ -#define SYNC_IOC_FILE_INFO _IOWR(SYNC_IOC_MAGIC, 4, struct sync_file_info) - -#endif /* _UAPI_LINUX_SYNC_H */ diff --git a/include/linux/sync_file.h b/include/linux/sync_file.h new file mode 100644 index 000000000000..c6ffe8b0725c --- /dev/null +++ b/include/linux/sync_file.h @@ -0,0 +1,57 @@ +/* + * include/linux/sync_file.h + * + * Copyright (C) 2012 Google, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef _LINUX_SYNC_FILE_H +#define _LINUX_SYNC_FILE_H + +#include +#include +#include +#include +#include +#include + +struct sync_file_cb { + struct fence_cb cb; + struct fence *fence; + struct sync_file *sync_file; +}; + +/** + * struct sync_file - sync file to export to the userspace + * @file: file representing this fence + * @kref: reference count on fence. + * @name: name of sync_file. Useful for debugging + * @sync_file_list: membership in global file list + * @num_fences: number of sync_pts in the fence + * @wq: wait queue for fence signaling + * @status: 0: signaled, >0:active, <0: error + * @cbs: sync_pts callback information + */ +struct sync_file { + struct file *file; + struct kref kref; + char name[32]; +#ifdef CONFIG_DEBUG_FS + struct list_head sync_file_list; +#endif + int num_fences; + + wait_queue_head_t wq; + atomic_t status; + + struct sync_file_cb cbs[]; +}; + +struct sync_file *sync_file_create(struct fence *fence); + +#endif /* _LINUX_SYNC_H */ diff --git a/include/uapi/linux/sync_file.h b/include/uapi/linux/sync_file.h new file mode 100644 index 000000000000..413303d37b56 --- /dev/null +++ b/include/uapi/linux/sync_file.h @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2012 Google, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef _UAPI_LINUX_SYNC_H +#define _UAPI_LINUX_SYNC_H + +#include +#include + +/** + * struct sync_merge_data - data passed to merge ioctl + * @name: name of new fence + * @fd2: file descriptor of second fence + * @fence: returns the fd of the new fence to userspace + * @flags: merge_data flags + * @pad: padding for 64-bit alignment, should always be zero + */ +struct sync_merge_data { + char name[32]; + __s32 fd2; + __s32 fence; + __u32 flags; + __u32 pad; +}; + +/** + * struct sync_fence_info - detailed fence information + * @obj_name: name of parent sync_timeline +* @driver_name: name of driver implementing the parent +* @status: status of the fence 0:active 1:signaled <0:error + * @flags: fence_info flags + * @timestamp_ns: timestamp of status change in nanoseconds + */ +struct sync_fence_info { + char obj_name[32]; + char driver_name[32]; + __s32 status; + __u32 flags; + __u64 timestamp_ns; +}; + +/** + * struct sync_file_info - data returned from fence info ioctl + * @name: name of fence + * @status: status of fence. 
1: signaled 0:active <0:error + * @flags: sync_file_info flags + * @num_fences number of fences in the sync_file + * @pad: padding for 64-bit alignment, should always be zero + * @sync_fence_info: pointer to array of structs sync_fence_info with all + * fences in the sync_file + */ +struct sync_file_info { + char name[32]; + __s32 status; + __u32 flags; + __u32 num_fences; + __u32 pad; + + __u64 sync_fence_info; +}; + +#define SYNC_IOC_MAGIC '>' + +/** + * Opcodes 0, 1 and 2 were burned during a API change to avoid users of the + * old API to get weird errors when trying to handling sync_files. The API + * change happened during the de-stage of the Sync Framework when there was + * no upstream users available. + */ + +/** + * DOC: SYNC_IOC_MERGE - merge two fences + * + * Takes a struct sync_merge_data. Creates a new fence containing copies of + * the sync_pts in both the calling fd and sync_merge_data.fd2. Returns the + * new fence's fd in sync_merge_data.fence + */ +#define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data) + +/** + * DOC: SYNC_IOC_FENCE_INFO - get detailed information on a fence + * + * Takes a struct sync_file_info_data with extra space allocated for pt_info. + * Caller should write the size of the buffer into len. On return, len is + * updated to reflect the total size of the sync_file_info_data including + * pt_info. + * + * pt_info is a buffer containing sync_pt_infos for every sync_pt in the fence. + * To iterate over the sync_pt_infos, use the sync_pt_info.len field. + */ +#define SYNC_IOC_FILE_INFO _IOWR(SYNC_IOC_MAGIC, 4, struct sync_file_info) + +#endif /* _UAPI_LINUX_SYNC_H */ -- cgit v1.2.3 From d3feb406733544dbf0e239ef945a09decdceac88 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Thu, 14 Apr 2016 11:37:43 +0200 Subject: phy: bcm-ns-usb2: new driver for USB 2.0 PHY on Northstar MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Northstar is a family of SoCs used in home routers. They have USB 2.0 and 3.0 controllers with PHYs that need to be properly initialized. This driver provides PHY init support in a generic way and can be bound to an EHCI controller driver. There are (just a few) registers defined in the bcma header. This is because the DMU/CRU registers will also be needed in other drivers. We will need them, e.g., in the PCIe controller/PHY driver and at some point probably in a clock driver for the BCM53573 chipset. By using include/linux/bcma/ we avoid code duplication. 
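A hedged worked example of the PLL math the new driver (below) performs: the USB 2.0 PHY needs a fixed 1920 MHz PLL, so the driver solves ndiv = 1920 MHz * pdiv / ref_clk_rate. The 40 MHz reference and the helper name ns_usb2_calc_ndiv() are assumptions for illustration only; the real rate comes from the "phy-ref-clk" clock:

#include <linux/math64.h>

static u32 ns_usb2_calc_ndiv(u32 ref_clk_rate, u32 usb_pll_pdiv)
{
	/* e.g. with a 40 MHz reference and the default pdiv of 1 << 3:
	 * ndiv = 1920000000 * 8 / 40000000 = 384
	 * (the product does not fit in 32 bits, hence 64-bit math) */
	return div_u64(1920000000ULL * usb_pll_pdiv, ref_clk_rate);
}

Sanity check: 40 MHz / 8 * 384 = 1920 MHz, the required PLL rate.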
Signed-off-by: Rafał Miłecki Signed-off-by: Kishon Vijay Abraham I --- .../devicetree/bindings/phy/bcm-ns-usb2-phy.txt | 21 ++++ drivers/phy/Kconfig | 9 ++ drivers/phy/Makefile | 1 + drivers/phy/phy-bcm-ns-usb2.c | 137 +++++++++++++++++++++ include/linux/bcma/bcma.h | 1 + include/linux/bcma/bcma_driver_arm_c9.h | 15 +++ 6 files changed, 184 insertions(+) create mode 100644 Documentation/devicetree/bindings/phy/bcm-ns-usb2-phy.txt create mode 100644 drivers/phy/phy-bcm-ns-usb2.c create mode 100644 include/linux/bcma/bcma_driver_arm_c9.h (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/phy/bcm-ns-usb2-phy.txt b/Documentation/devicetree/bindings/phy/bcm-ns-usb2-phy.txt new file mode 100644 index 000000000000..a7aee9ea8926 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/bcm-ns-usb2-phy.txt @@ -0,0 +1,21 @@ +Driver for Broadcom Northstar USB 2.0 PHY + +Required properties: +- compatible: brcm,ns-usb2-phy +- reg: iomem address range of DMU (Device Management Unit) +- reg-names: "dmu", the only needed & supported reg right now +- clocks: USB PHY reference clock +- clock-names: "phy-ref-clk", the only needed & supported clock right now + +To initialize USB 2.0 PHY driver needs to setup PLL correctly. To do this it +requires passing phandle to the USB PHY reference clock. + +Example: + usb2-phy { + compatible = "brcm,ns-usb2-phy"; + reg = <0x1800c000 0x1000>; + reg-names = "dmu"; + #phy-cells = <0>; + clocks = <&genpll BCM_NSP_GENPLL_USB_PHY_REF_CLK>; + clock-names = "phy-ref-clk"; + }; diff --git a/drivers/phy/Kconfig b/drivers/phy/Kconfig index f6ff76ec89dc..f2b458fc0c89 100644 --- a/drivers/phy/Kconfig +++ b/drivers/phy/Kconfig @@ -15,6 +15,15 @@ config GENERIC_PHY phy users can obtain reference to the PHY. All the users of this framework should select this config. +config PHY_BCM_NS_USB2 + tristate "Broadcom Northstar USB 2.0 PHY Driver" + depends on ARCH_BCM_IPROC || COMPILE_TEST + depends on HAS_IOMEM && OF + select GENERIC_PHY + help + Enable this to support Broadcom USB 2.0 PHY connected to the USB + controller on Northstar family. + config PHY_BERLIN_USB tristate "Marvell Berlin USB PHY Driver" depends on ARCH_BERLIN && RESET_CONTROLLER && HAS_IOMEM && OF diff --git a/drivers/phy/Makefile b/drivers/phy/Makefile index f03fa1fdf322..0de09e13fdbc 100644 --- a/drivers/phy/Makefile +++ b/drivers/phy/Makefile @@ -3,6 +3,7 @@ # obj-$(CONFIG_GENERIC_PHY) += phy-core.o +obj-$(CONFIG_PHY_BCM_NS_USB2) += phy-bcm-ns-usb2.o obj-$(CONFIG_PHY_BERLIN_USB) += phy-berlin-usb.o obj-$(CONFIG_PHY_BERLIN_SATA) += phy-berlin-sata.o obj-$(CONFIG_PHY_DM816X_USB) += phy-dm816x-usb.o diff --git a/drivers/phy/phy-bcm-ns-usb2.c b/drivers/phy/phy-bcm-ns-usb2.c new file mode 100644 index 000000000000..95ab6b2a0de5 --- /dev/null +++ b/drivers/phy/phy-bcm-ns-usb2.c @@ -0,0 +1,137 @@ +/* + * Broadcom Northstar USB 2.0 PHY Driver + * + * Copyright (C) 2016 Rafał Miłecki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bcm_ns_usb2 { + struct device *dev; + struct clk *ref_clk; + struct phy *phy; + void __iomem *dmu; +}; + +static int bcm_ns_usb2_phy_init(struct phy *phy) +{ + struct bcm_ns_usb2 *usb2 = phy_get_drvdata(phy); + struct device *dev = usb2->dev; + void __iomem *dmu = usb2->dmu; + u32 ref_clk_rate, usb2ctl, usb_pll_ndiv, usb_pll_pdiv; + int err = 0; + + err = clk_prepare_enable(usb2->ref_clk); + if (err < 0) { + dev_err(dev, "Failed to prepare ref clock: %d\n", err); + goto err_out; + } + + ref_clk_rate = clk_get_rate(usb2->ref_clk); + if (!ref_clk_rate) { + dev_err(dev, "Failed to get ref clock rate\n"); + err = -EINVAL; + goto err_clk_off; + } + + usb2ctl = readl(dmu + BCMA_DMU_CRU_USB2_CONTROL); + + if (usb2ctl & BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_PDIV_MASK) { + usb_pll_pdiv = usb2ctl; + usb_pll_pdiv &= BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_PDIV_MASK; + usb_pll_pdiv >>= BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_PDIV_SHIFT; + } else { + usb_pll_pdiv = 1 << 3; + } + + /* Calculate ndiv from the fixed 1920 MHz rate required by the USB 2.0 PHY */ + usb_pll_ndiv = (1920000000 * usb_pll_pdiv) / ref_clk_rate; + + /* Unlock DMU PLL settings with some magic value */ + writel(0x0000ea68, dmu + BCMA_DMU_CRU_CLKSET_KEY); + + /* Write USB 2.0 PLL control setting */ + usb2ctl &= ~BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_NDIV_MASK; + usb2ctl |= usb_pll_ndiv << BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_NDIV_SHIFT; + writel(usb2ctl, dmu + BCMA_DMU_CRU_USB2_CONTROL); + + /* Lock DMU PLL settings */ + writel(0x00000000, dmu + BCMA_DMU_CRU_CLKSET_KEY); + +err_clk_off: + clk_disable_unprepare(usb2->ref_clk); +err_out: + return err; +} + +static const struct phy_ops ops = { + .init = bcm_ns_usb2_phy_init, + .owner = THIS_MODULE, +}; + +static int bcm_ns_usb2_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct bcm_ns_usb2 *usb2; + struct resource *res; + struct phy_provider *phy_provider; + + usb2 = devm_kzalloc(&pdev->dev, sizeof(*usb2), GFP_KERNEL); + if (!usb2) + return -ENOMEM; + usb2->dev = dev; + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dmu"); + usb2->dmu = devm_ioremap_resource(dev, res); + if (IS_ERR(usb2->dmu)) { + dev_err(dev, "Failed to map DMU regs\n"); + return PTR_ERR(usb2->dmu); + } + + usb2->ref_clk = devm_clk_get(dev, "phy-ref-clk"); + if (IS_ERR(usb2->ref_clk)) { + dev_err(dev, "Clock not defined\n"); + return PTR_ERR(usb2->ref_clk); + } + + usb2->phy = devm_phy_create(dev, NULL, &ops); + if (IS_ERR(usb2->phy)) + return PTR_ERR(usb2->phy); + + phy_set_drvdata(usb2->phy, usb2); + platform_set_drvdata(pdev, usb2); + + phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate); + return PTR_ERR_OR_ZERO(phy_provider); +} + +static const struct of_device_id bcm_ns_usb2_id_table[] = { + { .compatible = "brcm,ns-usb2-phy", }, + {}, +}; +MODULE_DEVICE_TABLE(of, bcm_ns_usb2_id_table); + +static struct platform_driver bcm_ns_usb2_driver = { + .probe = bcm_ns_usb2_probe, + .driver = { + .name = "bcm_ns_usb2", + .of_match_table = bcm_ns_usb2_id_table, + }, +}; +module_platform_driver(bcm_ns_usb2_driver); + +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 0367c63f5960..e6b41f42602b 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -4,6 +4,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/bcma/bcma_driver_arm_c9.h b/include/linux/bcma/bcma_driver_arm_c9.h new file mode 
100644 index 000000000000..93bd73d670d5 --- /dev/null +++ b/include/linux/bcma/bcma_driver_arm_c9.h @@ -0,0 +1,15 @@ +#ifndef LINUX_BCMA_DRIVER_ARM_C9_H_ +#define LINUX_BCMA_DRIVER_ARM_C9_H_ + +/* DMU (Device Management Unit) */ +#define BCMA_DMU_CRU_USB2_CONTROL 0x0164 +#define BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_NDIV_MASK 0x00000FFC +#define BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_NDIV_SHIFT 2 +#define BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_PDIV_MASK 0x00007000 +#define BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_PDIV_SHIFT 12 +#define BCMA_DMU_CRU_CLKSET_KEY 0x0180 +#define BCMA_DMU_CRU_STRAPS_CTRL 0x02A0 +#define BCMA_DMU_CRU_STRAPS_CTRL_USB3 0x00000010 +#define BCMA_DMU_CRU_STRAPS_CTRL_4BYTE 0x00008000 + +#endif /* LINUX_BCMA_DRIVER_ARM_C9_H_ */ -- cgit v1.2.3 From 71f5c63c07e5be7abdce40891778ffbf3cec04f0 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 23 Mar 2016 12:09:18 +0100 Subject: phy: exynos-mipi-video: Add support for Exynos 5420 and 5433 SoCs This patch adds support for MIPI DPHYs found in Exynos5420-compatible (5420, 5422 and 5800) and Exynos5433 SoCs. Those SoCs differ from earlier ones by a different offset of the MIPI DPHY registers in the PMU controller (Exynos5420-compatible case) or by moving the MIPI DPHY reset registers to separate system register controllers (Exynos5433 case). In both cases an additional fifth PHY (MIPI CSIS 2) has also been added. Acked-by: Sylwester Nawrocki Signed-off-by: Marek Szyprowski Signed-off-by: Kishon Vijay Abraham I --- .../devicetree/bindings/phy/samsung-phy.txt | 18 ++- drivers/phy/phy-exynos-mipi-video.c | 129 ++++++++++++++++++++- include/linux/mfd/syscon/exynos5-pmu.h | 3 + 3 files changed, 147 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/phy/samsung-phy.txt b/Documentation/devicetree/bindings/phy/samsung-phy.txt index 0289d3b07853..9872ba8546bd 100644 --- a/Documentation/devicetree/bindings/phy/samsung-phy.txt +++ b/Documentation/devicetree/bindings/phy/samsung-phy.txt @@ -2,9 +2,20 @@ Samsung S5P/EXYNOS SoC series MIPI CSIS/DSIM DPHY ------------------------------------------------- Required properties: -- compatible : should be "samsung,s5pv210-mipi-video-phy"; +- compatible : should be one of the listed compatibles: + - "samsung,s5pv210-mipi-video-phy" + - "samsung,exynos5420-mipi-video-phy" + - "samsung,exynos5433-mipi-video-phy" - #phy-cells : from the generic phy bindings, must be 1; -- syscon - phandle to the PMU system controller; + +In case of s5pv210 and exynos5420 compatible PHYs: +- syscon - phandle to the PMU system controller + +In case of exynos5433 compatible PHY: + - samsung,pmu-syscon - phandle to the PMU system controller + - samsung,disp-sysreg - phandle to the DISP system registers controller + - samsung,cam0-sysreg - phandle to the CAM0 system registers controller + - samsung,cam1-sysreg - phandle to the CAM1 system registers controller For "samsung,s5pv210-mipi-video-phy" compatible PHYs the second cell in the PHY specifier identifies the PHY and its meaning is as follows: @@ -12,6 +23,9 @@ the PHY specifier identifies the PHY and its meaning is as follows: 1 - MIPI DSIM 0, 2 - MIPI CSIS 1, 3 - MIPI DSIM 1. +"samsung,exynos5420-mipi-video-phy" and "samsung,exynos5433-mipi-video-phy" +support an additional fifth PHY: + 4 - MIPI CSIS 2.
Samsung EXYNOS SoC series Display Port PHY ------------------------------------------------- diff --git a/drivers/phy/phy-exynos-mipi-video.c b/drivers/phy/phy-exynos-mipi-video.c index 3cb69e005f18..cc093ebfda94 100644 --- a/drivers/phy/phy-exynos-mipi-video.c +++ b/drivers/phy/phy-exynos-mipi-video.c @@ -1,7 +1,7 @@ /* * Samsung S5P/EXYNOS SoC series MIPI CSIS/DSIM DPHY driver * - * Copyright (C) 2013 Samsung Electronics Co., Ltd. + * Copyright (C) 2013,2016 Samsung Electronics Co., Ltd. * Author: Sylwester Nawrocki * * This program is free software; you can redistribute it and/or modify @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -28,11 +29,15 @@ enum exynos_mipi_phy_id { EXYNOS_MIPI_PHY_ID_DSIM0, EXYNOS_MIPI_PHY_ID_CSIS1, EXYNOS_MIPI_PHY_ID_DSIM1, + EXYNOS_MIPI_PHY_ID_CSIS2, EXYNOS_MIPI_PHYS_NUM }; enum exynos_mipi_phy_regmap_id { EXYNOS_MIPI_REGMAP_PMU, + EXYNOS_MIPI_REGMAP_DISP, + EXYNOS_MIPI_REGMAP_CAM0, + EXYNOS_MIPI_REGMAP_CAM1, EXYNOS_MIPI_REGMAPS_NUM }; @@ -96,6 +101,122 @@ static const struct mipi_phy_device_desc s5pv210_mipi_phy = { }, }; +static const struct mipi_phy_device_desc exynos5420_mipi_phy = { + .num_regmaps = 1, + .regmap_names = {"syscon"}, + .num_phys = 5, + .phys = { + { + /* EXYNOS_MIPI_PHY_ID_CSIS0 */ + .coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM0, + .enable_val = EXYNOS5_PHY_ENABLE, + .enable_reg = EXYNOS5420_MIPI_PHY0_CONTROL, + .enable_map = EXYNOS_MIPI_REGMAP_PMU, + .resetn_val = EXYNOS5_MIPI_PHY_S_RESETN, + .resetn_reg = EXYNOS5420_MIPI_PHY0_CONTROL, + .resetn_map = EXYNOS_MIPI_REGMAP_PMU, + }, { + /* EXYNOS_MIPI_PHY_ID_DSIM0 */ + .coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS0, + .enable_val = EXYNOS5_PHY_ENABLE, + .enable_reg = EXYNOS5420_MIPI_PHY0_CONTROL, + .enable_map = EXYNOS_MIPI_REGMAP_PMU, + .resetn_val = EXYNOS5_MIPI_PHY_M_RESETN, + .resetn_reg = EXYNOS5420_MIPI_PHY0_CONTROL, + .resetn_map = EXYNOS_MIPI_REGMAP_PMU, + }, { + /* EXYNOS_MIPI_PHY_ID_CSIS1 */ + .coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM1, + .enable_val = EXYNOS5_PHY_ENABLE, + .enable_reg = EXYNOS5420_MIPI_PHY1_CONTROL, + .enable_map = EXYNOS_MIPI_REGMAP_PMU, + .resetn_val = EXYNOS5_MIPI_PHY_S_RESETN, + .resetn_reg = EXYNOS5420_MIPI_PHY1_CONTROL, + .resetn_map = EXYNOS_MIPI_REGMAP_PMU, + }, { + /* EXYNOS_MIPI_PHY_ID_DSIM1 */ + .coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS1, + .enable_val = EXYNOS5_PHY_ENABLE, + .enable_reg = EXYNOS5420_MIPI_PHY1_CONTROL, + .enable_map = EXYNOS_MIPI_REGMAP_PMU, + .resetn_val = EXYNOS5_MIPI_PHY_M_RESETN, + .resetn_reg = EXYNOS5420_MIPI_PHY1_CONTROL, + .resetn_map = EXYNOS_MIPI_REGMAP_PMU, + }, { + /* EXYNOS_MIPI_PHY_ID_CSIS2 */ + .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, + .enable_val = EXYNOS5_PHY_ENABLE, + .enable_reg = EXYNOS5420_MIPI_PHY2_CONTROL, + .enable_map = EXYNOS_MIPI_REGMAP_PMU, + .resetn_val = EXYNOS5_MIPI_PHY_S_RESETN, + .resetn_reg = EXYNOS5420_MIPI_PHY2_CONTROL, + .resetn_map = EXYNOS_MIPI_REGMAP_PMU, + }, + }, +}; + +#define EXYNOS5433_SYSREG_DISP_MIPI_PHY 0x100C +#define EXYNOS5433_SYSREG_CAM0_MIPI_DPHY_CON 0x1014 +#define EXYNOS5433_SYSREG_CAM1_MIPI_DPHY_CON 0x1020 + +static const struct mipi_phy_device_desc exynos5433_mipi_phy = { + .num_regmaps = 4, + .regmap_names = { + "samsung,pmu-syscon", + "samsung,disp-sysreg", + "samsung,cam0-sysreg", + "samsung,cam1-sysreg" + }, + .num_phys = 5, + .phys = { + { + /* EXYNOS_MIPI_PHY_ID_CSIS0 */ + .coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM0, + .enable_val = EXYNOS5_PHY_ENABLE, + .enable_reg = EXYNOS5433_MIPI_PHY0_CONTROL, + .enable_map = EXYNOS_MIPI_REGMAP_PMU, + 
.resetn_val = BIT(0), + .resetn_reg = EXYNOS5433_SYSREG_CAM0_MIPI_DPHY_CON, + .resetn_map = EXYNOS_MIPI_REGMAP_CAM0, + }, { + /* EXYNOS_MIPI_PHY_ID_DSIM0 */ + .coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS0, + .enable_val = EXYNOS5_PHY_ENABLE, + .enable_reg = EXYNOS5433_MIPI_PHY0_CONTROL, + .enable_map = EXYNOS_MIPI_REGMAP_PMU, + .resetn_val = BIT(0), + .resetn_reg = EXYNOS5433_SYSREG_DISP_MIPI_PHY, + .resetn_map = EXYNOS_MIPI_REGMAP_DISP, + }, { + /* EXYNOS_MIPI_PHY_ID_CSIS1 */ + .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, + .enable_val = EXYNOS5_PHY_ENABLE, + .enable_reg = EXYNOS5433_MIPI_PHY1_CONTROL, + .enable_map = EXYNOS_MIPI_REGMAP_PMU, + .resetn_val = BIT(1), + .resetn_reg = EXYNOS5433_SYSREG_CAM0_MIPI_DPHY_CON, + .resetn_map = EXYNOS_MIPI_REGMAP_CAM0, + }, { + /* EXYNOS_MIPI_PHY_ID_DSIM1 */ + .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, + .enable_val = EXYNOS5_PHY_ENABLE, + .enable_reg = EXYNOS5433_MIPI_PHY1_CONTROL, + .enable_map = EXYNOS_MIPI_REGMAP_PMU, + .resetn_val = BIT(1), + .resetn_reg = EXYNOS5433_SYSREG_DISP_MIPI_PHY, + .resetn_map = EXYNOS_MIPI_REGMAP_DISP, + }, { + /* EXYNOS_MIPI_PHY_ID_CSIS2 */ + .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, + .enable_val = EXYNOS5_PHY_ENABLE, + .enable_reg = EXYNOS5433_MIPI_PHY2_CONTROL, + .enable_map = EXYNOS_MIPI_REGMAP_PMU, + .resetn_val = BIT(0), + .resetn_reg = EXYNOS5433_SYSREG_CAM1_MIPI_DPHY_CON, + .resetn_map = EXYNOS_MIPI_REGMAP_CAM1, + }, + }, +}; struct exynos_mipi_video_phy { struct regmap *regmaps[EXYNOS_MIPI_REGMAPS_NUM]; @@ -241,6 +362,12 @@ static const struct of_device_id exynos_mipi_video_phy_of_match[] = { { .compatible = "samsung,s5pv210-mipi-video-phy", .data = &s5pv210_mipi_phy, + }, { + .compatible = "samsung,exynos5420-mipi-video-phy", + .data = &exynos5420_mipi_phy, + }, { + .compatible = "samsung,exynos5433-mipi-video-phy", + .data = &exynos5433_mipi_phy, }, { /* sentinel */ }, }; diff --git a/include/linux/mfd/syscon/exynos5-pmu.h b/include/linux/mfd/syscon/exynos5-pmu.h index 9352adc95de6..76f30f940c70 100644 --- a/include/linux/mfd/syscon/exynos5-pmu.h +++ b/include/linux/mfd/syscon/exynos5-pmu.h @@ -38,6 +38,9 @@ /* Exynos5433 specific register definitions */ #define EXYNOS5433_USBHOST30_PHY_CONTROL (0x728) +#define EXYNOS5433_MIPI_PHY0_CONTROL (0x710) +#define EXYNOS5433_MIPI_PHY1_CONTROL (0x714) +#define EXYNOS5433_MIPI_PHY2_CONTROL (0x718) #define EXYNOS5_PHY_ENABLE BIT(0) -- cgit v1.2.3 From 18900ca65a8553edc608b6c9d518eb31e6c09ba1 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Apr 2016 17:06:48 -0700 Subject: tty: Replace TTY_IO_ERROR bit tests with tty_io_error() Abstract TTY_IO_ERROR status test treewide with tty_io_error(). NB: tty->flags uses atomic bit ops; replace non-atomic bit test with test_bit(). 
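For reference, the new helper and a typical call-site conversion look like this (a sketch drawn from the hunks below; mgsl_ioctl() is one of the converted call sites):

	static inline bool tty_io_error(struct tty_struct *tty)
	{
		return test_bit(TTY_IO_ERROR, &tty->flags);
	}

	/* before: open-coded, non-atomic flag test */
	if (tty->flags & (1 << TTY_IO_ERROR))
		return -EIO;

	/* after: atomic and self-documenting */
	if (tty_io_error(tty))
		return -EIO;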
Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- arch/ia64/hp/sim/simserial.c | 2 +- drivers/char/pcmcia/synclink_cs.c | 2 +- drivers/isdn/i4l/isdn_tty.c | 6 +++--- drivers/s390/char/tty3270.c | 4 ++-- drivers/staging/dgnc/dgnc_tty.c | 2 +- drivers/tty/amiserial.c | 6 +++--- drivers/tty/mxser.c | 7 +++---- drivers/tty/pty.c | 2 +- drivers/tty/serial/crisv10.c | 5 ++--- drivers/tty/serial/serial_core.c | 8 ++++---- drivers/tty/synclink.c | 4 ++-- drivers/tty/synclink_gt.c | 4 ++-- drivers/tty/synclinkmp.c | 4 ++-- drivers/tty/tty_io.c | 5 ++--- drivers/tty/tty_port.c | 2 +- include/linux/tty.h | 5 +++++ net/irda/ircomm/ircomm_tty.c | 2 +- net/irda/ircomm/ircomm_tty_ioctl.c | 6 +++--- 18 files changed, 39 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c index e70cadec7ce6..21fd50def270 100644 --- a/arch/ia64/hp/sim/simserial.c +++ b/arch/ia64/hp/sim/simserial.c @@ -300,7 +300,7 @@ static int rs_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) && (cmd != TIOCMIWAIT)) { - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; } diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index 22c27652e46a..825db423b7a8 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -2246,7 +2246,7 @@ static int mgslpc_ioctl(struct tty_struct *tty, if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && (cmd != TIOCMIWAIT)) { - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; } diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c index 947d5c978b8f..f1edc0814120 100644 --- a/drivers/isdn/i4l/isdn_tty.c +++ b/drivers/isdn/i4l/isdn_tty.c @@ -1351,7 +1351,7 @@ isdn_tty_tiocmget(struct tty_struct *tty) if (isdn_tty_paranoia_check(info, tty->name, __func__)) return -ENODEV; - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; mutex_lock(&modem_info_mutex); @@ -1378,7 +1378,7 @@ isdn_tty_tiocmset(struct tty_struct *tty, if (isdn_tty_paranoia_check(info, tty->name, __func__)) return -ENODEV; - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; #ifdef ISDN_DEBUG_MODEM_IOCTL @@ -1419,7 +1419,7 @@ isdn_tty_ioctl(struct tty_struct *tty, uint cmd, ulong arg) if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_ioctl")) return -ENODEV; - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; switch (cmd) { case TCSBRK: /* SVID version: non-zero arg --> no break */ diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c index e96fc7fd9498..080a9872c68f 100644 --- a/drivers/s390/char/tty3270.c +++ b/drivers/s390/char/tty3270.c @@ -1804,7 +1804,7 @@ static int tty3270_ioctl(struct tty_struct *tty, unsigned int cmd, tp = tty->driver_data; if (!tp) return -ENODEV; - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; return kbd_ioctl(tp->kbd, cmd, arg); } @@ -1818,7 +1818,7 @@ static long tty3270_compat_ioctl(struct tty_struct *tty, tp = tty->driver_data; if (!tp) return -ENODEV; - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; return kbd_ioctl(tp->kbd, cmd, (unsigned long)compat_ptr(arg)); } diff --git a/drivers/staging/dgnc/dgnc_tty.c b/drivers/staging/dgnc/dgnc_tty.c index bcd2bdfb9c8f..5c221593a0c6 100644 --- a/drivers/staging/dgnc/dgnc_tty.c 
+++ b/drivers/staging/dgnc/dgnc_tty.c @@ -1255,7 +1255,7 @@ static int dgnc_block_til_ready(struct tty_struct *tty, if (file->f_flags & O_NONBLOCK) break; - if (tty->flags & (1 << TTY_IO_ERROR)) { + if (tty_io_error(tty)) { retval = -EIO; break; } diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index eacf4c9f3b29..183e98e84d09 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -1143,7 +1143,7 @@ static int rs_tiocmget(struct tty_struct *tty) if (serial_paranoia_check(info, tty->name, "rs_ioctl")) return -ENODEV; - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; control = info->MCR; @@ -1165,7 +1165,7 @@ static int rs_tiocmset(struct tty_struct *tty, unsigned int set, if (serial_paranoia_check(info, tty->name, "rs_ioctl")) return -ENODEV; - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; local_irq_save(flags); @@ -1250,7 +1250,7 @@ static int rs_ioctl(struct tty_struct *tty, if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) && (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) { - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; } diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c index 2f12bb9f4336..f23c2a101688 100644 --- a/drivers/tty/mxser.c +++ b/drivers/tty/mxser.c @@ -1334,7 +1334,7 @@ static int mxser_tiocmget(struct tty_struct *tty) if (tty->index == MXSER_PORTS) return -ENOIOCTLCMD; - if (test_bit(TTY_IO_ERROR, &tty->flags)) + if (tty_io_error(tty)) return -EIO; control = info->MCR; @@ -1361,7 +1361,7 @@ static int mxser_tiocmset(struct tty_struct *tty, if (tty->index == MXSER_PORTS) return -ENOIOCTLCMD; - if (test_bit(TTY_IO_ERROR, &tty->flags)) + if (tty_io_error(tty)) return -EIO; spin_lock_irqsave(&info->slock, flags); @@ -1715,8 +1715,7 @@ static int mxser_ioctl(struct tty_struct *tty, return 0; } - if (cmd != TIOCGSERIAL && cmd != TIOCMIWAIT && - test_bit(TTY_IO_ERROR, &tty->flags)) + if (cmd != TIOCGSERIAL && cmd != TIOCMIWAIT && tty_io_error(tty)) return -EIO; switch (cmd) { diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 0058d9fbf931..a8a292fd564f 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -44,7 +44,7 @@ static void pty_close(struct tty_struct *tty, struct file *filp) if (tty->driver->subtype == PTY_TYPE_MASTER) WARN_ON(tty->count > 1); else { - if (test_bit(TTY_IO_ERROR, &tty->flags)) + if (tty_io_error(tty)) return; if (tty->count > 2) return; diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c index c0172bf54a9b..546990334815 100644 --- a/drivers/tty/serial/crisv10.c +++ b/drivers/tty/serial/crisv10.c @@ -3445,7 +3445,7 @@ rs_ioctl(struct tty_struct *tty, if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGWILD) && (cmd != TIOCSERSWILD) && (cmd != TIOCSERGSTRUCT)) { - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; } @@ -3755,8 +3755,7 @@ block_til_ready(struct tty_struct *tty, struct file * filp, * If non-blocking mode is set, or the port is not enabled, * then make the check up front and then exit. 
*/ - if ((filp->f_flags & O_NONBLOCK) || - (tty->flags & (1 << TTY_IO_ERROR))) { + if ((filp->f_flags & O_NONBLOCK) || tty_io_error(tty)) { info->port.flags |= ASYNC_NORMAL_ACTIVE; return 0; } diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index a126a603b083..67b395031347 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -969,7 +969,7 @@ static int uart_tiocmget(struct tty_struct *tty) int result = -EIO; mutex_lock(&port->mutex); - if (!(tty->flags & (1 << TTY_IO_ERROR))) { + if (!tty_io_error(tty)) { result = uport->mctrl; spin_lock_irq(&uport->lock); result |= uport->ops->get_mctrl(uport); @@ -989,7 +989,7 @@ uart_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) int ret = -EIO; mutex_lock(&port->mutex); - if (!(tty->flags & (1 << TTY_IO_ERROR))) { + if (!tty_io_error(tty)) { uart_update_mctrl(uport, set, clear); ret = 0; } @@ -1238,7 +1238,7 @@ uart_ioctl(struct tty_struct *tty, unsigned int cmd, if (ret != -ENOIOCTLCMD) goto out; - if (tty->flags & (1 << TTY_IO_ERROR)) { + if (tty_io_error(tty)) { ret = -EIO; goto out; } @@ -1257,7 +1257,7 @@ uart_ioctl(struct tty_struct *tty, unsigned int cmd, mutex_lock(&port->mutex); - if (tty->flags & (1 << TTY_IO_ERROR)) { + if (tty_io_error(tty)) { ret = -EIO; goto out_up; } diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index f5476e270734..8b2277223ee7 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -2972,7 +2972,7 @@ static int mgsl_ioctl(struct tty_struct *tty, if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && (cmd != TIOCMIWAIT)) { - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; } @@ -3270,7 +3270,7 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp, printk("%s(%d):block_til_ready on %s\n", __FILE__,__LINE__, tty->driver->name ); - if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){ + if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) { /* nonblock mode is set or port is not enabled */ port->flags |= ASYNC_NORMAL_ACTIVE; return 0; diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index c0a2f5a1b1c2..1f7d6d9437e6 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -1032,7 +1032,7 @@ static int ioctl(struct tty_struct *tty, if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && (cmd != TIOCMIWAIT)) { - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; } @@ -3269,7 +3269,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, DBGINFO(("%s block_til_ready\n", tty->driver->name)); - if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){ + if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) { /* nonblock mode is set or port is not enabled */ port->flags |= ASYNC_NORMAL_ACTIVE; return 0; diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index 90da0c712262..e93879944905 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -1261,7 +1261,7 @@ static int ioctl(struct tty_struct *tty, if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && (cmd != TIOCMIWAIT)) { - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; } @@ -3285,7 +3285,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, printk("%s(%d):%s block_til_ready()\n", __FILE__,__LINE__, tty->driver->name ); - if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){ + if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) { 
/* nonblock mode is set or port is not enabled */ /* just verify that callout device is not active */ port->flags |= ASYNC_NORMAL_ACTIVE; diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 9b04d72e752e..320dc4da7162 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1070,7 +1070,7 @@ static ssize_t tty_read(struct file *file, char __user *buf, size_t count, if (tty_paranoia_check(tty, inode, "tty_read")) return -EIO; - if (!tty || (test_bit(TTY_IO_ERROR, &tty->flags))) + if (!tty || tty_io_error(tty)) return -EIO; /* We want to wait for the line discipline to sort out in this @@ -1245,8 +1245,7 @@ static ssize_t tty_write(struct file *file, const char __user *buf, if (tty_paranoia_check(tty, file_inode(file), "tty_write")) return -EIO; - if (!tty || !tty->ops->write || - (test_bit(TTY_IO_ERROR, &tty->flags))) + if (!tty || !tty->ops->write || tty_io_error(tty)) return -EIO; /* Short term debug to catch buggy drivers */ if (tty->ops->write_room == NULL) diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index dbcca30a54b1..9127c54b803e 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -364,7 +364,7 @@ int tty_port_block_til_ready(struct tty_port *port, /* if non-blocking mode is set we can pass directly to open unless the port has just hung up or is in another error state */ - if (tty->flags & (1 << TTY_IO_ERROR)) { + if (tty_io_error(tty)) { port->flags |= ASYNC_NORMAL_ACTIVE; return 0; } diff --git a/include/linux/tty.h b/include/linux/tty.h index 3b09f235db66..68d829bf93b8 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -360,6 +360,11 @@ static inline void tty_set_flow_change(struct tty_struct *tty, int val) smp_mb(); } +static inline bool tty_io_error(struct tty_struct *tty) +{ + return test_bit(TTY_IO_ERROR, &tty->flags); +} + #ifdef CONFIG_TTY extern void console_init(void); extern void tty_kref_put(struct tty_struct *tty); diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index da126ee6d218..840b82f760ba 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -280,7 +280,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, * If non-blocking mode is set, or the port is not enabled, * then make the check up front and then exit. */ - if (test_bit(TTY_IO_ERROR, &tty->flags)) { + if (tty_io_error(tty)) { port->flags |= ASYNC_NORMAL_ACTIVE; return 0; } diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c index d3687aaa23de..9beb011441fa 100644 --- a/net/irda/ircomm/ircomm_tty_ioctl.c +++ b/net/irda/ircomm/ircomm_tty_ioctl.c @@ -190,7 +190,7 @@ int ircomm_tty_tiocmget(struct tty_struct *tty) struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; unsigned int result; - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; result = ((self->settings.dte & IRCOMM_RTS) ? 
TIOCM_RTS : 0) @@ -213,7 +213,7 @@ int ircomm_tty_tiocmset(struct tty_struct *tty, { struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; IRDA_ASSERT(self != NULL, return -1;); @@ -362,7 +362,7 @@ int ircomm_tty_ioctl(struct tty_struct *tty, if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) && (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) { - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; } -- cgit v1.2.3 From 97ef38b8210d7459d4cb51668cdf3983772ac6b7 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Apr 2016 17:11:36 -0700 Subject: tty: Replace TTY_THROTTLED bit tests with tty_throttled() Abstract TTY_THROTTLED bit tests with tty_throttled(). Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/char/pcmcia/synclink_cs.c | 2 +- drivers/mmc/card/sdio_uart.c | 2 +- drivers/net/usb/hso.c | 2 +- drivers/staging/fwserial/fwserial.c | 2 +- drivers/staging/speakup/selection.c | 2 +- drivers/tty/amiserial.c | 2 +- drivers/tty/hvc/hvc_console.c | 2 +- drivers/tty/hvc/hvcs.c | 2 +- drivers/tty/hvc/hvsi.c | 2 +- drivers/tty/moxa.c | 2 +- drivers/tty/nozomi.c | 2 +- drivers/tty/serial/serial_core.c | 2 +- drivers/tty/synclink.c | 2 +- drivers/tty/synclink_gt.c | 2 +- drivers/tty/synclinkmp.c | 2 +- drivers/tty/tty_ioctl.c | 4 ++-- drivers/tty/vt/selection.c | 2 +- drivers/usb/gadget/function/u_serial.c | 4 ++-- drivers/usb/serial/digi_acceleport.c | 3 +-- include/linux/tty.h | 5 +++++ net/irda/ircomm/ircomm_tty_ioctl.c | 2 +- 21 files changed, 27 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index 825db423b7a8..bcae5bb15751 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -2316,7 +2316,7 @@ static void mgslpc_set_termios(struct tty_struct *tty, struct ktermios *old_term /* Handle transition away from B0 status */ if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) { info->serial_signals |= SerialSignal_DTR; - if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags)) + if (!C_CRTSCTS(tty) || !tty_throttled(tty)) info->serial_signals |= SerialSignal_RTS; spin_lock_irqsave(&info->lock, flags); set_signals(info); diff --git a/drivers/mmc/card/sdio_uart.c b/drivers/mmc/card/sdio_uart.c index 5415056f9aa5..5af6fb9a9ce2 100644 --- a/drivers/mmc/card/sdio_uart.c +++ b/drivers/mmc/card/sdio_uart.c @@ -895,7 +895,7 @@ static void sdio_uart_set_termios(struct tty_struct *tty, /* Handle transition away from B0 status */ if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) { unsigned int mask = TIOCM_DTR; - if (!(cflag & CRTSCTS) || !test_bit(TTY_THROTTLED, &tty->flags)) + if (!(cflag & CRTSCTS) || !tty_throttled(tty)) mask |= TIOCM_RTS; sdio_uart_set_mctrl(port, mask); } diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index 111d907e0c11..4b4458616693 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2029,7 +2029,7 @@ static int put_rxbuf_data(struct urb *urb, struct hso_serial *serial) tty = tty_port_tty_get(&serial->port); - if (tty && test_bit(TTY_THROTTLED, &tty->flags)) { + if (tty && tty_throttled(tty)) { tty_kref_put(tty); return -1; } diff --git a/drivers/staging/fwserial/fwserial.c b/drivers/staging/fwserial/fwserial.c index 9b23b5c95f5e..1f9389d8c152 100644 --- a/drivers/staging/fwserial/fwserial.c +++ 
b/drivers/staging/fwserial/fwserial.c @@ -1305,7 +1305,7 @@ static void fwtty_set_termios(struct tty_struct *tty, struct ktermios *old) if ((baud == 0) && (old->c_cflag & CBAUD)) { port->mctrl &= ~(TIOCM_DTR | TIOCM_RTS); } else if ((baud != 0) && !(old->c_cflag & CBAUD)) { - if (C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags)) + if (C_CRTSCTS(tty) || !tty_throttled(tty)) port->mctrl |= TIOCM_DTR | TIOCM_RTS; else port->mctrl |= TIOCM_DTR; diff --git a/drivers/staging/speakup/selection.c b/drivers/staging/speakup/selection.c index 41ef099b7aa6..0149edc1e0ae 100644 --- a/drivers/staging/speakup/selection.c +++ b/drivers/staging/speakup/selection.c @@ -150,7 +150,7 @@ static void __speakup_paste_selection(struct work_struct *work) add_wait_queue(&vc->paste_wait, &wait); while (sel_buffer && sel_buffer_lth > pasted) { set_current_state(TASK_INTERRUPTIBLE); - if (test_bit(TTY_THROTTLED, &tty->flags)) { + if (tty_throttled(tty)) { schedule(); continue; } diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index 183e98e84d09..e68208eac322 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -1342,7 +1342,7 @@ static void rs_set_termios(struct tty_struct *tty, struct ktermios *old_termios) /* Handle transition away from B0 status */ if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) { info->MCR |= SER_DTR; - if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags)) + if (!C_CRTSCTS(tty) || !tty_throttled(tty)) info->MCR |= SER_RTS; local_irq_save(flags); rtsdtr_ctrl(info->MCR); diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c index e46d628998f5..209dad8c96a0 100644 --- a/drivers/tty/hvc/hvc_console.c +++ b/drivers/tty/hvc/hvc_console.c @@ -632,7 +632,7 @@ int hvc_poll(struct hvc_struct *hp) goto bail; /* Now check if we can get data (are we throttled ?) 
*/ - if (test_bit(TTY_THROTTLED, &tty->flags)) + if (tty_throttled(tty)) goto throttled; /* If we aren't notifier driven and aren't throttled, we always diff --git a/drivers/tty/hvc/hvcs.c b/drivers/tty/hvc/hvcs.c index 5997b1731111..3c4d7c2b4ade 100644 --- a/drivers/tty/hvc/hvcs.c +++ b/drivers/tty/hvc/hvcs.c @@ -600,7 +600,7 @@ static int hvcs_io(struct hvcs_struct *hvcsd) hvcs_try_write(hvcsd); - if (!tty || test_bit(TTY_THROTTLED, &tty->flags)) { + if (!tty || tty_throttled(tty)) { hvcsd->todo_mask &= ~(HVCS_READ_MASK); goto bail; } else if (!(hvcsd->todo_mask & (HVCS_READ_MASK))) diff --git a/drivers/tty/hvc/hvsi.c b/drivers/tty/hvc/hvsi.c index a75146f600cb..96ce6bd1cc6f 100644 --- a/drivers/tty/hvc/hvsi.c +++ b/drivers/tty/hvc/hvsi.c @@ -509,7 +509,7 @@ static irqreturn_t hvsi_interrupt(int irq, void *arg) } spin_lock_irqsave(&hp->lock, flags); - if (tty && hp->n_throttle && !test_bit(TTY_THROTTLED, &tty->flags)) { + if (tty && hp->n_throttle && !tty_throttled(tty)) { /* we weren't hung up and we weren't throttled, so we can * deliver the rest now */ hvsi_send_overflow(hp); diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c index 92982d7c0489..ce521d3f58cb 100644 --- a/drivers/tty/moxa.c +++ b/drivers/tty/moxa.c @@ -1394,7 +1394,7 @@ static int moxa_poll_port(struct moxa_port *p, unsigned int handle, tty_wakeup(tty); } - if (inited && !test_bit(TTY_THROTTLED, &tty->flags) && + if (inited && !tty_throttled(tty) && MoxaPortRxQueue(p) > 0) { /* RX */ MoxaPortReadData(p); tty_schedule_flip(&p->port); diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c index 5cc80b80c82b..d6fd0e802ef5 100644 --- a/drivers/tty/nozomi.c +++ b/drivers/tty/nozomi.c @@ -826,7 +826,7 @@ static int receive_data(enum port_type index, struct nozomi *dc) size = __le32_to_cpu(readl(addr)); /* DBG1( "%d bytes port: %d", size, index); */ - if (tty && test_bit(TTY_THROTTLED, &tty->flags)) { + if (tty && tty_throttled(tty)) { DBG1("No room in tty, don't read data, don't ack interrupt, " "disable interrupt"); diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 67b395031347..64a5c00d7468 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -1350,7 +1350,7 @@ static void uart_set_termios(struct tty_struct *tty, /* Handle transition away from B0 status */ else if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) { unsigned int mask = TIOCM_DTR; - if (!(cflag & CRTSCTS) || !test_bit(TTY_THROTTLED, &tty->flags)) + if (!(cflag & CRTSCTS) || !tty_throttled(tty)) mask |= TIOCM_RTS; uart_set_mctrl(uport, mask); } diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index 8b2277223ee7..3768e5c71c0b 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -3049,7 +3049,7 @@ static void mgsl_set_termios(struct tty_struct *tty, struct ktermios *old_termio /* Handle transition away from B0 status */ if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) { info->serial_signals |= SerialSignal_DTR; - if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags)) + if (!C_CRTSCTS(tty) || !tty_throttled(tty)) info->serial_signals |= SerialSignal_RTS; spin_lock_irqsave(&info->irq_spinlock,flags); usc_set_serial_signals(info); diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 1f7d6d9437e6..ceeaeb703f51 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -784,7 +784,7 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios) /* Handle transition away from B0 status */ if 
(!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) { info->signals |= SerialSignal_DTR; - if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags)) + if (!C_CRTSCTS(tty) || !tty_throttled(tty)) info->signals |= SerialSignal_RTS; spin_lock_irqsave(&info->lock,flags); set_signals(info); diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index e93879944905..b0cce4b24e51 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -881,7 +881,7 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios) /* Handle transition away from B0 status */ if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) { info->serial_signals |= SerialSignal_DTR; - if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags)) + if (!C_CRTSCTS(tty) || !tty_throttled(tty)) info->serial_signals |= SerialSignal_RTS; spin_lock_irqsave(&info->lock,flags); set_signals(info); diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index 23bf5bb1d8bf..bf36ac9aee41 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -158,7 +158,7 @@ int tty_throttle_safe(struct tty_struct *tty) int ret = 0; mutex_lock(&tty->throttle_mutex); - if (!test_bit(TTY_THROTTLED, &tty->flags)) { + if (!tty_throttled(tty)) { if (tty->flow_change != TTY_THROTTLE_SAFE) ret = 1; else { @@ -189,7 +189,7 @@ int tty_unthrottle_safe(struct tty_struct *tty) int ret = 0; mutex_lock(&tty->throttle_mutex); - if (test_bit(TTY_THROTTLED, &tty->flags)) { + if (tty_throttled(tty)) { if (tty->flow_change != TTY_UNTHROTTLE_SAFE) ret = 1; else { diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c index 4dd9dd2270a0..368ce1803e8f 100644 --- a/drivers/tty/vt/selection.c +++ b/drivers/tty/vt/selection.c @@ -354,7 +354,7 @@ int paste_selection(struct tty_struct *tty) add_wait_queue(&vc->paste_wait, &wait); while (sel_buffer && sel_buffer_lth > pasted) { set_current_state(TASK_INTERRUPTIBLE); - if (test_bit(TTY_THROTTLED, &tty->flags)) { + if (tty_throttled(tty)) { schedule(); continue; } diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c index 6af145f2a99d..3580f198df8b 100644 --- a/drivers/usb/gadget/function/u_serial.c +++ b/drivers/usb/gadget/function/u_serial.c @@ -512,7 +512,7 @@ static void gs_rx_push(unsigned long _port) req = list_first_entry(queue, struct usb_request, list); /* leave data queued if tty was rx throttled */ - if (tty && test_bit(TTY_THROTTLED, &tty->flags)) + if (tty && tty_throttled(tty)) break; switch (req->status) { @@ -579,7 +579,7 @@ static void gs_rx_push(unsigned long _port) * from starving ... but it's not clear that case ever happens. 
*/ if (!list_empty(queue) && tty) { - if (!test_bit(TTY_THROTTLED, &tty->flags)) { + if (!tty_throttled(tty)) { if (do_push) tasklet_schedule(&port->push); else diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c index 16e8e37b3b36..6a1df9e824ca 100644 --- a/drivers/usb/serial/digi_acceleport.c +++ b/drivers/usb/serial/digi_acceleport.c @@ -699,8 +699,7 @@ static void digi_set_termios(struct tty_struct *tty, /* don't set RTS if using hardware flow control */ /* and throttling input */ modem_signals = TIOCM_DTR; - if (!C_CRTSCTS(tty) || - !test_bit(TTY_THROTTLED, &tty->flags)) + if (!C_CRTSCTS(tty) || !tty_throttled(tty)) modem_signals |= TIOCM_RTS; digi_set_modem_signals(port, modem_signals, 1); } diff --git a/include/linux/tty.h b/include/linux/tty.h index 68d829bf93b8..89f9c91b40f5 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -365,6 +365,11 @@ static inline bool tty_io_error(struct tty_struct *tty) return test_bit(TTY_IO_ERROR, &tty->flags); } +static inline bool tty_throttled(struct tty_struct *tty) +{ + return test_bit(TTY_THROTTLED, &tty->flags); +} + #ifdef CONFIG_TTY extern void console_init(void); extern void tty_kref_put(struct tty_struct *tty); diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c index 9beb011441fa..8d8fd28ff4d9 100644 --- a/net/irda/ircomm/ircomm_tty_ioctl.c +++ b/net/irda/ircomm/ircomm_tty_ioctl.c @@ -166,7 +166,7 @@ void ircomm_tty_set_termios(struct tty_struct *tty, /* Handle transition away from B0 status */ if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) { self->settings.dte |= IRCOMM_DTR; - if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags)) + if (!C_CRTSCTS(tty) || !tty_throttled(tty)) self->settings.dte |= IRCOMM_RTS; ircomm_param_request(self, IRCOMM_DTE, TRUE); } -- cgit v1.2.3 From e4d38f334ad24f80229a8ebab26950de8e8f34d7 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Apr 2016 17:53:20 -0700 Subject: tty: Define ASYNC_ replacement bits Prepare for relocating kernel-private state bits out of the tty_port::flags field, which is not atomic and can become corrupted by concurrent updates. It also suffers from the complication of sharing a userspace-visible field whose kernel bits must be masked. Define a new tty_port::iflags field and new, substitute bit definitions for the former ASYNC_* flags.
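To make the hazard concrete, a minimal sketch (names taken from this series): updates to the shared bitmask are read-modify-write sequences that can race, while the new field is only ever touched with atomic bitops:

	/* racy: concurrent RMW updates to ->flags can lose bits */
	port->flags |= ASYNC_NORMAL_ACTIVE;
	port->flags &= ~ASYNC_CTS_FLOW;

	/* safe: atomic bitops on the new ->iflags field */
	set_bit(TTY_PORT_ACTIVE, &port->iflags);
	clear_bit(TTY_PORT_CTS_FLOW, &port->iflags);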
Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 16 +++++++++++++++- include/uapi/linux/tty_flags.h | 9 ++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 89f9c91b40f5..4e0dbda05180 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -228,7 +228,8 @@ struct tty_port { int count; /* Usage count */ wait_queue_head_t open_wait; /* Open waiters */ wait_queue_head_t delta_msr_wait; /* Modem status change */ - unsigned long flags; /* TTY flags ASY_*/ + unsigned long flags; /* User TTY flags ASYNC_ */ + unsigned long iflags; /* Internal flags TTY_PORT_ */ unsigned char console:1, /* port is a console */ low_latency:1; /* optional: tune for latency */ struct mutex mutex; /* Locking */ @@ -242,6 +243,19 @@ struct tty_port { struct kref kref; /* Ref counter */ }; +/* tty_port::iflags bits -- use atomic bit ops */ +#define TTY_PORT_INITIALIZED 0 /* device is initialized */ +#define TTY_PORT_SUSPENDED 1 /* device is suspended */ +#define TTY_PORT_ACTIVE 2 /* device is open */ + +/* + * uart drivers: use the uart_port::status field and the UPSTAT_* defines + * for s/w-based flow control steering and carrier detection status + */ +#define TTY_PORT_CTS_FLOW 3 /* h/w flow control enabled */ +#define TTY_PORT_CHECK_CD 4 /* carrier detect enabled */ + + /* * Where all of the state associated with a tty is kept while the tty * is open. Since the termios state should be kept even if the tty diff --git a/include/uapi/linux/tty_flags.h b/include/uapi/linux/tty_flags.h index 072e41e45ee2..8e1a4365259f 100644 --- a/include/uapi/linux/tty_flags.h +++ b/include/uapi/linux/tty_flags.h @@ -32,7 +32,12 @@ #define ASYNCB_MAGIC_MULTIPLIER 16 /* Use special CLK or divisor */ #define ASYNCB_LAST_USER 16 -/* Internal flags used only by kernel */ +/* + * Internal flags used only by kernel (read-only) + * + * WARNING: These flags are no longer used and have been superseded by the + * TTY_PORT_ flags in the iflags field (and not userspace-visible) + */ #define ASYNCB_INITIALIZED 31 /* Serial port was initialized */ #define ASYNCB_SUSPENDED 30 /* Serial port is suspended */ #define ASYNCB_NORMAL_ACTIVE 29 /* Normal device is active */ @@ -44,6 +49,7 @@ #define ASYNCB_CONS_FLOW 23 /* flow control for console */ #define ASYNCB_FIRST_KERNEL 22 +/* Masks */ #define ASYNC_HUP_NOTIFY (1U << ASYNCB_HUP_NOTIFY) #define ASYNC_SUSPENDED (1U << ASYNCB_SUSPENDED) #define ASYNC_FOURPORT (1U << ASYNCB_FOURPORT) @@ -72,6 +78,7 @@ #define ASYNC_SPD_WARP (ASYNC_SPD_HI|ASYNC_SPD_SHI) #define ASYNC_SPD_MASK (ASYNC_SPD_HI|ASYNC_SPD_VHI|ASYNC_SPD_SHI) +/* These flags are no longer used (and were always masked from userspace) */ #define ASYNC_INITIALIZED (1U << ASYNCB_INITIALIZED) #define ASYNC_NORMAL_ACTIVE (1U << ASYNCB_NORMAL_ACTIVE) #define ASYNC_BOOT_AUTOCONF (1U << ASYNCB_BOOT_AUTOCONF) -- cgit v1.2.3 From 5604a98e2f95d6221852960a3363588f40d78e22 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Apr 2016 17:53:21 -0700 Subject: tty: Replace ASYNC_CTS_FLOW bit and update atomically Replace ASYNC_CTS_FLOW bit in the tty_port::flags field with TTY_PORT_CTS_FLOW bit in the tty_port::iflags field. Add tty_port_set_cts_flow() helper to abstract the atomic bit ops.
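The helper pair (shown in the include/linux/tty.h hunk below) reduces each driver call site to a single line, typically tty_port_set_cts_flow(&info->port, cflag & CRTSCTS) in a driver's change_params():

	static inline bool tty_port_cts_enabled(struct tty_port *port)
	{
		return test_bit(TTY_PORT_CTS_FLOW, &port->iflags);
	}

	static inline void tty_port_set_cts_flow(struct tty_port *port, bool val)
	{
		if (val)
			set_bit(TTY_PORT_CTS_FLOW, &port->iflags);
		else
			clear_bit(TTY_PORT_CTS_FLOW, &port->iflags);
	}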
Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/char/pcmcia/synclink_cs.c | 5 +---- drivers/tty/amiserial.c | 6 ++---- drivers/tty/cyclades.c | 10 ++++------ drivers/tty/isicom.c | 6 ++---- drivers/tty/mxser.c | 4 +--- drivers/tty/synclink.c | 7 ++----- drivers/tty/synclink_gt.c | 5 +---- drivers/tty/synclinkmp.c | 5 +---- include/linux/tty.h | 12 ++++++++++-- net/irda/ircomm/ircomm_tty_ioctl.c | 3 +-- 10 files changed, 25 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index bcae5bb15751..bdf41ac613dc 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -1466,10 +1466,7 @@ static void mgslpc_change_params(MGSLPC_INFO *info, struct tty_struct *tty) } info->timeout += HZ/50; /* Add .02 seconds of slop */ - if (cflag & CRTSCTS) - info->port.flags |= ASYNC_CTS_FLOW; - else - info->port.flags &= ~ASYNC_CTS_FLOW; + tty_port_set_cts_flow(&info->port, cflag & CRTSCTS); if (cflag & CLOCAL) info->port.flags &= ~ASYNC_CHECK_CD; diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index e68208eac322..92717b088959 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -727,11 +727,9 @@ static void change_speed(struct tty_struct *tty, struct serial_state *info, info->IER &= ~UART_IER_MSI; if (port->flags & ASYNC_HARDPPS_CD) info->IER |= UART_IER_MSI; - if (cflag & CRTSCTS) { - port->flags |= ASYNC_CTS_FLOW; + tty_port_set_cts_flow(port, cflag & CRTSCTS); + if (cflag & CRTSCTS) info->IER |= UART_IER_MSI; - } else - port->flags &= ~ASYNC_CTS_FLOW; if (cflag & CLOCAL) port->flags &= ~ASYNC_CHECK_CD; else { diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c index d67e542bab1c..1a12776ba24c 100644 --- a/drivers/tty/cyclades.c +++ b/drivers/tty/cyclades.c @@ -2083,13 +2083,11 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty) info->cor1 |= CyPARITY_NONE; /* CTS flow control flag */ - if (cflag & CRTSCTS) { - info->port.flags |= ASYNC_CTS_FLOW; + tty_port_set_cts_flow(&info->port, cflag & CRTSCTS); + if (cflag & CRTSCTS) info->cor2 |= CyCtsAE; - } else { - info->port.flags &= ~ASYNC_CTS_FLOW; + else info->cor2 &= ~CyCtsAE; - } if (cflag & CLOCAL) info->port.flags &= ~ASYNC_CHECK_CD; else @@ -2234,7 +2232,7 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty) } /* As the HW flow control is done in firmware, the driver doesn't need to care about it */ - info->port.flags &= ~ASYNC_CTS_FLOW; + tty_port_set_cts_flow(&info->port, 0); /* XON/XOFF/XANY flow control flags */ sw_flow = 0; diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c index 8bf67630018b..c5f06b54b9ca 100644 --- a/drivers/tty/isicom.c +++ b/drivers/tty/isicom.c @@ -765,11 +765,9 @@ static void isicom_config_port(struct tty_struct *tty) /* flow control settings ...*/ flow_ctrl = 0; - port->port.flags &= ~ASYNC_CTS_FLOW; - if (C_CRTSCTS(tty)) { - port->port.flags |= ASYNC_CTS_FLOW; + tty_port_set_cts_flow(&port->port, C_CRTSCTS(tty)); + if (C_CRTSCTS(tty)) flow_ctrl |= ISICOM_CTSRTS; - } if (I_IXON(tty)) flow_ctrl |= ISICOM_RESPOND_XONXOFF; if (I_IXOFF(tty)) diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c index f23c2a101688..8f3fdad37ac7 100644 --- a/drivers/tty/mxser.c +++ b/drivers/tty/mxser.c @@ -711,8 +711,8 @@ static int mxser_change_speed(struct tty_struct *tty, /* CTS flow control flag and modem status interrupts */ info->IER &= ~UART_IER_MSI; info->MCR &= ~UART_MCR_AFE; + 
tty_port_set_cts_flow(&info->port, cflag & CRTSCTS); if (cflag & CRTSCTS) { - info->port.flags |= ASYNC_CTS_FLOW; info->IER |= UART_IER_MSI; if ((info->type == PORT_16550A) || (info->board->chip_flag)) { info->MCR |= UART_MCR_AFE; @@ -744,8 +744,6 @@ static int mxser_change_speed(struct tty_struct *tty, } } } - } else { - info->port.flags &= ~ASYNC_CTS_FLOW; } outb(info->MCR, info->ioaddr + UART_MCR); if (cflag & CLOCAL) { diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index 3768e5c71c0b..0e4290183280 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -1966,11 +1966,8 @@ static void mgsl_change_params(struct mgsl_struct *info) } info->timeout += HZ/50; /* Add .02 seconds of slop */ - if (cflag & CRTSCTS) - info->port.flags |= ASYNC_CTS_FLOW; - else - info->port.flags &= ~ASYNC_CTS_FLOW; - + tty_port_set_cts_flow(&info->port, cflag & CRTSCTS); + if (cflag & CLOCAL) info->port.flags &= ~ASYNC_CHECK_CD; else diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index ceeaeb703f51..5da69d30f816 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -2576,10 +2576,7 @@ static void change_params(struct slgt_info *info) } info->timeout += HZ/50; /* Add .02 seconds of slop */ - if (cflag & CRTSCTS) - info->port.flags |= ASYNC_CTS_FLOW; - else - info->port.flags &= ~ASYNC_CTS_FLOW; + tty_port_set_cts_flow(&info->port, cflag & CRTSCTS); if (cflag & CLOCAL) info->port.flags &= ~ASYNC_CHECK_CD; diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index b0cce4b24e51..7a21491d0c0d 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -2813,10 +2813,7 @@ static void change_params(SLMP_INFO *info) } info->timeout += HZ/50; /* Add .02 seconds of slop */ - if (cflag & CRTSCTS) - info->port.flags |= ASYNC_CTS_FLOW; - else - info->port.flags &= ~ASYNC_CTS_FLOW; + tty_port_set_cts_flow(&info->port, cflag & CRTSCTS); if (cflag & CLOCAL) info->port.flags &= ~ASYNC_CHECK_CD; diff --git a/include/linux/tty.h b/include/linux/tty.h index 4e0dbda05180..989d755b0ae4 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -255,7 +255,6 @@ struct tty_port { #define TTY_PORT_CTS_FLOW 3 /* h/w flow control enabled */ #define TTY_PORT_CHECK_CD 4 /* carrier detect enabled */ - /* * Where all of the state associated with a tty is kept while the tty * is open. Since the termios state should be kept even if the tty @@ -561,9 +560,18 @@ static inline struct tty_port *tty_port_get(struct tty_port *port) /* If the cts flow control is enabled, return true. 
*/ static inline bool tty_port_cts_enabled(struct tty_port *port) { - return port->flags & ASYNC_CTS_FLOW; + return test_bit(TTY_PORT_CTS_FLOW, &port->iflags); } +static inline void tty_port_set_cts_flow(struct tty_port *port, bool val) +{ + if (val) + set_bit(TTY_PORT_CTS_FLOW, &port->iflags); + else + clear_bit(TTY_PORT_CTS_FLOW, &port->iflags); +} + + extern struct tty_struct *tty_port_tty_get(struct tty_port *port); extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); extern int tty_port_carrier_raised(struct tty_port *port); diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c index 8d8fd28ff4d9..1220973c7c43 100644 --- a/net/irda/ircomm/ircomm_tty_ioctl.c +++ b/net/irda/ircomm/ircomm_tty_ioctl.c @@ -86,15 +86,14 @@ static void ircomm_tty_change_speed(struct ircomm_tty_cb *self, ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE); /* CTS flow control flag and modem status interrupts */ + tty_port_set_cts_flow(&self->port, cflag & CRTSCTS); if (cflag & CRTSCTS) { - self->port.flags |= ASYNC_CTS_FLOW; self->settings.flow_control |= IRCOMM_RTS_CTS_IN; /* This got me. Bummer. Jean II */ if (self->service_type == IRCOMM_3_WIRE_RAW) net_warn_ratelimited("%s(), enabling RTS/CTS on link that doesn't support it (3-wire-raw)\n", __func__); } else { - self->port.flags &= ~ASYNC_CTS_FLOW; self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN; } if (cflag & CLOCAL) -- cgit v1.2.3 From 807c8d81f4ec441241cafa3034c58df721fee869 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Apr 2016 17:53:22 -0700 Subject: tty: Replace ASYNC_NORMAL_ACTIVE bit and update atomically Replace ASYNC_NORMAL_ACTIVE bit in the tty_port::flags field with TTY_PORT_ACTIVE bit in the tty_port::iflags field. Introduce helpers tty_port_set_active() and tty_port_active() to abstract atomic bit ops. Extract state changes from port lock sections, as this usage is broken and confused; the state transitions are protected by the tty lock (which mutually excludes parallel open/close/hangup), and no user tests the active state while holding the port lock. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/isdn/i4l/isdn_tty.c | 20 +++++++++----------- drivers/tty/amiserial.c | 2 +- drivers/tty/rocket.c | 5 +++-- drivers/tty/serial/crisv10.c | 8 ++++---- drivers/tty/serial/serial_core.c | 8 ++++---- drivers/tty/synclink.c | 6 +++--- drivers/tty/synclink_gt.c | 6 +++--- drivers/tty/synclinkmp.c | 6 +++--- drivers/tty/tty_port.c | 12 ++++++------ include/linux/tty.h | 12 ++++++++++++ net/irda/ircomm/ircomm_tty.c | 10 +++++----- 11 files changed, 53 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c index f1edc0814120..d8468f3529a7 100644 --- a/drivers/isdn/i4l/isdn_tty.c +++ b/drivers/isdn/i4l/isdn_tty.c @@ -1622,7 +1622,7 @@ isdn_tty_hangup(struct tty_struct *tty) return; isdn_tty_shutdown(info); port->count = 0; - port->flags &= ~ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 0); port->tty = NULL; wake_up_interruptible(&port->open_wait); } @@ -1979,7 +1979,7 @@ isdn_tty_find_icall(int di, int ch, setup_parm *setup) #endif if ( #ifndef FIX_FILE_TRANSFER - (info->port.flags & ASYNC_NORMAL_ACTIVE) && + tty_port_active(&info->port) && #endif (info->isdn_driver == -1) && (info->isdn_channel == -1) && @@ -2018,8 +2018,6 @@ isdn_tty_find_icall(int di, int ch, setup_parm *setup) return (wret == 2) ? 
3 : 0; } -#define TTY_IS_ACTIVE(info) (info->port.flags & ASYNC_NORMAL_ACTIVE) - int isdn_tty_stat_callback(int i, isdn_ctrl *c) { @@ -2077,7 +2075,7 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c) #ifdef ISDN_TTY_STAT_DEBUG printk(KERN_DEBUG "tty_STAT_DCONN ttyI%d\n", info->line); #endif - if (TTY_IS_ACTIVE(info)) { + if (tty_port_active(&info->port)) { if (info->dialing == 1) { info->dialing = 2; return 1; @@ -2088,7 +2086,7 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c) #ifdef ISDN_TTY_STAT_DEBUG printk(KERN_DEBUG "tty_STAT_DHUP ttyI%d\n", info->line); #endif - if (TTY_IS_ACTIVE(info)) { + if (tty_port_active(&info->port)) { if (info->dialing == 1) isdn_tty_modem_result(RESULT_BUSY, info); if (info->dialing > 1) @@ -2118,7 +2116,7 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c) * waiting for it and * set DCD-bit of its modem-status. */ - if (TTY_IS_ACTIVE(info) || + if (tty_port_active(&info->port) || (info->port.blocked_open && (info->emu.mdmreg[REG_DCD] & BIT_DCD))) { info->msr |= UART_MSR_DCD; @@ -2145,7 +2143,7 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c) #ifdef ISDN_TTY_STAT_DEBUG printk(KERN_DEBUG "tty_STAT_BHUP ttyI%d\n", info->line); #endif - if (TTY_IS_ACTIVE(info)) { + if (tty_port_active(&info->port)) { #ifdef ISDN_DEBUG_MODEM_HUP printk(KERN_DEBUG "Mhup in ISDN_STAT_BHUP\n"); #endif @@ -2157,7 +2155,7 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c) #ifdef ISDN_TTY_STAT_DEBUG printk(KERN_DEBUG "tty_STAT_NODCH ttyI%d\n", info->line); #endif - if (TTY_IS_ACTIVE(info)) { + if (tty_port_active(&info->port)) { if (info->dialing) { info->dialing = 0; info->last_l2 = -1; @@ -2183,14 +2181,14 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c) return 1; #ifdef CONFIG_ISDN_TTY_FAX case ISDN_STAT_FAXIND: - if (TTY_IS_ACTIVE(info)) { + if (tty_port_active(&info->port)) { isdn_tty_fax_command(info, c); } break; #endif #ifdef CONFIG_ISDN_AUDIO case ISDN_STAT_AUDIO: - if (TTY_IS_ACTIVE(info)) { + if (tty_port_active(&info->port)) { switch (c->parm.num[0]) { case ISDN_AUDIO_DTMF: if (info->vonline) { diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index 92717b088959..80d61658efb0 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -1493,7 +1493,7 @@ static void rs_hangup(struct tty_struct *tty) rs_flush_buffer(tty); shutdown(tty, info); info->tport.count = 0; - info->tport.flags &= ~ASYNC_NORMAL_ACTIVE; + tty_port_set_active(&info->tport, 0); info->tport.tty = NULL; wake_up_interruptible(&info->tport.open_wait); } diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c index 0b802cdd70d0..eb8311b20782 100644 --- a/drivers/tty/rocket.c +++ b/drivers/tty/rocket.c @@ -1042,9 +1042,10 @@ static void rp_close(struct tty_struct *tty, struct file *filp) } } spin_lock_irq(&port->lock); - info->port.flags &= ~(ASYNC_INITIALIZED | ASYNC_NORMAL_ACTIVE); + port->flags &= ~ASYNC_INITIALIZED; tty->closing = 0; spin_unlock_irq(&port->lock); + tty_port_set_active(port, 0); mutex_unlock(&port->mutex); tty_port_tty_set(port, NULL); @@ -1624,7 +1625,7 @@ static int rp_write(struct tty_struct *tty, /* Write remaining data into the port's xmit_buf */ while (1) { /* Hung up ? 
*/ - if (!test_bit(ASYNCB_NORMAL_ACTIVE, &info->port.flags)) + if (!tty_port_active(&info->port)) goto end; c = min(count, XMIT_BUF_SIZE - info->xmit_cnt - 1); c = min(c, XMIT_BUF_SIZE - info->xmit_head); diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c index 546990334815..92c8c628e00e 100644 --- a/drivers/tty/serial/crisv10.c +++ b/drivers/tty/serial/crisv10.c @@ -3648,8 +3648,8 @@ rs_close(struct tty_struct *tty, struct file * filp) schedule_timeout_interruptible(info->port.close_delay); wake_up_interruptible(&info->port.open_wait); } - info->port.flags &= ~ASYNC_NORMAL_ACTIVE; local_irq_restore(flags); + tty_port_set_active(&info->port, 0); /* port closed */ @@ -3732,7 +3732,7 @@ rs_hangup(struct tty_struct *tty) shutdown(info); info->event = 0; info->port.count = 0; - info->port.flags &= ~ASYNC_NORMAL_ACTIVE; + tty_port_set_active(&info->port, 0); info->port.tty = NULL; wake_up_interruptible(&info->port.open_wait); } @@ -3756,7 +3756,7 @@ block_til_ready(struct tty_struct *tty, struct file * filp, * then make the check up front and then exit. */ if ((filp->f_flags & O_NONBLOCK) || tty_io_error(tty)) { - info->port.flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(&info->port, 1); return 0; } @@ -3825,7 +3825,7 @@ block_til_ready(struct tty_struct *tty, struct file * filp, #endif if (retval) return retval; - info->port.flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(&info->port, 1); return 0; } diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 64a5c00d7468..2471380fb92e 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -1418,12 +1418,12 @@ static void uart_close(struct tty_struct *tty, struct file *filp) uart_change_pm(state, UART_PM_STATE_OFF); spin_lock_irq(&port->lock); } + spin_unlock_irq(&port->lock); + tty_port_set_active(port, 0); /* * Wake up anyone trying to open this port. 
*/ - clear_bit(ASYNCB_NORMAL_ACTIVE, &port->flags); - spin_unlock_irq(&port->lock); wake_up_interruptible(&port->open_wait); mutex_unlock(&port->mutex); @@ -1501,13 +1501,13 @@ static void uart_hangup(struct tty_struct *tty) pr_debug("uart_hangup(%d)\n", tty->index); mutex_lock(&port->mutex); - if (port->flags & ASYNC_NORMAL_ACTIVE) { + if (tty_port_active(port)) { uart_flush_buffer(tty); uart_shutdown(tty, state); spin_lock_irqsave(&port->lock, flags); port->count = 0; - clear_bit(ASYNCB_NORMAL_ACTIVE, &port->flags); spin_unlock_irqrestore(&port->lock, flags); + tty_port_set_active(port, 0); tty_port_tty_set(port, NULL); if (!uart_console(state->uart_port)) uart_change_pm(state, UART_PM_STATE_OFF); diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index 0e4290183280..b55f8468cde5 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -3201,7 +3201,7 @@ static void mgsl_hangup(struct tty_struct *tty) shutdown(info); info->port.count = 0; - info->port.flags &= ~ASYNC_NORMAL_ACTIVE; + tty_port_set_active(&info->port, 0); info->port.tty = NULL; wake_up_interruptible(&info->port.open_wait); @@ -3269,7 +3269,7 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp, if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) { /* nonblock mode is set or port is not enabled */ - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); return 0; } @@ -3338,7 +3338,7 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp, __FILE__,__LINE__, tty->driver->name, port->count ); if (!retval) - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); return retval; diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 5da69d30f816..c76f546697dc 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -756,9 +756,9 @@ static void hangup(struct tty_struct *tty) spin_lock_irqsave(&info->port.lock, flags); info->port.count = 0; - info->port.flags &= ~ASYNC_NORMAL_ACTIVE; info->port.tty = NULL; spin_unlock_irqrestore(&info->port.lock, flags); + tty_port_set_active(&info->port, 0); mutex_unlock(&info->port.mutex); wake_up_interruptible(&info->port.open_wait); @@ -3268,7 +3268,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) { /* nonblock mode is set or port is not enabled */ - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); return 0; } @@ -3325,7 +3325,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, port->blocked_open--; if (!retval) - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); DBGINFO(("%s block_til_ready ready, rc=%d\n", tty->driver->name, retval)); return retval; diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index 7a21491d0c0d..95eddc4d9eb8 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -849,9 +849,9 @@ static void hangup(struct tty_struct *tty) spin_lock_irqsave(&info->port.lock, flags); info->port.count = 0; - info->port.flags &= ~ASYNC_NORMAL_ACTIVE; info->port.tty = NULL; spin_unlock_irqrestore(&info->port.lock, flags); + tty_port_set_active(&info->port, 0); mutex_unlock(&info->port.mutex); wake_up_interruptible(&info->port.open_wait); @@ -3285,7 +3285,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) { /* nonblock mode is set or port is not enabled */ /* just verify that callout device is not active */ - port->flags |=
ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); return 0; } @@ -3352,7 +3352,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, __FILE__,__LINE__, tty->driver->name, port->count ); if (!retval) - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); return retval; } diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index 9127c54b803e..130c8cf520cb 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -236,12 +236,12 @@ void tty_port_hangup(struct tty_port *port) spin_lock_irqsave(&port->lock, flags); port->count = 0; - port->flags &= ~ASYNC_NORMAL_ACTIVE; tty = port->tty; if (tty) set_bit(TTY_IO_ERROR, &tty->flags); port->tty = NULL; spin_unlock_irqrestore(&port->lock, flags); + tty_port_set_active(port, 0); tty_port_shutdown(port, tty); tty_kref_put(tty); wake_up_interruptible(&port->open_wait); @@ -365,14 +365,14 @@ int tty_port_block_til_ready(struct tty_port *port, /* if non-blocking mode is set we can pass directly to open unless the port has just hung up or is in another error state */ if (tty_io_error(tty)) { - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); return 0; } if (filp->f_flags & O_NONBLOCK) { /* Indicate we are open */ if (C_BAUD(tty)) tty_port_raise_dtr_rts(port); - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); return 0; } @@ -430,9 +430,9 @@ int tty_port_block_til_ready(struct tty_port *port, if (!tty_hung_up_p(filp)) port->count++; port->blocked_open--; - if (retval == 0) - port->flags |= ASYNC_NORMAL_ACTIVE; spin_unlock_irqrestore(&port->lock, flags); + if (retval == 0) + tty_port_set_active(port, 1); return retval; } EXPORT_SYMBOL(tty_port_block_til_ready); @@ -514,8 +514,8 @@ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty) spin_lock_irqsave(&port->lock, flags); wake_up_interruptible(&port->open_wait); } - port->flags &= ~ASYNC_NORMAL_ACTIVE; spin_unlock_irqrestore(&port->lock, flags); + tty_port_set_active(port, 0); } EXPORT_SYMBOL(tty_port_close_end); diff --git a/include/linux/tty.h b/include/linux/tty.h index 989d755b0ae4..dbeeb8666ae4 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -571,6 +571,18 @@ static inline void tty_port_set_cts_flow(struct tty_port *port, bool val) clear_bit(TTY_PORT_CTS_FLOW, &port->iflags); } +static inline bool tty_port_active(struct tty_port *port) +{ + return test_bit(TTY_PORT_ACTIVE, &port->iflags); +} + +static inline void tty_port_set_active(struct tty_port *port, bool val) +{ + if (val) + set_bit(TTY_PORT_ACTIVE, &port->iflags); + else + clear_bit(TTY_PORT_ACTIVE, &port->iflags); +} extern struct tty_struct *tty_port_tty_get(struct tty_port *port); extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 840b82f760ba..681fe0bfe558 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -281,7 +281,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, * then make the check up front and then exit. 
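
The helpers just added to include/linux/tty.h reduce every driver conversion in this series to the same three touch points. As an orientation before the remaining hunks, a condensed sketch of where the calls land (illustrative only, not a literal hunk from this patch; "port" stands for any struct tty_port, and handle_event() is a hypothetical placeholder):

	/* open: the block_til_ready() success paths mark the port active */
	tty_port_set_active(port, 1);

	/* hangup/close: cleared after the count is dropped; set_bit/clear_bit
	 * on iflags are atomic, so this no longer needs the port spinlock */
	tty_port_set_active(port, 0);

	/* status and event handlers gate their work on the active state */
	if (tty_port_active(port))
		handle_event();
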
*/ if (tty_io_error(tty)) { - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); return 0; } @@ -289,7 +289,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, /* nonblock mode is set */ if (C_BAUD(tty)) tty_port_raise_dtr_rts(port); - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); pr_debug("%s(), O_NONBLOCK requested!\n", __func__); return 0; } @@ -365,7 +365,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, __FILE__, __LINE__, tty->driver->name, port->count); if (!retval) - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); return retval; } @@ -925,7 +925,6 @@ static void ircomm_tty_hangup(struct tty_struct *tty) ircomm_tty_shutdown(self); spin_lock_irqsave(&port->lock, flags); - port->flags &= ~ASYNC_NORMAL_ACTIVE; if (port->tty) { set_bit(TTY_IO_ERROR, &port->tty->flags); tty_kref_put(port->tty); @@ -933,6 +932,7 @@ static void ircomm_tty_hangup(struct tty_struct *tty) port->tty = NULL; port->count = 0; spin_unlock_irqrestore(&port->lock, flags); + tty_port_set_active(port, 0); wake_up_interruptible(&port->open_wait); } @@ -1267,7 +1267,7 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m) seq_printf(m, "%cASYNC_LOW_LATENCY", sep); sep = '|'; } - if (self->port.flags & ASYNC_NORMAL_ACTIVE) { + if (tty_port_active(&self->port)) { seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep); sep = '|'; } -- cgit v1.2.3 From 2d68655d15bc99981394f7caa769a14b03cac131 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Apr 2016 17:53:23 -0700 Subject: tty: Replace ASYNC_CHECK_CD and update atomically Replace ASYNC_CHECK_CD bit in the tty_port::flags field with TTY_PORT_CHECK_CD bit in the tty_port::iflags field. Introduce helpers tty_port_set_check_carrier() and tty_port_check_carrier() to abstract the atomic bit ops. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/char/pcmcia/synclink_cs.c | 8 ++------ drivers/isdn/i4l/isdn_tty.c | 8 ++------ drivers/tty/amiserial.c | 9 +++------ drivers/tty/cyclades.c | 14 ++++---------- drivers/tty/isicom.c | 7 ++----- drivers/tty/mxser.c | 9 +++------ drivers/tty/synclink.c | 8 ++------ drivers/tty/synclink_gt.c | 8 ++------ drivers/tty/synclinkmp.c | 8 ++------ include/linux/tty.h | 13 +++++++++++++ net/irda/ircomm/ircomm_tty.c | 4 ++-- net/irda/ircomm/ircomm_tty_attach.c | 2 +- net/irda/ircomm/ircomm_tty_ioctl.c | 5 +---- 13 files changed, 39 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index bdf41ac613dc..bf54f4e23b6f 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -1101,7 +1101,7 @@ static void dcd_change(MGSLPC_INFO *info, struct tty_struct *tty) wake_up_interruptible(&info->status_event_wait_q); wake_up_interruptible(&info->event_wait_q); - if (info->port.flags & ASYNC_CHECK_CD) { + if (tty_port_check_carrier(&info->port)) { if (debug_level >= DEBUG_LEVEL_ISR) printk("%s CD now %s...", info->device_name, (info->serial_signals & SerialSignal_DCD) ? 
"on" : "off"); @@ -1467,11 +1467,7 @@ static void mgslpc_change_params(MGSLPC_INFO *info, struct tty_struct *tty) info->timeout += HZ/50; /* Add .02 seconds of slop */ tty_port_set_cts_flow(&info->port, cflag & CRTSCTS); - - if (cflag & CLOCAL) - info->port.flags &= ~ASYNC_CHECK_CD; - else - info->port.flags |= ASYNC_CHECK_CD; + tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL); /* process tty input control flags */ diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c index d8468f3529a7..023a350a8cd8 100644 --- a/drivers/isdn/i4l/isdn_tty.c +++ b/drivers/isdn/i4l/isdn_tty.c @@ -1043,11 +1043,7 @@ isdn_tty_change_speed(modem_info *info) if (!(cflag & PARODD)) cval |= UART_LCR_EPAR; - if (cflag & CLOCAL) - port->flags &= ~ASYNC_CHECK_CD; - else { - port->flags |= ASYNC_CHECK_CD; - } + tty_port_set_check_carrier(port, ~cflag & CLOCAL); } static int @@ -2526,7 +2522,7 @@ isdn_tty_modem_result(int code, modem_info *info) if (info->closing || (!info->port.tty)) return; - if (info->port.flags & ASYNC_CHECK_CD) + if (tty_port_check_carrier(&info->port)) tty_hangup(info->port.tty); } } diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index 80d61658efb0..b4ab97d56351 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -398,7 +398,7 @@ static void check_modem_status(struct serial_state *info) wake_up_interruptible(&port->delta_msr_wait); } - if ((port->flags & ASYNC_CHECK_CD) && (dstatus & SER_DCD)) { + if (tty_port_check_carrier(port) && (dstatus & SER_DCD)) { #if (defined(SERIAL_DEBUG_OPEN) || defined(SERIAL_DEBUG_INTR)) printk("ttyS%d CD now %s...", info->line, (!(status & SER_DCD)) ? "on" : "off"); @@ -730,12 +730,9 @@ static void change_speed(struct tty_struct *tty, struct serial_state *info, tty_port_set_cts_flow(port, cflag & CRTSCTS); if (cflag & CRTSCTS) info->IER |= UART_IER_MSI; - if (cflag & CLOCAL) - port->flags &= ~ASYNC_CHECK_CD; - else { - port->flags |= ASYNC_CHECK_CD; + tty_port_set_check_carrier(port, ~cflag & CLOCAL); + if (~cflag & CLOCAL) info->IER |= UART_IER_MSI; - } /* TBD: * Does clearing IER_MSI imply that we should disable the VBL interrupt ? */ diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c index 1a12776ba24c..9d1e19ba25cb 100644 --- a/drivers/tty/cyclades.c +++ b/drivers/tty/cyclades.c @@ -714,7 +714,7 @@ static void cyy_chip_modem(struct cyclades_card *cinfo, int chip, wake_up_interruptible(&info->port.delta_msr_wait); } - if ((mdm_change & CyDCD) && (info->port.flags & ASYNC_CHECK_CD)) { + if ((mdm_change & CyDCD) && tty_port_check_carrier(&info->port)) { if (mdm_status & CyDCD) wake_up_interruptible(&info->port.open_wait); else @@ -1119,7 +1119,7 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo) case C_CM_MDCD: info->icount.dcd++; delta_count++; - if (info->port.flags & ASYNC_CHECK_CD) { + if (tty_port_check_carrier(&info->port)) { u32 dcd = fw_ver > 241 ? 
param : readl(&info->u.cyz.ch_ctrl->rs_status); if (dcd & C_RS_DCD) @@ -2088,10 +2088,7 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty) info->cor2 |= CyCtsAE; else info->cor2 &= ~CyCtsAE; - if (cflag & CLOCAL) - info->port.flags &= ~ASYNC_CHECK_CD; - else - info->port.flags |= ASYNC_CHECK_CD; + tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL); /*********************************************** The hardware option, CyRtsAO, presents RTS when @@ -2250,10 +2247,7 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty) } /* CD sensitivity */ - if (cflag & CLOCAL) - info->port.flags &= ~ASYNC_CHECK_CD; - else - info->port.flags |= ASYNC_CHECK_CD; + tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL); if (baud == 0) { /* baud rate is zero, turn off line */ cy_writel(&ch_ctrl->rs_control, diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c index c5f06b54b9ca..0b2bae1b2d55 100644 --- a/drivers/tty/isicom.c +++ b/drivers/tty/isicom.c @@ -577,7 +577,7 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id) header = inw(base); switch (header & 0xff) { case 0: /* Change in EIA signals */ - if (port->port.flags & ASYNC_CHECK_CD) { + if (tty_port_check_carrier(&port->port)) { if (port->status & ISI_DCD) { if (!(header & ISI_DCD)) { /* Carrier has been lost */ @@ -758,10 +758,7 @@ static void isicom_config_port(struct tty_struct *tty) outw(channel_setup, base); InterruptTheCard(base); } - if (C_CLOCAL(tty)) - port->port.flags &= ~ASYNC_CHECK_CD; - else - port->port.flags |= ASYNC_CHECK_CD; + tty_port_set_check_carrier(&port->port, !C_CLOCAL(tty)); /* flow control settings ...*/ flow_ctrl = 0; diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c index 8f3fdad37ac7..ab618ef3d171 100644 --- a/drivers/tty/mxser.c +++ b/drivers/tty/mxser.c @@ -746,12 +746,9 @@ static int mxser_change_speed(struct tty_struct *tty, } } outb(info->MCR, info->ioaddr + UART_MCR); - if (cflag & CLOCAL) { - info->port.flags &= ~ASYNC_CHECK_CD; - } else { - info->port.flags |= ASYNC_CHECK_CD; + tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL); + if (~cflag & CLOCAL) info->IER |= UART_IER_MSI; - } outb(info->IER, info->ioaddr + UART_IER); /* @@ -824,7 +821,7 @@ static void mxser_check_modem_status(struct tty_struct *tty, port->mon_data.modem_status = status; wake_up_interruptible(&port->port.delta_msr_wait); - if ((port->port.flags & ASYNC_CHECK_CD) && (status & UART_MSR_DDCD)) { + if (tty_port_check_carrier(&port->port) && (status & UART_MSR_DDCD)) { if (status & UART_MSR_DCD) wake_up_interruptible(&port->port.open_wait); } diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index b55f8468cde5..b67b54a800bb 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -1340,7 +1340,7 @@ static void mgsl_isr_io_pin( struct mgsl_struct *info ) wake_up_interruptible(&info->status_event_wait_q); wake_up_interruptible(&info->event_wait_q); - if ( (info->port.flags & ASYNC_CHECK_CD) && + if (tty_port_check_carrier(&info->port) && (status & MISCSTATUS_DCD_LATCHED) ) { if ( debug_level >= DEBUG_LEVEL_ISR ) printk("%s CD now %s...", info->device_name, @@ -1967,11 +1967,7 @@ static void mgsl_change_params(struct mgsl_struct *info) info->timeout += HZ/50; /* Add .02 seconds of slop */ tty_port_set_cts_flow(&info->port, cflag & CRTSCTS); - - if (cflag & CLOCAL) - info->port.flags &= ~ASYNC_CHECK_CD; - else - info->port.flags |= ASYNC_CHECK_CD; + tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL); /* process tty input control flags 
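
Every CLOCAL hunk in this commit applies the same transform; a condensed before-and-after of the idiom, mirroring the synclink, cyclades, and mxser hunks (illustrative excerpt, not a complete function):

	/* before: a two-branch, non-atomic read-modify-write of port->flags */
	if (cflag & CLOCAL)
		info->port.flags &= ~ASYNC_CHECK_CD;
	else
		info->port.flags |= ASYNC_CHECK_CD;

	/* after: one atomic helper call; ~cflag & CLOCAL is nonzero exactly
	 * when CLOCAL is clear, i.e. when carrier must be honoured */
	tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);

Since the helper takes a bool, ~cflag & CLOCAL and !(cflag & CLOCAL) are interchangeable here; the series settles on the former.
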
*/ diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index c76f546697dc..333652a8896a 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -2080,7 +2080,7 @@ static void dcd_change(struct slgt_info *info, unsigned short status) wake_up_interruptible(&info->event_wait_q); info->pending_bh |= BH_STATUS; - if (info->port.flags & ASYNC_CHECK_CD) { + if (tty_port_check_carrier(&info->port)) { if (info->signals & SerialSignal_DCD) wake_up_interruptible(&info->port.open_wait); else { @@ -2577,11 +2577,7 @@ static void change_params(struct slgt_info *info) info->timeout += HZ/50; /* Add .02 seconds of slop */ tty_port_set_cts_flow(&info->port, cflag & CRTSCTS); - - if (cflag & CLOCAL) - info->port.flags &= ~ASYNC_CHECK_CD; - else - info->port.flags |= ASYNC_CHECK_CD; + tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL); /* process tty input control flags */ diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index 95eddc4d9eb8..17bab5f5b858 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -2463,7 +2463,7 @@ static void isr_io_pin( SLMP_INFO *info, u16 status ) wake_up_interruptible(&info->status_event_wait_q); wake_up_interruptible(&info->event_wait_q); - if ( (info->port.flags & ASYNC_CHECK_CD) && + if (tty_port_check_carrier(&info->port) && (status & MISCSTATUS_DCD_LATCHED) ) { if ( debug_level >= DEBUG_LEVEL_ISR ) printk("%s CD now %s...", info->device_name, @@ -2814,11 +2814,7 @@ static void change_params(SLMP_INFO *info) info->timeout += HZ/50; /* Add .02 seconds of slop */ tty_port_set_cts_flow(&info->port, cflag & CRTSCTS); - - if (cflag & CLOCAL) - info->port.flags &= ~ASYNC_CHECK_CD; - else - info->port.flags |= ASYNC_CHECK_CD; + tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL); /* process tty input control flags */ diff --git a/include/linux/tty.h b/include/linux/tty.h index dbeeb8666ae4..4254dfb12fb1 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -584,6 +584,19 @@ static inline void tty_port_set_active(struct tty_port *port, bool val) clear_bit(TTY_PORT_ACTIVE, &port->iflags); } +static inline bool tty_port_check_carrier(struct tty_port *port) +{ + return test_bit(TTY_PORT_CHECK_CD, &port->iflags); +} + +static inline void tty_port_set_check_carrier(struct tty_port *port, bool val) +{ + if (val) + set_bit(TTY_PORT_CHECK_CD, &port->iflags); + else + clear_bit(TTY_PORT_CHECK_CD, &port->iflags); +} + extern struct tty_struct *tty_port_tty_get(struct tty_port *port); extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); extern int tty_port_carrier_raised(struct tty_port *port); diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 681fe0bfe558..5b7ce599c709 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -999,7 +999,7 @@ void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self) if (status & IRCOMM_DCE_DELTA_ANY) { /*wake_up_interruptible(&self->delta_msr_wait);*/ } - if ((self->port.flags & ASYNC_CHECK_CD) && (status & IRCOMM_DELTA_CD)) { + if (tty_port_check_carrier(&self->port) && (status & IRCOMM_DELTA_CD)) { pr_debug("%s(), ircomm%d CD now %s...\n", __func__ , self->line, (status & IRCOMM_CD) ? 
"on" : "off"); @@ -1255,7 +1255,7 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m) seq_printf(m, "%cASYNC_CTS_FLOW", sep); sep = '|'; } - if (self->port.flags & ASYNC_CHECK_CD) { + if (tty_port_check_carrier(&self->port)) { seq_printf(m, "%cASYNC_CHECK_CD", sep); sep = '|'; } diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c index 61137f8b5293..0a411019c098 100644 --- a/net/irda/ircomm/ircomm_tty_attach.c +++ b/net/irda/ircomm/ircomm_tty_attach.c @@ -968,7 +968,7 @@ static int ircomm_tty_state_ready(struct ircomm_tty_cb *self, ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); ircomm_tty_start_watchdog_timer(self, 3*HZ); - if (self->port.flags & ASYNC_CHECK_CD) { + if (tty_port_check_carrier(&self->port)) { /* Drop carrier */ self->settings.dce = IRCOMM_DELTA_CD; ircomm_tty_check_modem_status(self); diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c index 1220973c7c43..e24724db36a2 100644 --- a/net/irda/ircomm/ircomm_tty_ioctl.c +++ b/net/irda/ircomm/ircomm_tty_ioctl.c @@ -96,10 +96,7 @@ static void ircomm_tty_change_speed(struct ircomm_tty_cb *self, } else { self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN; } - if (cflag & CLOCAL) - self->port.flags &= ~ASYNC_CHECK_CD; - else - self->port.flags |= ASYNC_CHECK_CD; + tty_port_set_check_carrier(&self->port, ~cflag & CLOCAL); #if 0 /* * Set up parity check flag -- cgit v1.2.3 From 80f02d5424301bf4df195d09b1a664f394435851 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Apr 2016 17:53:24 -0700 Subject: tty: Replace ASYNC_SUSPENDED bit and update atomically Replace ASYNC_SUSPENDED bit in the tty_port::flags field with TTY_PORT_SUSPENDED bit in the tty_port::iflags field. Introduce helpers tty_port_set_suspended() and tty_port_suspended() to abstract atomic bit ops. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/s390/char/con3215.c | 12 ++++++------ drivers/tty/serial/serial_core.c | 8 ++++---- include/linux/tty.h | 13 +++++++++++++ 3 files changed, 23 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c index e7e078b3c7e6..114fe2845270 100644 --- a/drivers/s390/char/con3215.c +++ b/drivers/s390/char/con3215.c @@ -289,7 +289,7 @@ static void raw3215_timeout(unsigned long __data) spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags); raw->flags &= ~RAW3215_TIMER_RUNS; - if (!(raw->port.flags & ASYNC_SUSPENDED)) { + if (!tty_port_suspended(&raw->port)) { raw3215_mk_write_req(raw); raw3215_start_io(raw); if ((raw->queued_read || raw->queued_write) && @@ -312,7 +312,7 @@ static void raw3215_timeout(unsigned long __data) static inline void raw3215_try_io(struct raw3215_info *raw) { if (!(raw->port.flags & ASYNC_INITIALIZED) || - (raw->port.flags & ASYNC_SUSPENDED)) + tty_port_suspended(&raw->port)) return; if (raw->queued_read != NULL) raw3215_start_io(raw); @@ -494,7 +494,7 @@ static void raw3215_make_room(struct raw3215_info *raw, unsigned int length) /* While console is frozen for suspend we have no other * choice but to drop message from the buffer to make * room for even more messages. 
*/ - if (raw->port.flags & ASYNC_SUSPENDED) { + if (tty_port_suspended(&raw->port)) { raw3215_drop_line(raw); continue; } @@ -773,7 +773,7 @@ static int raw3215_pm_stop(struct ccw_device *cdev) raw = dev_get_drvdata(&cdev->dev); spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags); raw3215_make_room(raw, RAW3215_BUFFER_SIZE); - raw->port.flags |= ASYNC_SUSPENDED; + tty_port_set_suspended(&raw->port, 1); spin_unlock_irqrestore(get_ccwdev_lock(raw->cdev), flags); return 0; } @@ -786,7 +786,7 @@ static int raw3215_pm_start(struct ccw_device *cdev) /* Allow I/O again and flush output buffer. */ raw = dev_get_drvdata(&cdev->dev); spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags); - raw->port.flags &= ~ASYNC_SUSPENDED; + tty_port_set_suspended(&raw->port, 0); raw->flags |= RAW3215_FLUSHING; raw3215_try_io(raw); raw->flags &= ~RAW3215_FLUSHING; @@ -859,7 +859,7 @@ static void con3215_flush(void) unsigned long flags; raw = raw3215[0]; /* console 3215 is the first one */ - if (raw->port.flags & ASYNC_SUSPENDED) + if (tty_port_suspended(&raw->port)) /* The console is still frozen for suspend. */ if (ccw_device_force_console(raw->cdev)) /* Forcing didn't work, no panic message .. */ diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 2471380fb92e..933606777f45 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -249,7 +249,7 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state) * a DCD drop (hangup) at just the right time. Clear suspended bit so * we don't try to resume a port that has been shutdown. */ - clear_bit(ASYNCB_SUSPENDED, &port->flags); + tty_port_set_suspended(port, 0); /* * Free the transmit buffer page. @@ -2007,7 +2007,7 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport) const struct uart_ops *ops = uport->ops; int tries; - set_bit(ASYNCB_SUSPENDED, &port->flags); + tty_port_set_suspended(port, 1); clear_bit(ASYNCB_INITIALIZED, &port->flags); spin_lock_irq(&uport->lock); @@ -2088,7 +2088,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport) console_start(uport->cons); } - if (port->flags & ASYNC_SUSPENDED) { + if (tty_port_suspended(port)) { const struct uart_ops *ops = uport->ops; int ret; @@ -2118,7 +2118,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport) } } - clear_bit(ASYNCB_SUSPENDED, &port->flags); + tty_port_set_suspended(port, 0); } mutex_unlock(&port->mutex); diff --git a/include/linux/tty.h b/include/linux/tty.h index 4254dfb12fb1..7ac5add66c00 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -597,6 +597,19 @@ static inline void tty_port_set_check_carrier(struct tty_port *port, bool val) clear_bit(TTY_PORT_CHECK_CD, &port->iflags); } +static inline bool tty_port_suspended(struct tty_port *port) +{ + return test_bit(TTY_PORT_SUSPENDED, &port->iflags); +} + +static inline void tty_port_set_suspended(struct tty_port *port, bool val) +{ + if (val) + set_bit(TTY_PORT_SUSPENDED, &port->iflags); + else + clear_bit(TTY_PORT_SUSPENDED, &port->iflags); +} + extern struct tty_struct *tty_port_tty_get(struct tty_port *port); extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); extern int tty_port_carrier_raised(struct tty_port *port); -- cgit v1.2.3 From d41861ca19c9e96f12a4f1ebbc8255d00909a232 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Apr 2016 17:53:25 -0700 Subject: tty: Replace ASYNC_INITIALIZED bit and update atomically Replace ASYNC_INITIALIZED bit in the 
tty_port::flags field with TTY_PORT_INITIALIZED bit in the tty_port::iflags field. Introduce helpers tty_port_set_initialized() and tty_port_initialized() to abstract atomic bit ops. Note: the transforms for test_and_set_bit() and test_and_clear_bit() are unnecessary as the state transitions are already mutually exclusive; the tty lock prevents concurrent open/close/hangup. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/char/pcmcia/synclink_cs.c | 12 +++++----- drivers/ipack/devices/ipoctal.c | 5 ++--- drivers/isdn/i4l/isdn_tty.c | 10 ++++----- drivers/s390/char/con3215.c | 12 +++++----- drivers/tty/amiserial.c | 14 ++++++------ drivers/tty/cyclades.c | 14 ++++++------ drivers/tty/isicom.c | 6 ++--- drivers/tty/moxa.c | 10 ++++----- drivers/tty/mxser.c | 14 +++++------- drivers/tty/n_gsm.c | 8 +++---- drivers/tty/rocket.c | 10 ++++----- drivers/tty/serial/crisv10.c | 17 +++++++------- drivers/tty/serial/serial_core.c | 24 +++++++++++--------- drivers/tty/synclink.c | 46 ++++++++++++++++++-------------------- drivers/tty/synclink_gt.c | 16 ++++++------- drivers/tty/synclinkmp.c | 16 ++++++------- drivers/tty/tty_port.c | 13 ++++++----- drivers/usb/class/cdc-acm.c | 4 ++-- drivers/usb/serial/console.c | 4 ++-- drivers/usb/serial/generic.c | 6 ++--- drivers/usb/serial/mxuport.c | 6 ++--- drivers/usb/serial/sierra.c | 4 ++-- drivers/usb/serial/usb-serial.c | 2 +- drivers/usb/serial/usb_wwan.c | 4 ++-- include/linux/tty.h | 13 +++++++++++ net/irda/ircomm/ircomm_tty.c | 15 +++++++------ net/irda/ircomm/ircomm_tty_ioctl.c | 2 +- 27 files changed, 157 insertions(+), 150 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index bf54f4e23b6f..345ca7c7ea74 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -1272,7 +1272,7 @@ static int startup(MGSLPC_INFO * info, struct tty_struct *tty) if (debug_level >= DEBUG_LEVEL_INFO) printk("%s(%d):startup(%s)\n", __FILE__, __LINE__, info->device_name); - if (info->port.flags & ASYNC_INITIALIZED) + if (tty_port_initialized(&info->port)) return 0; if (!info->tx_buf) { @@ -1311,7 +1311,7 @@ static int startup(MGSLPC_INFO * info, struct tty_struct *tty) if (tty) clear_bit(TTY_IO_ERROR, &tty->flags); - info->port.flags |= ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 1); return 0; } @@ -1322,7 +1322,7 @@ static void shutdown(MGSLPC_INFO * info, struct tty_struct *tty) { unsigned long flags; - if (!(info->port.flags & ASYNC_INITIALIZED)) + if (!tty_port_initialized(&info->port)) return; if (debug_level >= DEBUG_LEVEL_INFO) @@ -1361,7 +1361,7 @@ static void shutdown(MGSLPC_INFO * info, struct tty_struct *tty) if (tty) set_bit(TTY_IO_ERROR, &tty->flags); - info->port.flags &= ~ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 0); } static void mgslpc_program_hw(MGSLPC_INFO *info, struct tty_struct *tty) @@ -2338,7 +2338,7 @@ static void mgslpc_close(struct tty_struct *tty, struct file * filp) if (tty_port_close_start(port, tty, filp) == 0) goto cleanup; - if (port->flags & ASYNC_INITIALIZED) + if (tty_port_initialized(port)) mgslpc_wait_until_sent(tty, info->timeout); mgslpc_flush_buffer(tty); @@ -2371,7 +2371,7 @@ static void mgslpc_wait_until_sent(struct tty_struct *tty, int timeout) if (mgslpc_paranoia_check(info, tty->name, "mgslpc_wait_until_sent")) return; - if (!(info->port.flags & ASYNC_INITIALIZED)) + if (!tty_port_initialized(&info->port)) goto exit; orig_jiffies = jiffies; diff --git 
a/drivers/ipack/devices/ipoctal.c b/drivers/ipack/devices/ipoctal.c index 035d5449227e..75dd15d66df6 100644 --- a/drivers/ipack/devices/ipoctal.c +++ b/drivers/ipack/devices/ipoctal.c @@ -629,8 +629,7 @@ static void ipoctal_hangup(struct tty_struct *tty) tty_port_hangup(&channel->tty_port); ipoctal_reset_channel(channel); - - clear_bit(ASYNCB_INITIALIZED, &channel->tty_port.flags); + tty_port_set_initialized(&channel->tty_port, 0); wake_up_interruptible(&channel->tty_port.open_wait); } @@ -642,7 +641,7 @@ static void ipoctal_shutdown(struct tty_struct *tty) return; ipoctal_reset_channel(channel); - clear_bit(ASYNCB_INITIALIZED, &channel->tty_port.flags); + tty_port_set_initialized(&channel->tty_port, 0); } static void ipoctal_cleanup(struct tty_struct *tty) diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c index 023a350a8cd8..63eaa0a9f8a1 100644 --- a/drivers/isdn/i4l/isdn_tty.c +++ b/drivers/isdn/i4l/isdn_tty.c @@ -1049,7 +1049,7 @@ isdn_tty_change_speed(modem_info *info) static int isdn_tty_startup(modem_info *info) { - if (info->port.flags & ASYNC_INITIALIZED) + if (tty_port_initialized(&info->port)) return 0; isdn_lock_drivers(); #ifdef ISDN_DEBUG_MODEM_OPEN @@ -1066,7 +1066,7 @@ isdn_tty_startup(modem_info *info) */ isdn_tty_change_speed(info); - info->port.flags |= ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 1); info->msr |= (UART_MSR_DSR | UART_MSR_CTS); info->send_outstanding = 0; return 0; @@ -1079,7 +1079,7 @@ isdn_tty_startup(modem_info *info) static void isdn_tty_shutdown(modem_info *info) { - if (!(info->port.flags & ASYNC_INITIALIZED)) + if (!tty_port_initialized(&info->port)) return; #ifdef ISDN_DEBUG_MODEM_OPEN printk(KERN_DEBUG "Shutting down isdnmodem port %d ....\n", info->line); @@ -1099,7 +1099,7 @@ isdn_tty_shutdown(modem_info *info) if (info->port.tty) set_bit(TTY_IO_ERROR, &info->port.tty->flags); - info->port.flags &= ~ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 0); } /* isdn_tty_write() is the main send-routine. It is called from the upper @@ -1577,7 +1577,7 @@ isdn_tty_close(struct tty_struct *tty, struct file *filp) * interrupt driver to stop checking the data ready bit in the * line status register. 
*/ - if (port->flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(port)) { tty_wait_until_sent(tty, 3000); /* 30 seconds timeout */ /* * Before we drop DTR, make sure the UART transmitter diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c index 114fe2845270..931d10e86837 100644 --- a/drivers/s390/char/con3215.c +++ b/drivers/s390/char/con3215.c @@ -311,8 +311,7 @@ static void raw3215_timeout(unsigned long __data) */ static inline void raw3215_try_io(struct raw3215_info *raw) { - if (!(raw->port.flags & ASYNC_INITIALIZED) || - tty_port_suspended(&raw->port)) + if (!tty_port_initialized(&raw->port) || tty_port_suspended(&raw->port)) return; if (raw->queued_read != NULL) raw3215_start_io(raw); @@ -616,10 +615,10 @@ static int raw3215_startup(struct raw3215_info *raw) { unsigned long flags; - if (raw->port.flags & ASYNC_INITIALIZED) + if (tty_port_initialized(&raw->port)) return 0; raw->line_pos = 0; - raw->port.flags |= ASYNC_INITIALIZED; + tty_port_set_initialized(&raw->port, 1); spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags); raw3215_try_io(raw); spin_unlock_irqrestore(get_ccwdev_lock(raw->cdev), flags); @@ -635,8 +634,7 @@ static void raw3215_shutdown(struct raw3215_info *raw) DECLARE_WAITQUEUE(wait, current); unsigned long flags; - if (!(raw->port.flags & ASYNC_INITIALIZED) || - (raw->flags & RAW3215_FIXED)) + if (!tty_port_initialized(&raw->port) || (raw->flags & RAW3215_FIXED)) return; /* Wait for outstanding requests, then free irq */ spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags); @@ -650,7 +648,7 @@ static void raw3215_shutdown(struct raw3215_info *raw) spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags); remove_wait_queue(&raw->empty_wait, &wait); set_current_state(TASK_RUNNING); - raw->port.flags &= ~ASYNC_INITIALIZED; + tty_port_set_initialized(&raw->port, 0); } spin_unlock_irqrestore(get_ccwdev_lock(raw->cdev), flags); } diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index b4ab97d56351..208f573495dc 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -525,7 +525,7 @@ static int startup(struct tty_struct *tty, struct serial_state *info) local_irq_save(flags); - if (port->flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(port)) { free_page(page); goto errout; } @@ -586,7 +586,7 @@ static int startup(struct tty_struct *tty, struct serial_state *info) */ change_speed(tty, info, NULL); - port->flags |= ASYNC_INITIALIZED; + tty_port_set_initialized(port, 1); local_irq_restore(flags); return 0; @@ -604,7 +604,7 @@ static void shutdown(struct tty_struct *tty, struct serial_state *info) unsigned long flags; struct serial_state *state; - if (!(info->tport.flags & ASYNC_INITIALIZED)) + if (!tty_port_initialized(&info->tport)) return; state = info; @@ -645,7 +645,7 @@ static void shutdown(struct tty_struct *tty, struct serial_state *info) set_bit(TTY_IO_ERROR, &tty->flags); - info->tport.flags &= ~ASYNC_INITIALIZED; + tty_port_set_initialized(&info->tport, 0); local_irq_restore(flags); } @@ -1084,7 +1084,7 @@ static int set_serial_info(struct tty_struct *tty, struct serial_state *state, port->low_latency = (port->flags & ASYNC_LOW_LATENCY) ? 1 : 0; check_and_exit: - if (port->flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(port)) { if (change_spd) { if ((port->flags & ASYNC_SPD_MASK) == ASYNC_SPD_HI) tty->alt_speed = 57600; @@ -1390,7 +1390,7 @@ static void rs_close(struct tty_struct *tty, struct file * filp) * line status register.
*/ state->read_status_mask &= ~UART_LSR_DR; - if (port->flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(port)) { /* disable receive interrupts */ custom.intena = IF_RBF; mb(); @@ -1538,7 +1538,7 @@ static inline void line_info(struct seq_file *m, int line, local_irq_save(flags); status = ciab.pra; - control = (state->tport.flags & ASYNC_INITIALIZED) ? state->MCR : status; + control = tty_port_initialized(&state->tport) ? state->MCR : status; local_irq_restore(flags); stat_buf[0] = 0; diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c index 9d1e19ba25cb..3840d6b421c4 100644 --- a/drivers/tty/cyclades.c +++ b/drivers/tty/cyclades.c @@ -1279,7 +1279,7 @@ static int cy_startup(struct cyclades_port *info, struct tty_struct *tty) spin_lock_irqsave(&card->card_lock, flags); - if (info->port.flags & ASYNC_INITIALIZED) + if (tty_port_initialized(&info->port)) goto errout; if (!info->type) { @@ -1364,7 +1364,7 @@ static int cy_startup(struct cyclades_port *info, struct tty_struct *tty) /* enable send, recv, modem !!! */ } - info->port.flags |= ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 1); clear_bit(TTY_IO_ERROR, &tty->flags); info->xmit_cnt = info->xmit_head = info->xmit_tail = 0; @@ -1424,7 +1424,7 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty) struct cyclades_card *card; unsigned long flags; - if (!(info->port.flags & ASYNC_INITIALIZED)) + if (!tty_port_initialized(&info->port)) return; card = info->card; @@ -1448,7 +1448,7 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty) some later date (after testing)!!! */ set_bit(TTY_IO_ERROR, &tty->flags); - info->port.flags &= ~ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 0); spin_unlock_irqrestore(&card->card_lock, flags); } else { #ifdef CY_DEBUG_OPEN @@ -1473,7 +1473,7 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty) tty_port_lower_dtr_rts(&info->port); set_bit(TTY_IO_ERROR, &tty->flags); - info->port.flags &= ~ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 0); spin_unlock_irqrestore(&card->card_lock, flags); } @@ -1711,7 +1711,7 @@ static void cy_do_close(struct tty_port *port) /* Stop accepting input */ cyy_writeb(info, CyCAR, channel & 0x03); cyy_writeb(info, CySRER, cyy_readb(info, CySRER) & ~CyRxData); - if (info->port.flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(&info->port)) { /* Waiting for on-board buffers to be empty before closing the port */ spin_unlock_irqrestore(&card->card_lock, flags); @@ -2334,7 +2334,7 @@ cy_set_serial_info(struct cyclades_port *info, struct tty_struct *tty, info->port.closing_wait = new_serial.closing_wait * HZ / 100; check_and_exit: - if (info->port.flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(&info->port)) { cy_set_line_char(info, tty); ret = 0; } else { diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c index 0b2bae1b2d55..b70187b46d9d 100644 --- a/drivers/tty/isicom.c +++ b/drivers/tty/isicom.c @@ -438,8 +438,8 @@ static void isicom_tx(unsigned long _data) for (; count > 0; count--, port++) { /* port not active or tx disabled to force flow control */ - if (!(port->port.flags & ASYNC_INITIALIZED) || - !(port->status & ISI_TXOK)) + if (!tty_port_initialized(&port->port) || + !(port->status & ISI_TXOK)) continue; txcount = min_t(short, TX_SIZE, port->xmit_cnt); @@ -553,7 +553,7 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } port = card->ports + channel; - if (!(port->port.flags & ASYNC_INITIALIZED)) { + 
if (!tty_port_initialized(&port->port)) { outw(0x0000, base+0x04); /* enable interrupts */ spin_unlock(&card->card_lock); return IRQ_HANDLED; diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c index ce521d3f58cb..60d37b225589 100644 --- a/drivers/tty/moxa.c +++ b/drivers/tty/moxa.c @@ -912,7 +912,7 @@ static void moxa_board_deinit(struct moxa_board_conf *brd) /* pci hot-un-plug support */ for (a = 0; a < brd->numPorts; a++) - if (brd->ports[a].port.flags & ASYNC_INITIALIZED) + if (tty_port_initialized(&brd->ports[a].port)) tty_port_tty_hangup(&brd->ports[a].port, false); for (a = 0; a < MAX_PORTS_PER_BOARD; a++) @@ -921,7 +921,7 @@ static void moxa_board_deinit(struct moxa_board_conf *brd) while (1) { opened = 0; for (a = 0; a < brd->numPorts; a++) - if (brd->ports[a].port.flags & ASYNC_INITIALIZED) + if (tty_port_initialized(&brd->ports[a].port)) opened++; mutex_unlock(&moxa_openlock); if (!opened) @@ -1192,13 +1192,13 @@ static int moxa_open(struct tty_struct *tty, struct file *filp) tty->driver_data = ch; tty_port_tty_set(&ch->port, tty); mutex_lock(&ch->port.mutex); - if (!(ch->port.flags & ASYNC_INITIALIZED)) { + if (!tty_port_initialized(&ch->port)) { ch->statusflags = 0; moxa_set_tty_param(tty, &tty->termios); MoxaPortLineCtrl(ch, 1, 1); MoxaPortEnable(ch); MoxaSetFifo(ch, ch->type == PORT_16550A); - ch->port.flags |= ASYNC_INITIALIZED; + tty_port_set_initialized(&ch->port, 1); } mutex_unlock(&ch->port.mutex); mutex_unlock(&moxa_openlock); @@ -1379,7 +1379,7 @@ static int moxa_poll_port(struct moxa_port *p, unsigned int handle, { struct tty_struct *tty = tty_port_tty_get(&p->port); void __iomem *ofsAddr; - unsigned int inited = p->port.flags & ASYNC_INITIALIZED; + unsigned int inited = tty_port_initialized(&p->port); u16 intr; if (tty) { diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c index ab618ef3d171..7e8c27bf1ac8 100644 --- a/drivers/tty/mxser.c +++ b/drivers/tty/mxser.c @@ -1081,12 +1081,10 @@ static void mxser_close(struct tty_struct *tty, struct file *filp) mutex_lock(&port->mutex); mxser_close_port(port); mxser_flush_buffer(tty); - if (test_bit(ASYNCB_INITIALIZED, &port->flags)) { - if (C_HUPCL(tty)) - tty_port_lower_dtr_rts(port); - } + if (tty_port_initialized(port) && C_HUPCL(tty)) + tty_port_lower_dtr_rts(port); mxser_shutdown_port(port); - clear_bit(ASYNCB_INITIALIZED, &port->flags); + tty_port_set_initialized(port, 0); mutex_unlock(&port->mutex); info->closing = 0; /* Right now the tty_port set is done outside of the close_end helper @@ -1282,7 +1280,7 @@ static int mxser_set_serial_info(struct tty_struct *tty, process_txrx_fifo(info); - if (test_bit(ASYNCB_INITIALIZED, &port->flags)) { + if (tty_port_initialized(port)) { if (flags != (port->flags & ASYNC_SPD_MASK)) { spin_lock_irqsave(&info->slock, sl_flags); mxser_change_speed(tty, NULL); @@ -1291,7 +1289,7 @@ static int mxser_set_serial_info(struct tty_struct *tty, } else { retval = mxser_activate(port, tty); if (retval == 0) - set_bit(ASYNCB_INITIALIZED, &port->flags); + tty_port_set_initialized(port, 1); } return retval; } @@ -2251,7 +2249,7 @@ static irqreturn_t mxser_interrupt(int irq, void *dev_id) iir &= MOXA_MUST_IIR_MASK; tty = tty_port_tty_get(&port->port); if (!tty || port->closing || - !(port->port.flags & ASYNC_INITIALIZED)) { + !tty_port_initialized(&port->port)) { status = inb(port->ioaddr + UART_LSR); outb(0x27, port->ioaddr + UART_FCR); inb(port->ioaddr + UART_MSR); diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 365dfd8bc42b..9f7a7bbff57d 100644 --- a/drivers/tty/n_gsm.c +++ 
b/drivers/tty/n_gsm.c @@ -2949,7 +2949,7 @@ static int gsmtty_open(struct tty_struct *tty, struct file *filp) dlci->modem_rx = 0; /* We could in theory open and close before we wait - eg if we get a DM straight back. This is ok as that will have caused a hangup */ - set_bit(ASYNCB_INITIALIZED, &port->flags); + tty_port_set_initialized(port, 1); /* Start sending off SABM messages */ gsm_dlci_begin_open(dlci); /* And wait for virtual carrier */ @@ -2972,10 +2972,8 @@ static void gsmtty_close(struct tty_struct *tty, struct file *filp) if (tty_port_close_start(&dlci->port, tty, filp) == 0) return; gsm_dlci_begin_close(dlci); - if (test_bit(ASYNCB_INITIALIZED, &dlci->port.flags)) { - if (C_HUPCL(tty)) - tty_port_lower_dtr_rts(&dlci->port); - } + if (tty_port_initialized(&dlci->port) && C_HUPCL(tty)) + tty_port_lower_dtr_rts(&dlci->port); tty_port_close_end(&dlci->port, tty); tty_port_tty_set(&dlci->port, NULL); return; diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c index eb8311b20782..7f3b1db88061 100644 --- a/drivers/tty/rocket.c +++ b/drivers/tty/rocket.c @@ -495,7 +495,7 @@ static void rp_handle_port(struct r_port *info) if (!info) return; - if ((info->port.flags & ASYNC_INITIALIZED) == 0) { + if (!tty_port_initialized(&info->port)) { printk(KERN_WARNING "rp: WARNING: rp_handle_port called with " "info->flags & NOT_INIT\n"); return; @@ -920,7 +920,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp) /* * Info->count is now 1; so it's safe to sleep now. */ - if (!test_bit(ASYNCB_INITIALIZED, &port->flags)) { + if (!tty_port_initialized(port)) { cp = &info->channel; sSetRxTrigger(cp, TRIG_1); if (sGetChanStatus(cp) & CD_ACT) @@ -944,7 +944,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp) sEnRxFIFO(cp); sEnTransmit(cp); - set_bit(ASYNCB_INITIALIZED, &info->port.flags); + tty_port_set_initialized(&info->port, 1); /* * Set up the tty->alt_speed kludge @@ -1042,9 +1042,9 @@ static void rp_close(struct tty_struct *tty, struct file *filp) } } spin_lock_irq(&port->lock); - port->flags &= ~ASYNC_INITIALIZED; tty->closing = 0; spin_unlock_irq(&port->lock); + tty_port_set_initialized(port, 0); tty_port_set_active(port, 0); mutex_unlock(&port->mutex); tty_port_tty_set(port, NULL); @@ -1513,7 +1513,7 @@ static void rp_hangup(struct tty_struct *tty) sDisCTSFlowCtl(cp); sDisTxSoftFlowCtl(cp); sClrTxXOFF(cp); - clear_bit(ASYNCB_INITIALIZED, &info->port.flags); + tty_port_set_initialized(&info->port, 0); wake_up_interruptible(&info->port.open_wait); } diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c index 92c8c628e00e..315c84979b18 100644 --- a/drivers/tty/serial/crisv10.c +++ b/drivers/tty/serial/crisv10.c @@ -2599,7 +2599,7 @@ startup(struct e100_serial * info) /* if it was already initialized, skip this */ - if (info->port.flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(&info->port)) { local_irq_restore(flags); free_page(xmit_page); return 0; @@ -2703,7 +2703,7 @@ startup(struct e100_serial * info) e100_rts(info, 1); e100_dtr(info, 1); - info->port.flags |= ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 1); local_irq_restore(flags); return 0; @@ -2745,7 +2745,7 @@ shutdown(struct e100_serial * info) info->tr_running = 0; } - if (!(info->port.flags & ASYNC_INITIALIZED)) + if (!tty_port_initialized(&info->port)) return; #ifdef SERIAL_DEBUG_OPEN @@ -2776,7 +2776,7 @@ shutdown(struct e100_serial * info) if (info->port.tty) set_bit(TTY_IO_ERROR, &info->port.tty->flags); - info->port.flags &= ~ASYNC_INITIALIZED; + 
tty_port_set_initialized(&info->port, 0); local_irq_restore(flags); } @@ -3273,9 +3273,9 @@ set_serial_info(struct e100_serial *info, info->port.low_latency = (info->port.flags & ASYNC_LOW_LATENCY) ? 1 : 0; check_and_exit: - if (info->port.flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(&info->port)) change_speed(info); - } else + else retval = startup(info); return retval; } @@ -3628,7 +3628,7 @@ rs_close(struct tty_struct *tty, struct file * filp) e100_disable_rx(info); e100_disable_rx_irq(info); - if (info->port.flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(&info->port)) { /* * Before we drop DTR, make sure the UART transmitter * has completely drained; this is especially @@ -3787,8 +3787,7 @@ block_til_ready(struct tty_struct *tty, struct file * filp, e100_dtr(info, 1); local_irq_restore(flags); set_current_state(TASK_INTERRUPTIBLE); - if (tty_hung_up_p(filp) || - !(info->port.flags & ASYNC_INITIALIZED)) { + if (tty_hung_up_p(filp) || !tty_port_initialized(&info->port)) { #ifdef SERIAL_DO_RESTART if (info->port.flags & ASYNC_HUP_NOTIFY) retval = -EAGAIN; diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 933606777f45..0c48051db172 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -196,7 +196,7 @@ static int uart_startup(struct tty_struct *tty, struct uart_state *state, struct tty_port *port = &state->port; int retval; - if (port->flags & ASYNC_INITIALIZED) + if (tty_port_initialized(port)) return 0; /* @@ -207,7 +207,7 @@ static int uart_startup(struct tty_struct *tty, struct uart_state *state, retval = uart_port_startup(tty, state, init_hw); if (!retval) { - set_bit(ASYNCB_INITIALIZED, &port->flags); + tty_port_set_initialized(port, 1); clear_bit(TTY_IO_ERROR, &tty->flags); } else if (retval > 0) retval = 0; @@ -231,7 +231,9 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state) if (tty) set_bit(TTY_IO_ERROR, &tty->flags); - if (test_and_clear_bit(ASYNCB_INITIALIZED, &port->flags)) { + if (tty_port_initialized(port)) { + tty_port_set_initialized(port, 0); + /* * Turn off DTR and RTS early. */ @@ -886,7 +888,7 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port, retval = 0; if (uport->type == PORT_UNKNOWN) goto exit; - if (port->flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(port)) { if (((old_flags ^ uport->flags) & UPF_SPD_MASK) || old_custom_divisor != uport->custom_divisor) { /* @@ -1390,7 +1392,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp) * At this point, we stop accepting input. To do this, we * disable the receive line status interrupts. 
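
Nearly every remaining hunk in this commit is an instance of the two guards sketched here (illustrative; example_startup/example_shutdown are generic stand-ins rather than functions from this series, and the sketch assumes linux/tty.h for struct tty_port and the helpers):

	static int example_startup(struct tty_port *port)
	{
		if (tty_port_initialized(port))
			return 0;	/* already up: later opens are no-ops */
		/* ... allocate buffers, program the hardware ... */
		tty_port_set_initialized(port, 1);
		return 0;
	}

	static void example_shutdown(struct tty_port *port)
	{
		if (!tty_port_initialized(port))
			return;		/* never started, or already torn down */
		/* ... stop the hardware, free buffers ... */
		tty_port_set_initialized(port, 0);
	}

Plain tty_port_set_initialized() suffices in both paths because, as the commit message notes, the tty lock already serializes open/close/hangup, so no test_and_set_bit()/test_and_clear_bit() style transition is needed.
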
*/ - if (port->flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(port)) { spin_lock_irq(&uport->lock); uport->ops->stop_rx(uport); spin_unlock_irq(&uport->lock); @@ -2003,12 +2005,12 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport) uport->suspended = 1; - if (port->flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(port)) { const struct uart_ops *ops = uport->ops; int tries; tty_port_set_suspended(port, 1); - clear_bit(ASYNCB_INITIALIZED, &port->flags); + tty_port_set_initialized(port, 0); spin_lock_irq(&uport->lock); ops->stop_tx(uport); @@ -2107,7 +2109,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport) ops->set_mctrl(uport, uport->mctrl); ops->start_tx(uport); spin_unlock_irq(&uport->lock); - set_bit(ASYNCB_INITIALIZED, &port->flags); + tty_port_set_initialized(port, 1); } else { /* * Failed to resume - maybe hardware went away? @@ -2248,10 +2250,10 @@ static int uart_poll_init(struct tty_driver *driver, int line, char *options) ret = 0; mutex_lock(&tport->mutex); /* - * We don't set ASYNCB_INITIALIZED as we only initialized the - * hw, e.g. state->xmit is still uninitialized. + * We don't set initialized as we only initialized the hw, + * e.g. state->xmit is still uninitialized. */ - if (!test_bit(ASYNCB_INITIALIZED, &tport->flags)) + if (!tty_port_initialized(tport)) ret = port->ops->poll_init(port); mutex_unlock(&tport->mutex); if (ret) diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index b67b54a800bb..bc4bc1ff775e 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -1749,13 +1749,13 @@ static irqreturn_t mgsl_interrupt(int dummy, void *dev_id) static int startup(struct mgsl_struct * info) { int retval = 0; - + if ( debug_level >= DEBUG_LEVEL_INFO ) printk("%s(%d):mgsl_startup(%s)\n",__FILE__,__LINE__,info->device_name); - - if (info->port.flags & ASYNC_INITIALIZED) + + if (tty_port_initialized(&info->port)) return 0; - + if (!info->xmit_buf) { /* allocate a page of memory for a transmit buffer */ info->xmit_buf = (unsigned char *)get_zeroed_page(GFP_KERNEL); @@ -1788,14 +1788,13 @@ static int startup(struct mgsl_struct * info) /* program hardware for current parameters */ mgsl_change_params(info); - + if (info->port.tty) clear_bit(TTY_IO_ERROR, &info->port.tty->flags); - info->port.flags |= ASYNC_INITIALIZED; - + tty_port_set_initialized(&info->port, 1); + return 0; - } /* end of startup() */ /* shutdown() @@ -1808,8 +1807,8 @@ static int startup(struct mgsl_struct * info) static void shutdown(struct mgsl_struct * info) { unsigned long flags; - - if (!(info->port.flags & ASYNC_INITIALIZED)) + + if (!tty_port_initialized(&info->port)) return; if (debug_level >= DEBUG_LEVEL_INFO) @@ -1853,13 +1852,12 @@ static void shutdown(struct mgsl_struct * info) spin_unlock_irqrestore(&info->irq_spinlock,flags); - mgsl_release_resources(info); - + mgsl_release_resources(info); + if (info->port.tty) set_bit(TTY_IO_ERROR, &info->port.tty->flags); - info->port.flags &= ~ASYNC_INITIALIZED; - + tty_port_set_initialized(&info->port, 0); } /* end of shutdown() */ static void mgsl_program_hw(struct mgsl_struct *info) @@ -3084,7 +3082,7 @@ static void mgsl_close(struct tty_struct *tty, struct file * filp) goto cleanup; mutex_lock(&info->port.mutex); - if (info->port.flags & ASYNC_INITIALIZED) + if (tty_port_initialized(&info->port)) mgsl_wait_until_sent(tty, info->timeout); mgsl_flush_buffer(tty); tty_ldisc_flush(tty); @@ -3122,15 +3120,15 @@ static void mgsl_wait_until_sent(struct tty_struct *tty, int timeout) if 
(debug_level >= DEBUG_LEVEL_INFO) printk("%s(%d):mgsl_wait_until_sent(%s) entry\n", __FILE__,__LINE__, info->device_name ); - + if (mgsl_paranoia_check(info, tty->name, "mgsl_wait_until_sent")) return; - if (!(info->port.flags & ASYNC_INITIALIZED)) + if (!tty_port_initialized(&info->port)) goto exit; - + orig_jiffies = jiffies; - + /* Set check interval to 1/5 of estimated time to * send a character, and make it at least 1. The check * interval should also be less than the timeout. @@ -3290,14 +3288,14 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp, port->count--; spin_unlock_irqrestore(&info->irq_spinlock, flags); port->blocked_open++; - + while (1) { - if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags)) + if (C_BAUD(tty) && tty_port_initialized(port)) tty_port_raise_dtr_rts(port); - + set_current_state(TASK_INTERRUPTIBLE); - - if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)){ + + if (tty_hung_up_p(filp) || !tty_port_initialized(port)) { retval = (port->flags & ASYNC_HUP_NOTIFY) ? -EAGAIN : -ERESTARTSYS; break; diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 333652a8896a..82c98b820335 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -726,7 +726,7 @@ static void close(struct tty_struct *tty, struct file *filp) goto cleanup; mutex_lock(&info->port.mutex); - if (info->port.flags & ASYNC_INITIALIZED) + if (tty_port_initialized(&info->port)) wait_until_sent(tty, info->timeout); flush_buffer(tty); tty_ldisc_flush(tty); @@ -893,7 +893,7 @@ static void wait_until_sent(struct tty_struct *tty, int timeout) if (sanity_check(info, tty->name, "wait_until_sent")) return; DBGINFO(("%s wait_until_sent entry\n", info->device_name)); - if (!(info->port.flags & ASYNC_INITIALIZED)) + if (!tty_port_initialized(&info->port)) goto exit; orig_jiffies = jiffies; @@ -2421,7 +2421,7 @@ static int startup(struct slgt_info *info) { DBGINFO(("%s startup\n", info->device_name)); - if (info->port.flags & ASYNC_INITIALIZED) + if (tty_port_initialized(&info->port)) return 0; if (!info->tx_buf) { @@ -2442,7 +2442,7 @@ static int startup(struct slgt_info *info) if (info->port.tty) clear_bit(TTY_IO_ERROR, &info->port.tty->flags); - info->port.flags |= ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 1); return 0; } @@ -2454,7 +2454,7 @@ static void shutdown(struct slgt_info *info) { unsigned long flags; - if (!(info->port.flags & ASYNC_INITIALIZED)) + if (!tty_port_initialized(&info->port)) return; DBGINFO(("%s shutdown\n", info->device_name)); @@ -2489,7 +2489,7 @@ static void shutdown(struct slgt_info *info) if (info->port.tty) set_bit(TTY_IO_ERROR, &info->port.tty->flags); - info->port.flags &= ~ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 0); } static void program_hw(struct slgt_info *info) @@ -3287,12 +3287,12 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, port->blocked_open++; while (1) { - if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags)) + if (C_BAUD(tty) && tty_port_initialized(port)) tty_port_raise_dtr_rts(port); set_current_state(TASK_INTERRUPTIBLE); - if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)){ + if (tty_hung_up_p(filp) || !tty_port_initialized(port)) { retval = (port->flags & ASYNC_HUP_NOTIFY) ? 
-EAGAIN : -ERESTARTSYS; break; diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index 17bab5f5b858..6dcfc2089373 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -812,7 +812,7 @@ static void close(struct tty_struct *tty, struct file *filp) goto cleanup; mutex_lock(&info->port.mutex); - if (info->port.flags & ASYNC_INITIALIZED) + if (tty_port_initialized(&info->port)) wait_until_sent(tty, info->timeout); flush_buffer(tty); @@ -1061,7 +1061,7 @@ static void wait_until_sent(struct tty_struct *tty, int timeout) if (sanity_check(info, tty->name, "wait_until_sent")) return; - if (!test_bit(ASYNCB_INITIALIZED, &info->port.flags)) + if (!tty_port_initialized(&info->port)) goto exit; orig_jiffies = jiffies; @@ -2636,7 +2636,7 @@ static int startup(SLMP_INFO * info) if ( debug_level >= DEBUG_LEVEL_INFO ) printk("%s(%d):%s tx_releaseup()\n",__FILE__,__LINE__,info->device_name); - if (info->port.flags & ASYNC_INITIALIZED) + if (tty_port_initialized(&info->port)) return 0; if (!info->tx_buf) { @@ -2662,7 +2662,7 @@ static int startup(SLMP_INFO * info) if (info->port.tty) clear_bit(TTY_IO_ERROR, &info->port.tty->flags); - info->port.flags |= ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 1); return 0; } @@ -2673,7 +2673,7 @@ static void shutdown(SLMP_INFO * info) { unsigned long flags; - if (!(info->port.flags & ASYNC_INITIALIZED)) + if (!tty_port_initialized(&info->port)) return; if (debug_level >= DEBUG_LEVEL_INFO) @@ -2705,7 +2705,7 @@ static void shutdown(SLMP_INFO * info) if (info->port.tty) set_bit(TTY_IO_ERROR, &info->port.tty->flags); - info->port.flags &= ~ASYNC_INITIALIZED; + tty_port_set_initialized(&info->port, 0); } static void program_hw(SLMP_INFO *info) @@ -3308,12 +3308,12 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, port->blocked_open++; while (1) { - if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags)) + if (C_BAUD(tty) && tty_port_initialized(port)) tty_port_raise_dtr_rts(port); set_current_state(TASK_INTERRUPTIBLE); - if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)){ + if (tty_hung_up_p(filp) || !tty_port_initialized(port)) { retval = (port->flags & ASYNC_HUP_NOTIFY) ? -EAGAIN : -ERESTARTSYS; break; diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index 130c8cf520cb..c3f9d93ba227 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -204,7 +204,8 @@ static void tty_port_shutdown(struct tty_port *port, struct tty_struct *tty) if (port->console) goto out; - if (test_and_clear_bit(ASYNCB_INITIALIZED, &port->flags)) { + if (tty_port_initialized(port)) { + tty_port_set_initialized(port, 0); /* * Drop DTR/RTS if HUPCL is set. This causes any attached * modem to hang up the line. @@ -393,13 +394,13 @@ int tty_port_block_til_ready(struct tty_port *port, while (1) { /* Indicate we are open */ - if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags)) + if (C_BAUD(tty) && tty_port_initialized(port)) tty_port_raise_dtr_rts(port); prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE); /* Check for a hangup or uninitialised port. 
Return accordingly */ - if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) { + if (tty_hung_up_p(filp) || !tty_port_initialized(port)) { if (port->flags & ASYNC_HUP_NOTIFY) retval = -EAGAIN; else @@ -480,7 +481,7 @@ int tty_port_close_start(struct tty_port *port, tty->closing = 1; - if (test_bit(ASYNCB_INITIALIZED, &port->flags)) { + if (tty_port_initialized(port)) { /* Don't block on a stalled port, just pull the chain */ if (tty->flow_stopped) tty_driver_flush_buffer(tty); @@ -578,7 +579,7 @@ int tty_port_open(struct tty_port *port, struct tty_struct *tty, mutex_lock(&port->mutex); - if (!test_bit(ASYNCB_INITIALIZED, &port->flags)) { + if (!tty_port_initialized(port)) { clear_bit(TTY_IO_ERROR, &tty->flags); if (port->ops->activate) { int retval = port->ops->activate(port, tty); @@ -587,7 +588,7 @@ int tty_port_open(struct tty_port *port, struct tty_struct *tty, return retval; } } - set_bit(ASYNCB_INITIALIZED, &port->flags); + tty_port_set_initialized(port, 1); } mutex_unlock(&port->mutex); return tty_port_block_til_ready(port, tty, filp); diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index a6c4a1b895bd..94a14f5dc4d4 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1680,7 +1680,7 @@ static int acm_resume(struct usb_interface *intf) if (--acm->susp_count) goto out; - if (test_bit(ASYNCB_INITIALIZED, &acm->port.flags)) { + if (tty_port_initialized(&acm->port)) { rv = usb_submit_urb(acm->ctrlurb, GFP_ATOMIC); for (;;) { @@ -1710,7 +1710,7 @@ static int acm_reset_resume(struct usb_interface *intf) { struct acm *acm = usb_get_intfdata(intf); - if (test_bit(ASYNCB_INITIALIZED, &acm->port.flags)) + if (tty_port_initialized(&acm->port)) tty_port_tty_hangup(&acm->port, false); return acm_resume(intf); diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c index a66b01bb1fa1..8967715fe6fc 100644 --- a/drivers/usb/serial/console.c +++ b/drivers/usb/serial/console.c @@ -127,7 +127,7 @@ static int usb_console_setup(struct console *co, char *options) info->port = port; ++port->port.count; - if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags)) { + if (!tty_port_initialized(&port->port)) { if (serial->type->set_termios) { /* * allocate a fake tty so the driver can initialize @@ -168,7 +168,7 @@ static int usb_console_setup(struct console *co, char *options) tty_port_tty_set(&port->port, NULL); tty_kref_put(tty); } - set_bit(ASYNCB_INITIALIZED, &port->port.flags); + tty_port_set_initialized(&port->port, 1); } /* Now that any required fake tty operations are completed restore * the tty port count */ diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index 54e170dd3dad..ae8c0365abd6 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -473,7 +473,7 @@ static bool usb_serial_generic_msr_changed(struct tty_struct *tty, * Use tty-port initialised flag to detect all hangups including the * one generated at USB-device disconnect. 
*/ - if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags)) + if (!tty_port_initialized(&port->port)) return true; spin_lock_irqsave(&port->lock, flags); @@ -503,7 +503,7 @@ int usb_serial_generic_tiocmiwait(struct tty_struct *tty, unsigned long arg) ret = wait_event_interruptible(port->port.delta_msr_wait, usb_serial_generic_msr_changed(tty, arg, &cnow)); - if (!ret && !test_bit(ASYNCB_INITIALIZED, &port->port.flags)) + if (!ret && !tty_port_initialized(&port->port)) ret = -EIO; return ret; @@ -606,7 +606,7 @@ int usb_serial_generic_resume(struct usb_serial *serial) for (i = 0; i < serial->num_ports; i++) { port = serial->port[i]; - if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags)) + if (!tty_port_initialized(&port->port)) continue; if (port->bulk_in_size) { diff --git a/drivers/usb/serial/mxuport.c b/drivers/usb/serial/mxuport.c index 31a8b47f1ac6..3722d6c1ba77 100644 --- a/drivers/usb/serial/mxuport.c +++ b/drivers/usb/serial/mxuport.c @@ -503,7 +503,7 @@ static void mxuport_process_read_urb_demux_data(struct urb *urb) return; } - if (test_bit(ASYNCB_INITIALIZED, &demux_port->port.flags)) { + if (tty_port_initialized(&demux_port->port)) { ch = data + HEADER_SIZE; mxuport_process_read_urb_data(demux_port, ch, rcv_len); } else { @@ -544,7 +544,7 @@ static void mxuport_process_read_urb_demux_event(struct urb *urb) } demux_port = serial->port[rcv_port]; - if (test_bit(ASYNCB_INITIALIZED, &demux_port->port.flags)) { + if (tty_port_initialized(&demux_port->port)) { ch = data + HEADER_SIZE; rcv_event = get_unaligned_be16(data + 2); mxuport_process_read_urb_event(demux_port, ch, @@ -1339,7 +1339,7 @@ static int mxuport_resume(struct usb_serial *serial) for (i = 0; i < serial->num_ports; i++) { port = serial->port[i]; - if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags)) + if (!tty_port_initialized(&port->port)) continue; r = usb_serial_generic_write_start(port, GFP_NOIO); diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c index 07d1ecd564f7..e1994e264cc0 100644 --- a/drivers/usb/serial/sierra.c +++ b/drivers/usb/serial/sierra.c @@ -776,7 +776,7 @@ static void sierra_close(struct usb_serial_port *port) /* * Need to take susp_lock to make sure port is not already being - * resumed, but no need to hold it due to ASYNC_INITIALIZED. + * resumed, but no need to hold it due to initialized */ spin_lock_irq(&intfdata->susp_lock); if (--intfdata->open_ports == 0) @@ -1039,7 +1039,7 @@ static int sierra_resume(struct usb_serial *serial) for (i = 0; i < serial->num_ports; i++) { port = serial->port[i]; - if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags)) + if (!tty_port_initialized(&port->port)) continue; err = sierra_submit_delayed_urbs(port); diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 46f1f13b41f1..3f253aec0c16 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -254,7 +254,7 @@ static int serial_open(struct tty_struct *tty, struct file *filp) * * Shut down a USB serial port. Serialized against activate by the * tport mutex and kept to matching open/close pairs - * of calls by the ASYNCB_INITIALIZED flag. + * of calls by the initialized flag. * * Not called if tty is console. 
*/ diff --git a/drivers/usb/serial/usb_wwan.c b/drivers/usb/serial/usb_wwan.c index be9cb61b4d19..3dfdfc81254b 100644 --- a/drivers/usb/serial/usb_wwan.c +++ b/drivers/usb/serial/usb_wwan.c @@ -464,7 +464,7 @@ void usb_wwan_close(struct usb_serial_port *port) /* * Need to take susp_lock to make sure port is not already being - * resumed, but no need to hold it due to ASYNC_INITIALIZED. + * resumed, but no need to hold it due to initialized */ spin_lock_irq(&intfdata->susp_lock); if (--intfdata->open_ports == 0) @@ -682,7 +682,7 @@ int usb_wwan_resume(struct usb_serial *serial) for (i = 0; i < serial->num_ports; i++) { port = serial->port[i]; - if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags)) + if (!tty_port_initialized(&port->port)) continue; portdata = usb_get_serial_port_data(port); diff --git a/include/linux/tty.h b/include/linux/tty.h index 7ac5add66c00..bf1bcdb01df0 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -610,6 +610,19 @@ static inline void tty_port_set_suspended(struct tty_port *port, bool val) clear_bit(TTY_PORT_SUSPENDED, &port->iflags); } +static inline bool tty_port_initialized(struct tty_port *port) +{ + return test_bit(TTY_PORT_INITIALIZED, &port->iflags); +} + +static inline void tty_port_set_initialized(struct tty_port *port, bool val) +{ + if (val) + set_bit(TTY_PORT_INITIALIZED, &port->iflags); + else + clear_bit(TTY_PORT_INITIALIZED, &port->iflags); +} + extern struct tty_struct *tty_port_tty_get(struct tty_port *port); extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); extern int tty_port_carrier_raised(struct tty_port *port); diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 5b7ce599c709..873c4b707d6a 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -220,10 +220,11 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self) IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); /* Check if already open */ - if (test_and_set_bit(ASYNCB_INITIALIZED, &self->port.flags)) { + if (tty_port_initialized(&self->port)) { pr_debug("%s(), already open so break out!\n", __func__); return 0; } + tty_port_set_initialized(&self->port, 1); /* Register with IrCOMM */ irda_notify_init(&notify); @@ -257,7 +258,7 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self) return 0; err: - clear_bit(ASYNCB_INITIALIZED, &self->port.flags); + tty_port_set_initialized(&self->port, 0); return ret; } @@ -318,13 +319,12 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, spin_unlock_irqrestore(&port->lock, flags); while (1) { - if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags)) + if (C_BAUD(tty) && tty_port_initialized(port)) tty_port_raise_dtr_rts(port); set_current_state(TASK_INTERRUPTIBLE); - if (tty_hung_up_p(filp) || - !test_bit(ASYNCB_INITIALIZED, &port->flags)) { + if (tty_hung_up_p(filp) || !tty_port_initialized(port)) { retval = (port->flags & ASYNC_HUP_NOTIFY) ?
-EAGAIN : -ERESTARTSYS; break; @@ -876,8 +876,9 @@ static void ircomm_tty_shutdown(struct ircomm_tty_cb *self) IRDA_ASSERT(self != NULL, return;); IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); - if (!test_and_clear_bit(ASYNCB_INITIALIZED, &self->port.flags)) + if (!tty_port_initialized(&self->port)) return; + tty_port_set_initialized(&self->port, 0); ircomm_tty_detach_cable(self); @@ -1259,7 +1260,7 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m) seq_printf(m, "%cASYNC_CHECK_CD", sep); sep = '|'; } - if (self->port.flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(&self->port)) { seq_printf(m, "%cASYNC_INITIALIZED", sep); sep = '|'; } diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c index e24724db36a2..d4fdf8f7b471 100644 --- a/net/irda/ircomm/ircomm_tty_ioctl.c +++ b/net/irda/ircomm/ircomm_tty_ioctl.c @@ -324,7 +324,7 @@ static int ircomm_tty_set_serial_info(struct ircomm_tty_cb *self, check_and_exit: - if (self->flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(self)) { if (((old_state.flags & ASYNC_SPD_MASK) != (self->flags & ASYNC_SPD_MASK)) || (old_driver.custom_divisor != driver->custom_divisor)) { -- cgit v1.2.3 From 9ed19428a51d53477e2b79be3303fa08f8575749 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Apr 2016 18:56:34 -0700 Subject: serial: core: Prevent unsafe uart port access, part 2 For serial core operations not already excluded by holding port->mutex, use reference counting to protect dereferencing the state->uart_port. Introduce helper functions, uart_port_ref() and uart_port_deref(), to wrap uart_port access, and helper macros, uart_port_lock() and uart_port_unlock(), to wrap the combination of uart_port access and uart port lock sections. Port removal in uart_remove_one_port() waits for the reference count to drop to zero before detaching the uart port from struct uart_state. For functions only reading the tx circular buffer indexes (where the uart port lock is claimed to prevent concurrent users), a NULL uart port is simply ignored and the operation completes normally. For functions that change the tx circular buffer indexes (where the uart port lock is claimed to prevent concurrent users), the operation is aborted if the uart port is NULL (i.e., it has been detached).
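A minimal sketch of the claim/release pattern these helpers enable (illustrative only, not part of the patch; the surrounding function and error code are assumptions):

static int example_get_modem_bits(struct uart_state *state)
{
	struct uart_port *uport;
	int mctrl;

	/* Pin the port so uart_remove_one_port() cannot detach it under us. */
	uport = uart_port_ref(state);
	if (!uport)
		return -EIO;	/* port has already been detached */

	spin_lock_irq(&uport->lock);
	mctrl = uport->ops->get_mctrl(uport);
	spin_unlock_irq(&uport->lock);

	/* Drop the reference; the final deref wakes the waiting remover. */
	uart_port_deref(uport);
	return mctrl;
}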
Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 171 +++++++++++++++++++++++++++++---------- include/linux/serial_core.h | 2 + 2 files changed, 130 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index e605f0328182..1887f9c71f85 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -64,6 +64,35 @@ static int uart_dcd_enabled(struct uart_port *uport) return !!(uport->status & UPSTAT_DCD_ENABLE); } +static inline struct uart_port *uart_port_ref(struct uart_state *state) +{ + if (atomic_add_unless(&state->refcount, 1, 0)) + return state->uart_port; + return NULL; +} + +static inline void uart_port_deref(struct uart_port *uport) +{ + if (uport && atomic_dec_and_test(&uport->state->refcount)) + wake_up(&uport->state->remove_wait); +} + +#define uart_port_lock(state, flags) \ + ({ \ + struct uart_port *__uport = uart_port_ref(state); \ + if (__uport) \ + spin_lock_irqsave(&__uport->lock, flags); \ + __uport; \ + }) + +#define uart_port_unlock(uport, flags) \ + ({ \ + struct uart_port *__uport = uport; \ + if (__uport) \ + spin_unlock_irqrestore(&__uport->lock, flags); \ + uart_port_deref(__uport); \ + }) + static inline struct uart_port *uart_port_check(struct uart_state *state) { #ifdef CONFIG_LOCKDEP @@ -90,12 +119,13 @@ void uart_write_wakeup(struct uart_port *port) static void uart_stop(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; - struct uart_port *port = state->uart_port; + struct uart_port *port; unsigned long flags; - spin_lock_irqsave(&port->lock, flags); - port->ops->stop_tx(port); - spin_unlock_irqrestore(&port->lock, flags); + port = uart_port_lock(state, flags); + if (port) + port->ops->stop_tx(port); + uart_port_unlock(port, flags); } static void __uart_start(struct tty_struct *tty) @@ -103,19 +133,19 @@ static void __uart_start(struct tty_struct *tty) struct uart_state *state = tty->driver_data; struct uart_port *port = state->uart_port; - if (!uart_tx_stopped(port)) + if (port && !uart_tx_stopped(port)) port->ops->start_tx(port); } static void uart_start(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; - struct uart_port *port = state->uart_port; + struct uart_port *port; unsigned long flags; - spin_lock_irqsave(&port->lock, flags); + port = uart_port_lock(state, flags); __uart_start(tty); - spin_unlock_irqrestore(&port->lock, flags); + uart_port_unlock(port, flags); } static void @@ -496,7 +526,7 @@ static void uart_change_speed(struct tty_struct *tty, struct uart_state *state, static int uart_put_char(struct tty_struct *tty, unsigned char c) { struct uart_state *state = tty->driver_data; - struct uart_port *port = state->uart_port; + struct uart_port *port; struct circ_buf *circ; unsigned long flags; int ret = 0; @@ -505,13 +535,13 @@ static int uart_put_char(struct tty_struct *tty, unsigned char c) if (!circ->buf) return 0; - spin_lock_irqsave(&port->lock, flags); - if (uart_circ_chars_free(circ) != 0) { + port = uart_port_lock(state, flags); + if (port && uart_circ_chars_free(circ) != 0) { circ->buf[circ->head] = c; circ->head = (circ->head + 1) & (UART_XMIT_SIZE - 1); ret = 1; } - spin_unlock_irqrestore(&port->lock, flags); + uart_port_unlock(port, flags); return ret; } @@ -538,14 +568,12 @@ static int uart_write(struct tty_struct *tty, return -EL3HLT; } - port = state->uart_port; circ = &state->xmit; - if (!circ->buf) return 0; - 
spin_lock_irqsave(&port->lock, flags); - while (1) { + port = uart_port_lock(state, flags); + while (port) { c = CIRC_SPACE_TO_END(circ->head, circ->tail, UART_XMIT_SIZE); if (count < c) c = count; @@ -559,32 +587,33 @@ static int uart_write(struct tty_struct *tty, } __uart_start(tty); - spin_unlock_irqrestore(&port->lock, flags); - + uart_port_unlock(port, flags); return ret; } static int uart_write_room(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; + struct uart_port *port; unsigned long flags; int ret; - spin_lock_irqsave(&state->uart_port->lock, flags); + port = uart_port_lock(state, flags); ret = uart_circ_chars_free(&state->xmit); - spin_unlock_irqrestore(&state->uart_port->lock, flags); + uart_port_unlock(port, flags); return ret; } static int uart_chars_in_buffer(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; + struct uart_port *port; unsigned long flags; int ret; - spin_lock_irqsave(&state->uart_port->lock, flags); + port = uart_port_lock(state, flags); ret = uart_circ_chars_pending(&state->xmit); - spin_unlock_irqrestore(&state->uart_port->lock, flags); + uart_port_unlock(port, flags); return ret; } @@ -603,14 +632,15 @@ static void uart_flush_buffer(struct tty_struct *tty) return; } - port = state->uart_port; pr_debug("uart_flush_buffer(%d) called\n", tty->index); - spin_lock_irqsave(&port->lock, flags); + port = uart_port_lock(state, flags); + if (!port) + return; uart_circ_clear(&state->xmit); if (port->ops->flush_buffer) port->ops->flush_buffer(port); - spin_unlock_irqrestore(&port->lock, flags); + uart_port_unlock(port, flags); tty_wakeup(tty); } @@ -621,9 +651,13 @@ static void uart_flush_buffer(struct tty_struct *tty) static void uart_send_xchar(struct tty_struct *tty, char ch) { struct uart_state *state = tty->driver_data; - struct uart_port *port = state->uart_port; + struct uart_port *port; unsigned long flags; + port = uart_port_ref(state); + if (!port) + return; + if (port->ops->send_xchar) port->ops->send_xchar(port, ch); else { @@ -633,14 +667,19 @@ static void uart_send_xchar(struct tty_struct *tty, char ch) port->ops->start_tx(port); spin_unlock_irqrestore(&port->lock, flags); } + uart_port_deref(port); } static void uart_throttle(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; - struct uart_port *port = state->uart_port; + struct uart_port *port; upstat_t mask = 0; + port = uart_port_ref(state); + if (!port) + return; + if (I_IXOFF(tty)) mask |= UPSTAT_AUTOXOFF; if (C_CRTSCTS(tty)) @@ -656,14 +695,20 @@ static void uart_throttle(struct tty_struct *tty) if (mask & UPSTAT_AUTOXOFF) uart_send_xchar(tty, STOP_CHAR(tty)); + + uart_port_deref(port); } static void uart_unthrottle(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; - struct uart_port *port = state->uart_port; + struct uart_port *port; upstat_t mask = 0; + port = uart_port_ref(state); + if (!port) + return; + if (I_IXOFF(tty)) mask |= UPSTAT_AUTOXOFF; if (C_CRTSCTS(tty)) @@ -679,6 +724,8 @@ static void uart_unthrottle(struct tty_struct *tty) if (mask & UPSTAT_AUTOXOFF) uart_send_xchar(tty, START_CHAR(tty)); + + uart_port_deref(port); } static int uart_get_info(struct tty_port *port, struct serial_struct *retinfo) @@ -1116,10 +1163,9 @@ static void uart_enable_ms(struct uart_port *uport) * FIXME: This wants extracting into a common all driver implementation * of TIOCMWAIT using tty_port. 
*/ -static int -uart_wait_modem_status(struct uart_state *state, unsigned long arg) +static int uart_wait_modem_status(struct uart_state *state, unsigned long arg) { - struct uart_port *uport = state->uart_port; + struct uart_port *uport; struct tty_port *port = &state->port; DECLARE_WAITQUEUE(wait, current); struct uart_icount cprev, cnow; @@ -1128,6 +1174,9 @@ uart_wait_modem_status(struct uart_state *state, unsigned long arg) /* * note the counters on entry */ + uport = uart_port_ref(state); + if (!uport) + return -EIO; spin_lock_irq(&uport->lock); memcpy(&cprev, &uport->icount, sizeof(struct uart_icount)); uart_enable_ms(uport); @@ -1161,6 +1210,7 @@ uart_wait_modem_status(struct uart_state *state, unsigned long arg) } __set_current_state(TASK_RUNNING); remove_wait_queue(&port->delta_msr_wait, &wait); + uart_port_deref(uport); return ret; } @@ -1176,11 +1226,15 @@ static int uart_get_icount(struct tty_struct *tty, { struct uart_state *state = tty->driver_data; struct uart_icount cnow; - struct uart_port *uport = state->uart_port; + struct uart_port *uport; + uport = uart_port_ref(state); + if (!uport) + return -EIO; spin_lock_irq(&uport->lock); memcpy(&cnow, &uport->icount, sizeof(struct uart_icount)); spin_unlock_irq(&uport->lock); + uart_port_deref(uport); icount->cts = cnow.cts; icount->dsr = cnow.dsr; @@ -1481,11 +1535,14 @@ static void uart_close(struct tty_struct *tty, struct file *filp) static void uart_wait_until_sent(struct tty_struct *tty, int timeout) { struct uart_state *state = tty->driver_data; - struct uart_port *port = state->uart_port; + struct uart_port *port; unsigned long char_time, expire; - if (port->type == PORT_UNKNOWN || port->fifosize == 0) + port = uart_port_ref(state); + if (!port || port->type == PORT_UNKNOWN || port->fifosize == 0) { + uart_port_deref(port); return; + } /* * Set the check interval to be 1/5 of the estimated time to @@ -1531,6 +1588,7 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout) if (time_after(jiffies, expire)) break; } + uart_port_deref(port); } /* @@ -1591,12 +1649,23 @@ static void uart_port_shutdown(struct tty_port *port) static int uart_carrier_raised(struct tty_port *port) { struct uart_state *state = container_of(port, struct uart_state, port); - struct uart_port *uport = state->uart_port; + struct uart_port *uport; int mctrl; + + uport = uart_port_ref(state); + /* + * Should never observe uport == NULL since checks for hangup should + * abort the tty_port_block_til_ready() loop before checking for carrier + * raised -- but report carrier raised if it does anyway so open will + * continue and not sleep + */ + if (WARN_ON(!uport)) + return 1; spin_lock_irq(&uport->lock); uart_enable_ms(uport); mctrl = uport->ops->get_mctrl(uport); spin_unlock_irq(&uport->lock); + uart_port_deref(uport); if (mctrl & TIOCM_CAR) return 1; return 0; @@ -1605,12 +1674,18 @@ static int uart_carrier_raised(struct tty_port *port) static void uart_dtr_rts(struct tty_port *port, int onoff) { struct uart_state *state = container_of(port, struct uart_state, port); - struct uart_port *uport = state->uart_port; + struct uart_port *uport; + + uport = uart_port_ref(state); + if (!uport) + return; if (onoff) uart_set_mctrl(uport, TIOCM_DTR | TIOCM_RTS); else uart_clear_mctrl(uport, TIOCM_DTR | TIOCM_RTS); + + uart_port_deref(uport); } /* @@ -2320,12 +2395,15 @@ static int uart_poll_get_char(struct tty_driver *driver, int line) struct uart_driver *drv = driver->driver_state; struct uart_state *state = drv->state + line; struct uart_port 
*port; + int ret = -1; - if (!state || !state->uart_port) - return -1; - - port = state->uart_port; - return port->ops->poll_get_char(port); + if (state) { + port = uart_port_ref(state); + if (port) + ret = port->ops->poll_get_char(port); + uart_port_deref(port); + } + return ret; } static void uart_poll_put_char(struct tty_driver *driver, int line, char ch) @@ -2334,14 +2412,17 @@ static void uart_poll_put_char(struct tty_driver *driver, int line, char ch) struct uart_state *state = drv->state + line; struct uart_port *port; - if (!state || !state->uart_port) + if (!state) return; - port = state->uart_port; + port = uart_port_ref(state); + if (!port) + return; if (ch == '\n') port->ops->poll_put_char(port, '\r'); port->ops->poll_put_char(port, ch); + uart_port_deref(port); } #endif @@ -2688,6 +2769,8 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport) } /* Link the port to the driver state table and vice versa */ + atomic_set(&state->refcount, 1); + init_waitqueue_head(&state->remove_wait); state->uart_port = uport; uport->state = state; @@ -2816,6 +2899,8 @@ int uart_remove_one_port(struct uart_driver *drv, struct uart_port *uport) uport->type = PORT_UNKNOWN; mutex_lock(&port->mutex); + WARN_ON(atomic_dec_return(&state->refcount) < 0); + wait_event(state->remove_wait, !atomic_read(&state->refcount)); state->uart_port = NULL; mutex_unlock(&port->mutex); out: diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index cbfcf38e220d..fd4ad4dce11a 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -281,6 +281,8 @@ struct uart_state { enum uart_pm_state pm_state; struct circ_buf xmit; + atomic_t refcount; + wait_queue_head_t remove_wait; struct uart_port *uart_port; }; -- cgit v1.2.3 From 8ede5cce4f0baff77ef63aa3cb3afc65d0317e0b Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 31 Mar 2016 10:08:16 +0200 Subject: tty: vt, make color_table const This means all ->con_set_palette have to have the second parameter const too now. 
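As an illustration of the new contract (a hypothetical driver callback, not taken from the patch; example_write_reg() is a made-up stand-in for real hardware I/O):

/* The palette table is only read, so const is the natural type here. */
static int example_set_palette(struct vc_data *vc, const unsigned char *table)
{
	int i;

	for (i = 0; i < 16; i++) {
		/* table[i] remaps console colour i to a hardware index. */
		example_write_reg(vc, i, table[i]);
	}
	return 0;
}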
Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 2 +- drivers/usb/misc/sisusbvga/sisusb_con.c | 2 +- drivers/video/console/fbcon.c | 4 ++-- drivers/video/console/mdacon.c | 2 +- drivers/video/console/newport_con.c | 2 +- drivers/video/console/sticon.c | 2 +- drivers/video/console/vgacon.c | 5 ++--- include/linux/console.h | 2 +- include/linux/selection.h | 2 +- 9 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index d1da391febb4..2c71b3bde174 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -1039,7 +1039,7 @@ struct vc_data *vc_deallocate(unsigned int currcons) #define VT100ID "\033[?1;2c" #define VT102ID "\033[?6c" -unsigned char color_table[] = { 0, 4, 2, 6, 1, 5, 3, 7, +const unsigned char color_table[] = { 0, 4, 2, 6, 1, 5, 3, 7, 8,12,10,14, 9,13,11,15 }; /* the default colour table, for VGA+ colour systems */ diff --git a/drivers/usb/misc/sisusbvga/sisusb_con.c b/drivers/usb/misc/sisusbvga/sisusb_con.c index ace343088915..afa853209f1d 100644 --- a/drivers/usb/misc/sisusbvga/sisusb_con.c +++ b/drivers/usb/misc/sisusbvga/sisusb_con.c @@ -601,7 +601,7 @@ sisusbcon_save_screen(struct vc_data *c) /* interface routine */ static int -sisusbcon_set_palette(struct vc_data *c, unsigned char *table) +sisusbcon_set_palette(struct vc_data *c, const unsigned char *table) { struct sisusb_usb_data *sisusb; int i, j; diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 6e92917ba77a..afd3301ac40c 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -170,7 +170,7 @@ static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, int height, int width); static int fbcon_switch(struct vc_data *vc); static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch); -static int fbcon_set_palette(struct vc_data *vc, unsigned char *table); +static int fbcon_set_palette(struct vc_data *vc, const unsigned char *table); static int fbcon_scrolldelta(struct vc_data *vc, int lines); /* @@ -2652,7 +2652,7 @@ static struct fb_cmap palette_cmap = { 0, 16, palette_red, palette_green, palette_blue, NULL }; -static int fbcon_set_palette(struct vc_data *vc, unsigned char *table) +static int fbcon_set_palette(struct vc_data *vc, const unsigned char *table) { struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; int i, j, k, depth; diff --git a/drivers/video/console/mdacon.c b/drivers/video/console/mdacon.c index 296e94561556..8edc062536a8 100644 --- a/drivers/video/console/mdacon.c +++ b/drivers/video/console/mdacon.c @@ -481,7 +481,7 @@ static int mdacon_switch(struct vc_data *c) return 1; /* redrawing needed */ } -static int mdacon_set_palette(struct vc_data *c, unsigned char *table) +static int mdacon_set_palette(struct vc_data *c, const unsigned char *table) { return -EINVAL; } diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index bb4e96255974..0553dfe684ef 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -574,7 +574,7 @@ static int newport_font_set(struct vc_data *vc, struct console_font *font, unsig return newport_set_font(vc->vc_num, font); } -static int newport_set_palette(struct vc_data *vc, unsigned char *table) +static int newport_set_palette(struct vc_data *vc, const unsigned char *table) { return -EINVAL; } diff --git a/drivers/video/console/sticon.c b/drivers/video/console/sticon.c index 026fd1215933..e440c2d9fe7c 
100644 --- a/drivers/video/console/sticon.c +++ b/drivers/video/console/sticon.c @@ -79,7 +79,7 @@ static const char *sticon_startup(void) return "STI console"; } -static int sticon_set_palette(struct vc_data *c, unsigned char *table) +static int sticon_set_palette(struct vc_data *c, const unsigned char *table) { return -EINVAL; } diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index 517f565b65d7..8bf911002cba 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -80,7 +80,6 @@ static void vgacon_deinit(struct vc_data *c); static void vgacon_cursor(struct vc_data *c, int mode); static int vgacon_switch(struct vc_data *c); static int vgacon_blank(struct vc_data *c, int blank, int mode_switch); -static int vgacon_set_palette(struct vc_data *vc, unsigned char *table); static int vgacon_scrolldelta(struct vc_data *c, int lines); static int vgacon_set_origin(struct vc_data *c); static void vgacon_save_screen(struct vc_data *c); @@ -847,7 +846,7 @@ static int vgacon_switch(struct vc_data *c) return 0; /* Redrawing not needed */ } -static void vga_set_palette(struct vc_data *vc, unsigned char *table) +static void vga_set_palette(struct vc_data *vc, const unsigned char *table) { int i, j; @@ -860,7 +859,7 @@ static void vga_set_palette(struct vc_data *vc, unsigned char *table) } } -static int vgacon_set_palette(struct vc_data *vc, unsigned char *table) +static int vgacon_set_palette(struct vc_data *vc, const unsigned char *table) { #ifdef CAN_LOAD_PALETTE if (vga_video_type != VIDEO_TYPE_VGAC || vga_palette_blanked diff --git a/include/linux/console.h b/include/linux/console.h index ea731af2451e..137ac1a1c16f 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -47,7 +47,7 @@ struct consw { int (*con_font_copy)(struct vc_data *, int); int (*con_resize)(struct vc_data *, unsigned int, unsigned int, unsigned int); - int (*con_set_palette)(struct vc_data *, unsigned char *); + int (*con_set_palette)(struct vc_data *, const unsigned char *); int (*con_scrolldelta)(struct vc_data *, int); int (*con_set_origin)(struct vc_data *); void (*con_save_screen)(struct vc_data *); diff --git a/include/linux/selection.h b/include/linux/selection.h index 85193aa8c1e3..7e6c4450b8a5 100644 --- a/include/linux/selection.h +++ b/include/linux/selection.h @@ -24,7 +24,7 @@ extern void mouse_report(struct tty_struct * tty, int butt, int mrx, int mry); extern int console_blanked; -extern unsigned char color_table[]; +extern const unsigned char color_table[]; extern int default_red[]; extern int default_grn[]; extern int default_blu[]; -- cgit v1.2.3 From 91e74ca5e7ac4ec6c61b84d6618eb5e401f852f0 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 31 Mar 2016 10:08:17 +0200 Subject: tty: vt, use proper type for default colors Every user of default_red, default_grn, and default_blu treats them as unsigned char. So make it really unsigned char. And indent the initializers and module_param properly. This saves ~ 100 bytes of data. 
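The parameters behave as before; for example, the palette can still be overridden at boot (an illustrative command line, assuming the usual vt. prefix for these built-in parameters):

	vt.default_red=0x00,0xaa,0x00,0xaa,0x00,0xaa,0x00,0xaa,0x55,0xff,0x55,0xff,0x55,0xff,0x55,0xff

Each element is now parsed as a byte rather than an int, which matches how every user of these arrays already treats them.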
Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 27 +++++++++++++++++---------- include/linux/selection.h | 6 +++--- 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 2c71b3bde174..8f9f8ed3ed09 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -1043,16 +1043,23 @@ const unsigned char color_table[] = { 0, 4, 2, 6, 1, 5, 3, 7, 8,12,10,14, 9,13,11,15 }; /* the default colour table, for VGA+ colour systems */ -int default_red[] = {0x00,0xaa,0x00,0xaa,0x00,0xaa,0x00,0xaa, - 0x55,0xff,0x55,0xff,0x55,0xff,0x55,0xff}; -int default_grn[] = {0x00,0x00,0xaa,0x55,0x00,0x00,0xaa,0xaa, - 0x55,0x55,0xff,0xff,0x55,0x55,0xff,0xff}; -int default_blu[] = {0x00,0x00,0x00,0x00,0xaa,0xaa,0xaa,0xaa, - 0x55,0x55,0x55,0x55,0xff,0xff,0xff,0xff}; - -module_param_array(default_red, int, NULL, S_IRUGO | S_IWUSR); -module_param_array(default_grn, int, NULL, S_IRUGO | S_IWUSR); -module_param_array(default_blu, int, NULL, S_IRUGO | S_IWUSR); +unsigned char default_red[] = { + 0x00, 0xaa, 0x00, 0xaa, 0x00, 0xaa, 0x00, 0xaa, + 0x55, 0xff, 0x55, 0xff, 0x55, 0xff, 0x55, 0xff +}; +module_param_array(default_red, byte, NULL, S_IRUGO | S_IWUSR); + +unsigned char default_grn[] = { + 0x00, 0x00, 0xaa, 0x55, 0x00, 0x00, 0xaa, 0xaa, + 0x55, 0x55, 0xff, 0xff, 0x55, 0x55, 0xff, 0xff +}; +module_param_array(default_grn, byte, NULL, S_IRUGO | S_IWUSR); + +unsigned char default_blu[] = { + 0x00, 0x00, 0x00, 0x00, 0xaa, 0xaa, 0xaa, 0xaa, + 0x55, 0x55, 0x55, 0x55, 0xff, 0xff, 0xff, 0xff +}; +module_param_array(default_blu, byte, NULL, S_IRUGO | S_IWUSR); /* * gotoxy() must verify all boundaries, because the arguments diff --git a/include/linux/selection.h b/include/linux/selection.h index 7e6c4450b8a5..8e4624efdb6f 100644 --- a/include/linux/selection.h +++ b/include/linux/selection.h @@ -25,9 +25,9 @@ extern void mouse_report(struct tty_struct * tty, int butt, int mrx, int mry); extern int console_blanked; extern const unsigned char color_table[]; -extern int default_red[]; -extern int default_grn[]; -extern int default_blu[]; +extern unsigned char default_red[]; +extern unsigned char default_grn[]; +extern unsigned char default_blu[]; extern unsigned short *screen_pos(struct vc_data *vc, int w_offset, int viewed); extern u16 screen_glyph(struct vc_data *vc, int offset); -- cgit v1.2.3 From 144ef5c2df9b473dad7eab375adcf5b11d0b1e47 Mon Sep 17 00:00:00 2001 From: Wan Ahmad Zainie Date: Wed, 6 Apr 2016 12:06:51 +0800 Subject: serial: 8250: export get_mctrl function Exposes the get_mctrl() function so that it can be overridden with a platform-specific implementation.
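A platform driver might use it like this (a hedged sketch; the DCD GPIO and all example_* names are made up for illustration):

/* Override get_mctrl but reuse the exported default for the UART bits. */
static unsigned int example_get_mctrl(struct uart_port *port)
{
	unsigned int mctrl = serial8250_do_get_mctrl(port);

	/* Example quirk: carrier detect is wired to a board GPIO,
	 * not to the UART's modem-status register. */
	if (gpio_get_value(EXAMPLE_DCD_GPIO))
		mctrl |= TIOCM_CAR;
	else
		mctrl &= ~TIOCM_CAR;

	return mctrl;
}

	/* at probe time, before registering the port: */
	uart.port.get_mctrl = example_get_mctrl;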
Signed-off-by: Wan Ahmad Zainie Reviewed-by: Andy Shevchenko Reviewed-by: Heikki Krogerus Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 3 +++ drivers/tty/serial/8250/8250_port.c | 10 +++++++++- include/linux/serial_8250.h | 2 ++ include/linux/serial_core.h | 1 + 4 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 2f4f5ee651db..0fbd7c033a25 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -830,6 +830,7 @@ static int serial8250_probe(struct platform_device *dev) uart.port.handle_irq = p->handle_irq; uart.port.handle_break = p->handle_break; uart.port.set_termios = p->set_termios; + uart.port.get_mctrl = p->get_mctrl; uart.port.pm = p->pm; uart.port.dev = &dev->dev; uart.port.irqflags |= irqflag; @@ -1022,6 +1023,8 @@ int serial8250_register_8250_port(struct uart_8250_port *up) /* Possibly override set_termios call */ if (up->port.set_termios) uart->port.set_termios = up->port.set_termios; + if (up->port.get_mctrl) + uart->port.get_mctrl = up->port.get_mctrl; if (up->port.set_mctrl) uart->port.set_mctrl = up->port.set_mctrl; if (up->port.startup) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 00ad2637b08c..a803ddfd5a59 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1882,7 +1882,7 @@ static unsigned int serial8250_tx_empty(struct uart_port *port) return (lsr & BOTH_EMPTY) == BOTH_EMPTY ? TIOCSER_TEMT : 0; } -static unsigned int serial8250_get_mctrl(struct uart_port *port) +unsigned int serial8250_do_get_mctrl(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned int status; @@ -1903,6 +1903,14 @@ static unsigned int serial8250_get_mctrl(struct uart_port *port) ret |= TIOCM_CTS; return ret; } +EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl); + +static unsigned int serial8250_get_mctrl(struct uart_port *port) +{ + if (port->get_mctrl) + return port->get_mctrl(port); + return serial8250_do_get_mctrl(port); +} void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl) { diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 434879759725..48ec7651989b 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -36,6 +36,7 @@ struct plat_serial8250_port { void (*set_termios)(struct uart_port *, struct ktermios *new, struct ktermios *old); + unsigned int (*get_mctrl)(struct uart_port *); int (*handle_irq)(struct uart_port *); void (*pm)(struct uart_port *, unsigned int state, unsigned old); @@ -148,6 +149,7 @@ extern int early_serial8250_setup(struct earlycon_device *device, const char *options); extern void serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, struct ktermios *old); +extern unsigned int serial8250_do_get_mctrl(struct uart_port *port); extern int serial8250_do_startup(struct uart_port *port); extern void serial8250_do_shutdown(struct uart_port *port); extern void serial8250_do_pm(struct uart_port *port, unsigned int state, diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index fd4ad4dce11a..a3d7c0d4a03e 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -123,6 +123,7 @@ struct uart_port { void (*set_termios)(struct uart_port *, struct ktermios *new, struct ktermios *old); + unsigned int (*get_mctrl)(struct uart_port *); void (*set_mctrl)(struct uart_port 
*, unsigned int); int (*startup)(struct uart_port *port); void (*shutdown)(struct uart_port *port); -- cgit v1.2.3 From e4234a1fc343ca35f852bc527fae56fade879d4a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 31 Mar 2016 11:45:06 +0100 Subject: kernfs: Move faulting copy_user operations outside of the mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A fault in a user provided buffer may lead anywhere, and lockdep warns that we have a potential deadlock between the mm->mmap_sem and the kernfs file mutex: [ 82.811702] ====================================================== [ 82.811705] [ INFO: possible circular locking dependency detected ] [ 82.811709] 4.5.0-rc4-gfxbench+ #1 Not tainted [ 82.811711] ------------------------------------------------------- [ 82.811714] kms_setmode/5859 is trying to acquire lock: [ 82.811717] (&dev->struct_mutex){+.+.+.}, at: [] drm_gem_mmap+0x1a1/0x270 [ 82.811731] but task is already holding lock: [ 82.811734] (&mm->mmap_sem){++++++}, at: [] vm_mmap_pgoff+0x44/0xa0 [ 82.811745] which lock already depends on the new lock. [ 82.811749] the existing dependency chain (in reverse order) is: [ 82.811752] -> #3 (&mm->mmap_sem){++++++}: [ 82.811761] [] lock_acquire+0xc3/0x1d0 [ 82.811766] [] __might_fault+0x75/0xa0 [ 82.811771] [] kernfs_fop_write+0x8a/0x180 [ 82.811787] [] __vfs_write+0x23/0xe0 [ 82.811792] [] vfs_write+0xa4/0x190 [ 82.811797] [] SyS_write+0x44/0xb0 [ 82.811801] [] entry_SYSCALL_64_fastpath+0x16/0x73 [ 82.811807] -> #2 (s_active#6){++++.+}: [ 82.811814] [] lock_acquire+0xc3/0x1d0 [ 82.811819] [] __kernfs_remove+0x210/0x2f0 [ 82.811823] [] kernfs_remove_by_name_ns+0x40/0xa0 [ 82.811828] [] sysfs_remove_file_ns+0x10/0x20 [ 82.811832] [] device_del+0x124/0x250 [ 82.811837] [] device_unregister+0x19/0x60 [ 82.811841] [] cpu_cache_sysfs_exit+0x51/0xb0 [ 82.811846] [] cacheinfo_cpu_callback+0x38/0x70 [ 82.811851] [] notifier_call_chain+0x39/0xa0 [ 82.811856] [] __raw_notifier_call_chain+0x9/0x10 [ 82.811860] [] cpu_notify+0x1e/0x40 [ 82.811865] [] cpu_notify_nofail+0x9/0x20 [ 82.811869] [] _cpu_down+0x233/0x340 [ 82.811874] [] disable_nonboot_cpus+0xc9/0x350 [ 82.811878] [] suspend_devices_and_enter+0x5a1/0xb50 [ 82.811883] [] pm_suspend+0x543/0x8d0 [ 82.811888] [] state_store+0x77/0xe0 [ 82.811892] [] kobj_attr_store+0xf/0x20 [ 82.811897] [] sysfs_kf_write+0x40/0x50 [ 82.811902] [] kernfs_fop_write+0x13c/0x180 [ 82.811906] [] __vfs_write+0x23/0xe0 [ 82.811910] [] vfs_write+0xa4/0x190 [ 82.811914] [] SyS_write+0x44/0xb0 [ 82.811918] [] entry_SYSCALL_64_fastpath+0x16/0x73 [ 82.811923] -> #1 (cpu_hotplug.lock){+.+.+.}: [ 82.811929] [] lock_acquire+0xc3/0x1d0 [ 82.811933] [] mutex_lock_nested+0x62/0x3b0 [ 82.811940] [] get_online_cpus+0x61/0x80 [ 82.811944] [] stop_machine+0x1b/0xe0 [ 82.811949] [] gen8_ggtt_insert_entries__BKL+0x2d/0x30 [i915] [ 82.812009] [] ggtt_bind_vma+0x46/0x70 [i915] [ 82.812045] [] i915_vma_bind+0x140/0x290 [i915] [ 82.812081] [] i915_gem_object_do_pin+0x899/0xb00 [i915] [ 82.812117] [] i915_gem_object_pin+0x35/0x40 [i915] [ 82.812154] [] intel_init_pipe_control+0xbe/0x210 [i915] [ 82.812192] [] intel_logical_rings_init+0xe2/0xde0 [i915] [ 82.812232] [] i915_gem_init+0xf3/0x130 [i915] [ 82.812278] [] i915_driver_load+0xf2d/0x1770 [i915] [ 82.812318] [] drm_dev_register+0xa4/0xb0 [ 82.812323] [] drm_get_pci_dev+0xce/0x1e0 [ 82.812328] [] i915_pci_probe+0x2f/0x50 [i915] [ 82.812360] [] pci_device_probe+0x87/0xf0 [ 82.812366] [] driver_probe_device+0x229/0x450 [ 82.812371] [] 
__driver_attach+0x83/0x90 [ 82.812375] [] bus_for_each_dev+0x61/0xa0 [ 82.812380] [] driver_attach+0x19/0x20 [ 82.812384] [] bus_add_driver+0x1ef/0x290 [ 82.812388] [] driver_register+0x5b/0xe0 [ 82.812393] [] __pci_register_driver+0x5b/0x60 [ 82.812398] [] drm_pci_init+0xd6/0x100 [ 82.812402] [] 0xffffffffa027c094 [ 82.812406] [] do_one_initcall+0xae/0x1d0 [ 82.812412] [] do_init_module+0x5b/0x1cb [ 82.812417] [] load_module+0x1c20/0x2480 [ 82.812422] [] SyS_finit_module+0x7e/0xa0 [ 82.812428] [] entry_SYSCALL_64_fastpath+0x16/0x73 [ 82.812433] -> #0 (&dev->struct_mutex){+.+.+.}: [ 82.812439] [] __lock_acquire+0x1fc9/0x20f0 [ 82.812443] [] lock_acquire+0xc3/0x1d0 [ 82.812456] [] drm_gem_mmap+0x1c7/0x270 [ 82.812460] [] mmap_region+0x334/0x580 [ 82.812466] [] do_mmap+0x364/0x410 [ 82.812470] [] vm_mmap_pgoff+0x6d/0xa0 [ 82.812474] [] SyS_mmap_pgoff+0x184/0x220 [ 82.812479] [] SyS_mmap+0x1d/0x20 [ 82.812484] [] entry_SYSCALL_64_fastpath+0x16/0x73 [ 82.812489] other info that might help us debug this: [ 82.812493] Chain exists of: &dev->struct_mutex --> s_active#6 --> &mm->mmap_sem [ 82.812502] Possible unsafe locking scenario: [ 82.812506] CPU0 CPU1 [ 82.812508] ---- ---- [ 82.812510] lock(&mm->mmap_sem); [ 82.812514] lock(s_active#6); [ 82.812519] lock(&mm->mmap_sem); [ 82.812522] lock(&dev->struct_mutex); [ 82.812526] *** DEADLOCK *** [ 82.812531] 1 lock held by kms_setmode/5859: [ 82.812533] #0: (&mm->mmap_sem){++++++}, at: [] vm_mmap_pgoff+0x44/0xa0 [ 82.812541] stack backtrace: [ 82.812547] CPU: 0 PID: 5859 Comm: kms_setmode Not tainted 4.5.0-rc4-gfxbench+ #1 [ 82.812550] Hardware name: /NUC5CPYB, BIOS PYBSWCEL.86A.0040.2015.0814.1353 08/14/2015 [ 82.812553] 0000000000000000 ffff880079407bf0 ffffffff813f8505 ffffffff825fb270 [ 82.812560] ffffffff825c4190 ffff880079407c30 ffffffff810c84ac ffff880079407c90 [ 82.812566] ffff8800797ed328 ffff8800797ecb00 0000000000000001 ffff8800797ed350 [ 82.812573] Call Trace: [ 82.812578] [] dump_stack+0x67/0x92 [ 82.812582] [] print_circular_bug+0x1fc/0x310 [ 82.812586] [] __lock_acquire+0x1fc9/0x20f0 [ 82.812590] [] lock_acquire+0xc3/0x1d0 [ 82.812594] [] ? drm_gem_mmap+0x1a1/0x270 [ 82.812599] [] drm_gem_mmap+0x1c7/0x270 [ 82.812603] [] ? drm_gem_mmap+0x1a1/0x270 [ 82.812608] [] mmap_region+0x334/0x580 [ 82.812612] [] do_mmap+0x364/0x410 [ 82.812616] [] vm_mmap_pgoff+0x6d/0xa0 [ 82.812629] [] SyS_mmap_pgoff+0x184/0x220 [ 82.812633] [] SyS_mmap+0x1d/0x20 [ 82.812637] [] entry_SYSCALL_64_fastpath+0x16/0x73 Highly unlikely though this scenario is, we can avoid the issue entirely by moving the copy operation from out under the kernfs_get_active() tracking by assigning the preallocated buffer its own mutex. The temporary buffer allocation doesn't require mutex locking as it is entirely local. The locked section was extended by the addition of the preallocated buf to speed up md user operations in commit 2b75869bba676c248d8d25ae6d2bd9221dfffdb6 Author: NeilBrown Date: Mon Oct 13 16:41:28 2014 +1100 sysfs/kernfs: allow attributes to request write buffer be pre-allocated. 
Reported-by: Ville Syrjälä Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94350 Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Cc: Ville Syrjälä Cc: Joonas Lahtinen Cc: NeilBrown Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 51 ++++++++++++++++++++++++++++---------------------- include/linux/kernfs.h | 1 + 2 files changed, 30 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 7247252ee9b1..e1574008adc9 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -190,15 +190,16 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, char *buf; buf = of->prealloc_buf; - if (!buf) + if (buf) + mutex_lock(&of->prealloc_mutex); + else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that - * the ops aren't called concurrently for the same open file, and - * to provide exclusive access to ->prealloc_buf (when that exists). + * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -214,21 +215,23 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, else len = -EINVAL; + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + if (len < 0) - goto out_unlock; + goto out_free; if (copy_to_user(user_buf, buf, len)) { len = -EFAULT; - goto out_unlock; + goto out_free; } *ppos += len; - out_unlock: - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); out_free: - if (buf != of->prealloc_buf) + if (buf == of->prealloc_buf) + mutex_unlock(&of->prealloc_mutex); + else kfree(buf); return len; } @@ -284,15 +287,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, } buf = of->prealloc_buf; - if (!buf) + if (buf) + mutex_lock(&of->prealloc_mutex); + else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_free; + } + buf[len] = '\0'; /* guarantee string termination */ + /* * @of->mutex nests outside active ref and is used both to ensure that - * the ops aren't called concurrently for the same open file, and - * to provide exclusive access to ->prealloc_buf (when that exists). + * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -301,26 +311,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, goto out_free; } - if (copy_from_user(buf, user_buf, len)) { - len = -EFAULT; - goto out_unlock; - } - buf[len] = '\0'; /* guarantee string termination */ - ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, *ppos); else len = -EINVAL; + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + if (len > 0) *ppos += len; -out_unlock: - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); out_free: - if (buf != of->prealloc_buf) + if (buf == of->prealloc_buf) + mutex_unlock(&of->prealloc_mutex); + else kfree(buf); return len; } @@ -687,6 +693,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) error = -ENOMEM; if (!of->prealloc_buf) goto err_free; + mutex_init(&of->prealloc_mutex); } /* diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index c06c44242f39..d306e282bb1d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -177,6 +177,7 @@ struct kernfs_open_file { /* private fields, do not use outside kernfs proper */ struct mutex mutex; + struct mutex prealloc_mutex; int event; struct list_head list; char *prealloc_buf; -- cgit v1.2.3 From a6341f000024cdf1ec14dc26743a409a17378db5 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Sat, 2 Apr 2016 17:59:46 -0700 Subject: Drivers: hv: vmbus: Introduce functions for estimating room in the ring buffer Introduce separate functions for estimating how much can be read from and written to the ring buffer. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/ring_buffer.c | 25 ++++--------------------- include/linux/hyperv.h | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index a40a73a7b71d..544362c6a9ca 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -38,8 +38,6 @@ void hv_begin_read(struct hv_ring_buffer_info *rbi) u32 hv_end_read(struct hv_ring_buffer_info *rbi) { - u32 read; - u32 write; rbi->ring_buffer->interrupt_mask = 0; mb(); @@ -49,9 +47,7 @@ u32 hv_end_read(struct hv_ring_buffer_info *rbi) * If it is not, we raced and we need to process new * incoming messages. */ - hv_get_ringbuffer_availbytes(rbi, &read, &write); - - return read; + return hv_get_bytes_to_read(rbi); } /* @@ -106,9 +102,6 @@ static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi) static bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi) { u32 cur_write_sz; - u32 r_size; - u32 write_loc; - u32 read_loc = rbi->ring_buffer->read_index; u32 pending_sz; /* @@ -125,14 +118,11 @@ static bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi) mb(); pending_sz = rbi->ring_buffer->pending_send_sz; - write_loc = rbi->ring_buffer->write_index; /* If the other end is not blocked on write don't bother. */ if (pending_sz == 0) return false; - r_size = rbi->ring_datasize; - cur_write_sz = write_loc >= read_loc ? 
r_size - (write_loc - read_loc) : - read_loc - write_loc; + cur_write_sz = hv_get_bytes_to_write(rbi); if (cur_write_sz >= pending_sz) return true; @@ -332,7 +322,6 @@ int hv_ringbuffer_write(struct hv_ring_buffer_info *outring_info, { int i = 0; u32 bytes_avail_towrite; - u32 bytes_avail_toread; u32 totalbytes_towrite = 0; u32 next_write_location; @@ -348,9 +337,7 @@ int hv_ringbuffer_write(struct hv_ring_buffer_info *outring_info, if (lock) spin_lock_irqsave(&outring_info->ring_lock, flags); - hv_get_ringbuffer_availbytes(outring_info, - &bytes_avail_toread, - &bytes_avail_towrite); + bytes_avail_towrite = hv_get_bytes_to_write(outring_info); /* * If there is only room for the packet, assume it is full. @@ -401,7 +388,6 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info, void *buffer, u32 buflen, u32 *buffer_actual_len, u64 *requestid, bool *signal, bool raw) { - u32 bytes_avail_towrite; u32 bytes_avail_toread; u32 next_read_location = 0; u64 prev_indices = 0; @@ -417,10 +403,7 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info, *buffer_actual_len = 0; *requestid = 0; - hv_get_ringbuffer_availbytes(inring_info, - &bytes_avail_toread, - &bytes_avail_towrite); - + bytes_avail_toread = hv_get_bytes_to_read(inring_info); /* Make sure there is something to read */ if (bytes_avail_toread < sizeof(desc)) { /* diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index aa0fadce9308..66226ceade37 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -151,6 +151,33 @@ hv_get_ringbuffer_availbytes(struct hv_ring_buffer_info *rbi, *read = dsize - *write; } +static inline u32 hv_get_bytes_to_read(struct hv_ring_buffer_info *rbi) +{ + u32 read_loc, write_loc, dsize, read; + + dsize = rbi->ring_datasize; + read_loc = rbi->ring_buffer->read_index; + write_loc = READ_ONCE(rbi->ring_buffer->write_index); + + read = write_loc >= read_loc ? (write_loc - read_loc) : + (dsize - read_loc) + write_loc; + + return read; +} + +static inline u32 hv_get_bytes_to_write(struct hv_ring_buffer_info *rbi) +{ + u32 read_loc, write_loc, dsize, write; + + dsize = rbi->ring_datasize; + read_loc = READ_ONCE(rbi->ring_buffer->read_index); + write_loc = rbi->ring_buffer->write_index; + + write = write_loc >= read_loc ? dsize - (write_loc - read_loc) : + read_loc - write_loc; + return write; +} + /* * VMBUS version is 32 bit entity broken up into * two 16 bit quantities: major_number. minor_number. -- cgit v1.2.3 From 5cc472477f928fb8584eb8e08245c9cf9002d74a Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Sat, 2 Apr 2016 17:59:49 -0700 Subject: Drivers: hv: vmbus: Export the vmbus_set_event() API In preparation for moving some ring buffer functionality out of the vmbus driver, export the API for signaling the host. Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/connection.c | 1 + drivers/hv/hyperv_vmbus.h | 2 -- include/linux/hyperv.h | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index d02f1373dd98..fcf8a02dc0ea 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -495,3 +495,4 @@ void vmbus_set_event(struct vmbus_channel *channel) hv_do_hypercall(HVCALL_SIGNAL_EVENT, channel->sig_event, NULL); } +EXPORT_SYMBOL_GPL(vmbus_set_event); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 12321b93a756..e5c586fab0e5 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -667,8 +667,6 @@ void vmbus_disconnect(void); int vmbus_post_msg(void *buffer, size_t buflen); -void vmbus_set_event(struct vmbus_channel *channel); - void vmbus_on_event(unsigned long data); void vmbus_on_msg_dpc(unsigned long data); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 66226ceade37..40fd608475f7 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1365,4 +1365,5 @@ extern __u32 vmbus_proto_version; int vmbus_send_tl_connect_request(const uuid_le *shv_guest_servie_id, const uuid_le *shv_host_servie_id); +void vmbus_set_event(struct vmbus_channel *channel); #endif /* _HYPERV_H */ -- cgit v1.2.3 From 687f32e6d9bd1d63c5e557e877809eb446f1a6e8 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Sat, 2 Apr 2016 17:59:50 -0700 Subject: Drivers: hv: vmbus: Move some ring buffer functions to hyperv.h In preparation for implementing APIs for in-place consumption of VMBUS packets, move some ring buffer functionality into hyperv.h Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/ring_buffer.c | 55 ------------------------------------------------ include/linux/hyperv.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 8f518af3074d..dd255c9b9420 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -84,52 +84,6 @@ static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi) return false; } -/* - * To optimize the flow management on the send-side, - * when the sender is blocked because of lack of - * sufficient space in the ring buffer, potential the - * consumer of the ring buffer can signal the producer. - * This is controlled by the following parameters: - * - * 1. pending_send_sz: This is the size in bytes that the - * producer is trying to send. - * 2. The feature bit feat_pending_send_sz set to indicate if - * the consumer of the ring will signal when the ring - * state transitions from being full to a state where - * there is room for the producer to send the pending packet. - */ - -static bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi) -{ - u32 cur_write_sz; - u32 pending_sz; - - /* - * Issue a full memory barrier before making the signaling decision. - * Here is the reason for having this barrier: - * If the reading of the pend_sz (in this function) - * were to be reordered and read before we commit the new read - * index (in the calling function) we could - * have a problem. If the host were to set the pending_sz after we - * have sampled pending_sz and go to sleep before we commit the - * read index, we could miss sending the interrupt. Issue a full - * memory barrier to address this.
- */ - virt_mb(); - - pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz); - /* If the other end is not blocked on write don't bother. */ - if (pending_sz == 0) - return false; - - cur_write_sz = hv_get_bytes_to_write(rbi); - - if (cur_write_sz >= pending_sz) - return true; - - return false; -} - /* Get the next write location for the specified ring buffer. */ static inline u32 hv_get_next_write_location(struct hv_ring_buffer_info *ring_info) @@ -180,15 +134,6 @@ hv_set_next_read_location(struct hv_ring_buffer_info *ring_info, ring_info->ring_buffer->read_index = next_read_location; } - -/* Get the start of the ring buffer. */ -static inline void * -hv_get_ring_buffer(struct hv_ring_buffer_info *ring_info) -{ - return (void *)ring_info->ring_buffer->buffer; -} - - /* Get the size of the ring buffer. */ static inline u32 hv_get_ring_buffersize(struct hv_ring_buffer_info *ring_info) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 40fd608475f7..eb7c0b215ba4 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1366,4 +1366,58 @@ extern __u32 vmbus_proto_version; int vmbus_send_tl_connect_request(const uuid_le *shv_guest_servie_id, const uuid_le *shv_host_servie_id); void vmbus_set_event(struct vmbus_channel *channel); + +/* Get the start of the ring buffer. */ +static inline void * +hv_get_ring_buffer(struct hv_ring_buffer_info *ring_info) +{ + return (void *)ring_info->ring_buffer->buffer; +} + +/* + * To optimize the flow management on the send-side, + * when the sender is blocked because of lack of + * sufficient space in the ring buffer, potential the + * consumer of the ring buffer can signal the producer. + * This is controlled by the following parameters: + * + * 1. pending_send_sz: This is the size in bytes that the + * producer is trying to send. + * 2. The feature bit feat_pending_send_sz set to indicate if + * the consumer of the ring will signal when the ring + * state transitions from being full to a state where + * there is room for the producer to send the pending packet. + */ + +static inline bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi) +{ + u32 cur_write_sz; + u32 pending_sz; + + /* + * Issue a full memory barrier before making the signaling decision. + * Here is the reason for having this barrier: + * If the reading of the pend_sz (in this function) + * were to be reordered and read before we commit the new read + * index (in the calling function) we could + * have a problem. If the host were to set the pending_sz after we + * have sampled pending_sz and go to sleep before we commit the + * read index, we could miss sending the interrupt. Issue a full + * memory barrier to address this. + */ + virt_mb(); + + pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz); + /* If the other end is not blocked on write don't bother. */ + if (pending_sz == 0) + return false; + + cur_write_sz = hv_get_bytes_to_write(rbi); + + if (cur_write_sz >= pending_sz) + return true; + + return false; +} + #endif /* _HYPERV_H */ -- cgit v1.2.3 From ab028db41ca9174caab7f9e3fc0a2e7f4a418410 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Sat, 2 Apr 2016 17:59:51 -0700 Subject: Drivers: hv: vmbus: Implement APIs to support "in place" consumption of vmbus packets Implement APIs for in-place consumption of vmbus packets. Currently, each packet is copied and processed one at a time and as part of processing each packet we potentially may signal the host (if it is waiting for room to produce a packet). 
These APIs help batched in-place processing of vmbus packets. We also optimize host signaling by having a separate API to signal the end of in-place consumption. With netvsc using these APIs, on an iperf run on average I see about 20X reduction in checks to signal the host. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/ring_buffer.c | 1 + include/linux/hyperv.h | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) (limited to 'include/linux') diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index dd255c9b9420..fe586bf74e17 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -132,6 +132,7 @@ hv_set_next_read_location(struct hv_ring_buffer_info *ring_info, u32 next_read_location) { ring_info->ring_buffer->read_index = next_read_location; + ring_info->priv_read_index = next_read_location; } /* Get the size of the ring buffer. */ diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index eb7c0b215ba4..590fee6a2e6f 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -126,6 +126,8 @@ struct hv_ring_buffer_info { u32 ring_datasize; /* < ring_size */ u32 ring_data_startoffset; + u32 priv_write_index; + u32 priv_read_index; }; /* @@ -1420,4 +1422,88 @@ static inline bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi) return false; } +/* + * An API to support in-place processing of incoming VMBUS packets. + */ +#define VMBUS_PKT_TRAILER 8 + +static inline struct vmpacket_descriptor * +get_next_pkt_raw(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + u32 read_loc = ring_info->priv_read_index; + void *ring_buffer = hv_get_ring_buffer(ring_info); + struct vmpacket_descriptor *cur_desc; + u32 packetlen; + u32 dsize = ring_info->ring_datasize; + u32 delta = read_loc - ring_info->ring_buffer->read_index; + u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta); + + if (bytes_avail_toread < sizeof(struct vmpacket_descriptor)) + return NULL; + + if ((read_loc + sizeof(*cur_desc)) > dsize) + return NULL; + + cur_desc = ring_buffer + read_loc; + packetlen = cur_desc->len8 << 3; + + /* + * If the packet under consideration is wrapping around, + * return failure. + */ + if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > (dsize - 1)) + return NULL; + + return cur_desc; +} + +/* + * A helper function to step through packets "in-place" + * This API is to be called after each successful call + * get_next_pkt_raw(). + */ +static inline void put_pkt_raw(struct vmbus_channel *channel, + struct vmpacket_descriptor *desc) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + u32 read_loc = ring_info->priv_read_index; + u32 packetlen = desc->len8 << 3; + u32 dsize = ring_info->ring_datasize; + + if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > dsize) + BUG(); + /* + * Include the packet trailer. + */ + ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER; +} + +/* + * This call commits the read index and potentially signals the host. 
+ * Here is the pattern for using the "in-place" consumption APIs: + * + * while (get_next_pkt_raw() { + * process the packet "in-place"; + * put_pkt_raw(); + * } + * if (packets processed in place) + * commit_rd_index(); + */ +static inline void commit_rd_index(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + /* + * Make sure all reads are done before we update the read index since + * the writer may start writing to the read area once the read index + * is updated. + */ + virt_rmb(); + ring_info->ring_buffer->read_index = ring_info->priv_read_index; + + if (hv_need_to_signal_on_read(ring_info)) + vmbus_set_event(channel); +} + + #endif /* _HYPERV_H */ -- cgit v1.2.3 From 97fb77dc87582300fa3c141b63699f853576cab1 Mon Sep 17 00:00:00 2001 From: Jake Oshins Date: Tue, 5 Apr 2016 10:22:51 -0700 Subject: drivers:hv: Make a function to free mmio regions through vmbus This patch introduces a function that reverses everything done by vmbus_allocate_mmio(). Existing code just called release_mem_region(). Future patches in this series require a more complex sequence of actions, so this function is introduced to wrap those actions. Signed-off-by: Jake Oshins Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/vmbus_drv.c | 15 +++++++++++++++ include/linux/hyperv.h | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 799518b3cdc5..60553c156f90 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1187,6 +1187,21 @@ exit: } EXPORT_SYMBOL_GPL(vmbus_allocate_mmio); +/** + * vmbus_free_mmio() - Free a memory-mapped I/O range. + * @start: Base address of region to release. + * @size: Size of the range to be allocated + * + * This function releases anything requested by + * vmbus_mmio_allocate(). + */ +void vmbus_free_mmio(resource_size_t start, resource_size_t size) +{ + release_mem_region(start, size); + +} +EXPORT_SYMBOL_GPL(vmbus_free_mmio); + /** * vmbus_cpu_number_to_vp_number() - Map CPU to VP. * @cpu_number: CPU number in Linux terms diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 590fee6a2e6f..b10954a66939 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1120,7 +1120,7 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, resource_size_t min, resource_size_t max, resource_size_t size, resource_size_t align, bool fb_overlap_ok); - +void vmbus_free_mmio(resource_size_t start, resource_size_t size); int vmbus_cpu_number_to_vp_number(int cpu_number); u64 hv_do_hypercall(u64 control, void *input, void *output); -- cgit v1.2.3 From 05d1a717ec0430c916a749b94eb90ab74bbfa356 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Mon, 29 Feb 2016 19:52:05 -0500 Subject: ima: add support for creating files using the mknodat syscall Commit 3034a14 "ima: pass 'opened' flag to identify newly created files" stopped identifying empty files as new files. However new empty files can be created using the mknodat syscall. On systems with IMA-appraisal enabled, these empty files are not labeled with security.ima extended attributes properly, preventing them from subsequently being opened in order to write the file data contents. This patch defines a new hook named ima_post_path_mknod() to mark these empty files, created using mknodat, as new in order to allow the file data contents to be written. 
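For illustration, the sequence that previously failed is roughly the following userspace pattern (hypothetical path, error handling omitted):

	#include <sys/stat.h>
	#include <fcntl.h>

	/* create an empty regular file without going through open(O_CREAT) */
	mknod("/tmp/example", S_IFREG | 0600, 0);

	/* before this patch, IMA-appraisal could refuse this open: the empty
	 * file was never marked new and carries no usable security.ima xattr */
	int fd = open("/tmp/example", O_WRONLY);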
In addition, files with security.ima xattrs containing a file signature are considered "immutable" and can not be modified. The file contents need to be written, before signing the file. This patch relaxes this requirement for new files, allowing the file signature to be written before the file contents. Changelog: - defer identifying files with signatures stored as security.ima (based on Dmitry Rozhkov's comments) - removing tests (eg. dentry, dentry->d_inode, inode->i_size == 0) (based on Al's review) Signed-off-by: Mimi Zohar Cc: Al Viro < Tested-by: Dmitry Rozhkov --- fs/namei.c | 2 ++ include/linux/ima.h | 6 ++++++ security/integrity/ima/ima_appraise.c | 5 +++++ security/integrity/ima/ima_main.c | 25 ++++++++++++++++++++++++- 4 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 1d9ca2d5dff6..b4bd06839446 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3608,6 +3608,8 @@ retry: switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(path.dentry->d_inode,dentry,mode,true); + if (!error) + ima_post_path_mknod(dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(path.dentry->d_inode,dentry,mode, diff --git a/include/linux/ima.h b/include/linux/ima.h index e6516cbbe9bf..0eb7c2e7f0d6 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -21,6 +21,7 @@ extern int ima_file_mmap(struct file *file, unsigned long prot); extern int ima_read_file(struct file *file, enum kernel_read_file_id id); extern int ima_post_read_file(struct file *file, void *buf, loff_t size, enum kernel_read_file_id id); +extern void ima_post_path_mknod(struct dentry *dentry); #else static inline int ima_bprm_check(struct linux_binprm *bprm) @@ -54,6 +55,11 @@ static inline int ima_post_read_file(struct file *file, void *buf, loff_t size, return 0; } +static inline void ima_post_path_mknod(struct dentry *dentry) +{ + return; +} + #endif /* CONFIG_IMA */ #ifdef CONFIG_IMA_APPRAISE diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index d2f28a0c8614..1bcbc12e03d9 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -275,6 +275,11 @@ out: xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; + } else if ((inode->i_size == 0) && + (iint->flags & IMA_NEW_FILE) && + (xattr_value && + xattr_value->type == EVM_IMA_XATTR_DIGSIG)) { + status = INTEGRITY_PASS; } integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, filename, op, cause, rc, 0); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 391f41751021..68b26c340acd 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -246,7 +246,8 @@ static int process_measurement(struct file *file, char *buf, loff_t size, ima_audit_measurement(iint, pathname); out_digsig: - if ((mask & MAY_WRITE) && (iint->flags & IMA_DIGSIG)) + if ((mask & MAY_WRITE) && (iint->flags & IMA_DIGSIG) && + !(iint->flags & IMA_NEW_FILE)) rc = -EACCES; kfree(xattr_value); out_free: @@ -315,6 +316,28 @@ int ima_file_check(struct file *file, int mask, int opened) } EXPORT_SYMBOL_GPL(ima_file_check); +/** + * ima_post_path_mknod - mark as a new inode + * @dentry: newly created dentry + * + * Mark files created via the mknodat syscall as new, so that the + * file data can be written later. 
+ */ +void ima_post_path_mknod(struct dentry *dentry) +{ + struct integrity_iint_cache *iint; + struct inode *inode = dentry->d_inode; + int must_appraise; + + must_appraise = ima_must_appraise(inode, MAY_ACCESS, FILE_CHECK); + if (!must_appraise) + return; + + iint = integrity_inode_get(inode); + if (iint) + iint->flags |= IMA_NEW_FILE; +} + /** * ima_read_file - pre-measure/appraise hook decision based on policy * @file: pointer to the file to be measured/appraised/audit -- cgit v1.2.3 From 0f40fbbcc34e093255a2b2d70b6b0fb48c3f39aa Mon Sep 17 00:00:00 2001 From: Brian Bloniarz Date: Sun, 6 Mar 2016 13:16:30 -0800 Subject: Fix OpenSSH pty regression on close OpenSSH expects the (non-blocking) read() of pty master to return EAGAIN only if it has received all of the slave-side output after it has received SIGCHLD. This used to work on pre-3.12 kernels. This fix effectively forces non-blocking read() and poll() to block for parallel i/o to complete for all ttys. It also unwinds these changes: 1) f8747d4a466ab2cafe56112c51b3379f9fdb7a12 tty: Fix pty master read() after slave closes 2) 52bce7f8d4fc633c9a9d0646eef58ba6ae9a3b73 pty, n_tty: Simplify input processing on final close 3) 1a48632ffed61352a7810ce089dc5a8bcd505a60 pty: Fix input race when closing Inspired by analysis and patch from Marc Aurele La France Reported-by: Volth Reported-by: Marc Aurele La France BugLink: https://bugzilla.mindrot.org/show_bug.cgi?id=52 BugLink: https://bugzilla.mindrot.org/show_bug.cgi?id=2492 Signed-off-by: Brian Bloniarz Reviewed-by: Peter Hurley Cc: stable Signed-off-by: Greg Kroah-Hartman --- Documentation/serial/tty.txt | 3 -- drivers/tty/n_hdlc.c | 4 +-- drivers/tty/n_tty.c | 70 +++++++++++++++++++++----------------------- drivers/tty/pty.c | 4 +-- drivers/tty/tty_buffer.c | 34 ++++----------------- include/linux/tty.h | 2 +- 6 files changed, 43 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/Documentation/serial/tty.txt b/Documentation/serial/tty.txt index 798cba82c762..b48780977a68 100644 --- a/Documentation/serial/tty.txt +++ b/Documentation/serial/tty.txt @@ -210,9 +210,6 @@ TTY_IO_ERROR If set, causes all subsequent userspace read/write TTY_OTHER_CLOSED Device is a pty and the other side has closed. -TTY_OTHER_DONE Device is a pty and the other side has closed and - all pending input processing has been completed. - TTY_NO_WRITE_SPLIT Prevent driver from splitting up writes into smaller chunks. 
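The guarantee being restored can be sketched as follows (hypothetical code in the spirit of sshd's cleanup path; master_fd is a pty master already set to O_NONBLOCK):

	#include <errno.h>
	#include <unistd.h>

	/* after SIGCHLD: keep reading until EAGAIN, which must mean
	 * "all slave-side output was received", not "not flushed yet" */
	char buf[512];
	ssize_t n;

	while ((n = read(master_fd, buf, sizeof(buf))) > 0)
		deliver(buf, n);	/* assumed helper */

	/* n < 0 && errno == EAGAIN: no pending input remains */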
diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c index bcaba17688f6..a7fa016f31eb 100644 --- a/drivers/tty/n_hdlc.c +++ b/drivers/tty/n_hdlc.c @@ -599,7 +599,7 @@ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file, add_wait_queue(&tty->read_wait, &wait); for (;;) { - if (test_bit(TTY_OTHER_DONE, &tty->flags)) { + if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) { ret = -EIO; break; } @@ -827,7 +827,7 @@ static unsigned int n_hdlc_tty_poll(struct tty_struct *tty, struct file *filp, /* set bits for operations that won't block */ if (n_hdlc->rx_buf_list.head) mask |= POLLIN | POLLRDNORM; /* readable */ - if (test_bit(TTY_OTHER_DONE, &tty->flags)) + if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) mask |= POLLHUP; if (tty_hung_up_p(filp)) mask |= POLLHUP; diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index fb76a7d80e7e..bdf0e6e89991 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1917,18 +1917,6 @@ static inline int input_available_p(struct tty_struct *tty, int poll) return ldata->commit_head - ldata->read_tail >= amt; } -static inline int check_other_done(struct tty_struct *tty) -{ - int done = test_bit(TTY_OTHER_DONE, &tty->flags); - if (done) { - /* paired with cmpxchg() in check_other_closed(); ensures - * read buffer head index is not stale - */ - smp_mb__after_atomic(); - } - return done; -} - /** * copy_from_read_buf - copy read data directly * @tty: terminal device @@ -2124,7 +2112,7 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, struct n_tty_data *ldata = tty->disc_data; unsigned char __user *b = buf; DEFINE_WAIT_FUNC(wait, woken_wake_function); - int c, done; + int c; int minimum, time; ssize_t retval = 0; long timeout; @@ -2183,32 +2171,35 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, break; } - done = check_other_done(tty); - if (!input_available_p(tty, 0)) { - if (done) { - retval = -EIO; - break; - } - if (tty_hung_up_p(file)) - break; - if (!timeout) - break; - if (file->f_flags & O_NONBLOCK) { - retval = -EAGAIN; - break; - } - if (signal_pending(current)) { - retval = -ERESTARTSYS; - break; - } up_read(&tty->termios_rwsem); + tty_buffer_flush_work(tty->port); + down_read(&tty->termios_rwsem); + if (!input_available_p(tty, 0)) { + if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) { + retval = -EIO; + break; + } + if (tty_hung_up_p(file)) + break; + if (!timeout) + break; + if (file->f_flags & O_NONBLOCK) { + retval = -EAGAIN; + break; + } + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + up_read(&tty->termios_rwsem); - timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, - timeout); + timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, + timeout); - down_read(&tty->termios_rwsem); - continue; + down_read(&tty->termios_rwsem); + continue; + } } if (ldata->icanon && !L_EXTPROC(tty)) { @@ -2386,12 +2377,17 @@ static unsigned int n_tty_poll(struct tty_struct *tty, struct file *file, poll_wait(file, &tty->read_wait, wait); poll_wait(file, &tty->write_wait, wait); - if (check_other_done(tty)) - mask |= POLLHUP; if (input_available_p(tty, 1)) mask |= POLLIN | POLLRDNORM; + else { + tty_buffer_flush_work(tty->port); + if (input_available_p(tty, 1)) + mask |= POLLIN | POLLRDNORM; + } if (tty->packet && tty->link->ctrl_status) mask |= POLLPRI | POLLIN | POLLRDNORM; + if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) + mask |= POLLHUP; if (tty_hung_up_p(file)) mask |= POLLHUP; if (tty->ops->write && !tty_is_writelocked(tty) && diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 
a8a292fd564f..ee0e84798399 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -59,7 +59,7 @@ static void pty_close(struct tty_struct *tty, struct file *filp) if (!tty->link) return; set_bit(TTY_OTHER_CLOSED, &tty->link->flags); - tty_flip_buffer_push(tty->link->port); + wake_up_interruptible(&tty->link->read_wait); wake_up_interruptible(&tty->link->write_wait); if (tty->driver->subtype == PTY_TYPE_MASTER) { set_bit(TTY_OTHER_CLOSED, &tty->flags); @@ -247,9 +247,7 @@ static int pty_open(struct tty_struct *tty, struct file *filp) goto out; clear_bit(TTY_IO_ERROR, &tty->flags); - /* TTY_OTHER_CLOSED must be cleared before TTY_OTHER_DONE */ clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); - clear_bit(TTY_OTHER_DONE, &tty->link->flags); set_bit(TTY_THROTTLED, &tty->flags); return 0; diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index a946e49a2626..aa80dc94ddc2 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -37,29 +37,6 @@ #define TTY_BUFFER_PAGE (((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~0xFF) -/* - * If all tty flip buffers have been processed by flush_to_ldisc() or - * dropped by tty_buffer_flush(), check if the linked pty has been closed. - * If so, wake the reader/poll to process - */ -static inline void check_other_closed(struct tty_struct *tty) -{ - unsigned long flags, old; - - /* transition from TTY_OTHER_CLOSED => TTY_OTHER_DONE must be atomic */ - for (flags = ACCESS_ONCE(tty->flags); - test_bit(TTY_OTHER_CLOSED, &flags); - ) { - old = flags; - __set_bit(TTY_OTHER_DONE, &flags); - flags = cmpxchg(&tty->flags, old, flags); - if (old == flags) { - wake_up_interruptible(&tty->read_wait); - break; - } - } -} - /** * tty_buffer_lock_exclusive - gain exclusive access to buffer * tty_buffer_unlock_exclusive - release exclusive access @@ -254,8 +231,6 @@ void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld) if (ld && ld->ops->flush_buffer) ld->ops->flush_buffer(tty); - check_other_closed(tty); - atomic_dec(&buf->priority); mutex_unlock(&buf->lock); } @@ -522,10 +497,8 @@ static void flush_to_ldisc(struct work_struct *work) */ count = smp_load_acquire(&head->commit) - head->read; if (!count) { - if (next == NULL) { - check_other_closed(tty); + if (next == NULL) break; - } buf->head = next; tty_buffer_free(port, head); continue; @@ -614,3 +587,8 @@ bool tty_buffer_cancel_work(struct tty_port *port) { return cancel_work_sync(&port->buf.work); } + +void tty_buffer_flush_work(struct tty_port *port) +{ + flush_work(&port->buf.work); +} diff --git a/include/linux/tty.h b/include/linux/tty.h index bf1bcdb01df0..d82bb9281d12 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -351,7 +351,6 @@ struct tty_file_private { #define TTY_OTHER_CLOSED 2 /* Other side (if any) has closed */ #define TTY_EXCLUSIVE 3 /* Exclusive open mode */ #define TTY_DO_WRITE_WAKEUP 5 /* Call write_wakeup after queuing new */ -#define TTY_OTHER_DONE 6 /* Closed pty has completed input processing */ #define TTY_LDISC_OPEN 11 /* Line discipline is open */ #define TTY_PTY_LOCK 16 /* pty private */ #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ @@ -480,6 +479,7 @@ extern void tty_buffer_init(struct tty_port *port); extern void tty_buffer_set_lock_subclass(struct tty_port *port); extern bool tty_buffer_restart_work(struct tty_port *port); extern bool tty_buffer_cancel_work(struct tty_port *port); +extern void tty_buffer_flush_work(struct tty_port *port); extern speed_t tty_termios_baud_rate(struct ktermios *termios); extern 
speed_t tty_termios_input_baud_rate(struct ktermios *termios); extern void tty_termios_encode_baud_rate(struct ktermios *termios, -- cgit v1.2.3 From 795ddd18d38f9762fbfefceab9aa16caef0cf431 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Sun, 24 Apr 2016 20:28:05 +0100 Subject: nvmem: core: remove regmap dependency nvmem uses regmap_raw_read/write apis to read/write data from providers, regmap raw apis stopped working with recent kernels which removed raw accessors on mmio bus. This resulted in broken nvmem for providers which are based on regmap mmio bus. This issue can be fixed temporarily by moving to other regmap apis, but we might hit same issue in future. Moving to interfaces based on read/write callbacks from providers would be more robust. This patch removes regmap dependency from nvmem and introduces read/write callbacks from the providers. Without this patch nvmem providers like qfprom based on regmap mmio bus would not work. Reported-by: Rajendra Nayak Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/Kconfig | 1 - drivers/nvmem/core.c | 67 +++++++++++++++++++++++++----------------- include/linux/nvmem-provider.h | 10 +++++++ 3 files changed, 50 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvmem/Kconfig b/drivers/nvmem/Kconfig index 9c0c59d3b22b..15c58a5ff7ec 100644 --- a/drivers/nvmem/Kconfig +++ b/drivers/nvmem/Kconfig @@ -1,6 +1,5 @@ menuconfig NVMEM tristate "NVMEM Support" - select REGMAP help Support for NVMEM(Non Volatile Memory) devices like EEPROM, EFUSES... diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 0de3d878c439..bb4ea123547f 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -23,12 +23,10 @@ #include #include #include -#include <linux/regmap.h> #include struct nvmem_device { const char *name; - struct regmap *regmap; struct module *owner; struct device dev; int stride; @@ -41,6 +39,9 @@ struct nvmem_device { int flags; struct bin_attribute eeprom; struct device *base_dev; + nvmem_reg_read_t reg_read; + nvmem_reg_write_t reg_write; + void *priv; }; #define FLAG_COMPAT BIT(0) @@ -66,6 +67,23 @@ static struct lock_class_key eeprom_lock_key; #endif #define to_nvmem_device(d) container_of(d, struct nvmem_device, dev) +static int nvmem_reg_read(struct nvmem_device *nvmem, unsigned int offset, + void *val, size_t bytes) +{ + if (nvmem->reg_read) + return nvmem->reg_read(nvmem->priv, offset, val, bytes); + + return -EINVAL; +} + +static int nvmem_reg_write(struct nvmem_device *nvmem, unsigned int offset, + void *val, size_t bytes) +{ + if (nvmem->reg_write) + return nvmem->reg_write(nvmem->priv, offset, val, bytes); + + return -EINVAL; +} static ssize_t bin_attr_nvmem_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, @@ -93,7 +111,7 @@ static ssize_t bin_attr_nvmem_read(struct file *filp, struct kobject *kobj, count = round_down(count, nvmem->word_size); - rc = regmap_raw_read(nvmem->regmap, pos, buf, count); + rc = nvmem_reg_read(nvmem, pos, buf, count); if (IS_ERR_VALUE(rc)) return rc; @@ -127,7 +145,7 @@ static ssize_t bin_attr_nvmem_write(struct file *filp, struct kobject *kobj, count = round_down(count, nvmem->word_size); - rc = regmap_raw_write(nvmem->regmap, pos, buf, count); + rc = nvmem_reg_write(nvmem, pos, buf, count); if (IS_ERR_VALUE(rc)) return rc; @@ -421,18 +439,11 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) { struct nvmem_device *nvmem; struct device_node *np; - struct regmap *rm; int rval; if (!config->dev) 
return ERR_PTR(-EINVAL); - rm = dev_get_regmap(config->dev, NULL); - if (!rm) { - dev_err(config->dev, "Regmap not found\n"); - return ERR_PTR(-EINVAL); - } - nvmem = kzalloc(sizeof(*nvmem), GFP_KERNEL); if (!nvmem) return ERR_PTR(-ENOMEM); @@ -444,14 +455,16 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) } nvmem->id = rval; - nvmem->regmap = rm; nvmem->owner = config->owner; - nvmem->stride = regmap_get_reg_stride(rm); - nvmem->word_size = regmap_get_val_bytes(rm); - nvmem->size = regmap_get_max_register(rm) + nvmem->stride; + nvmem->stride = config->stride; + nvmem->word_size = config->word_size; + nvmem->size = config->size; nvmem->dev.type = &nvmem_provider_type; nvmem->dev.bus = &nvmem_bus_type; nvmem->dev.parent = config->dev; + nvmem->priv = config->priv; + nvmem->reg_read = config->reg_read; + nvmem->reg_write = config->reg_write; np = config->dev->of_node; nvmem->dev.of_node = np; dev_set_name(&nvmem->dev, "%s%d", @@ -948,7 +961,7 @@ static int __nvmem_cell_read(struct nvmem_device *nvmem, { int rc; - rc = regmap_raw_read(nvmem->regmap, cell->offset, buf, cell->bytes); + rc = nvmem_reg_read(nvmem, cell->offset, buf, cell->bytes); if (IS_ERR_VALUE(rc)) return rc; @@ -977,7 +990,7 @@ void *nvmem_cell_read(struct nvmem_cell *cell, size_t *len) u8 *buf; int rc; - if (!nvmem || !nvmem->regmap) + if (!nvmem) return ERR_PTR(-EINVAL); buf = kzalloc(cell->bytes, GFP_KERNEL); @@ -1014,7 +1027,7 @@ static inline void *nvmem_cell_prepare_write_buffer(struct nvmem_cell *cell, *b <<= bit_offset; /* setup the first byte with lsb bits from nvmem */ - rc = regmap_raw_read(nvmem->regmap, cell->offset, &v, 1); + rc = nvmem_reg_read(nvmem, cell->offset, &v, 1); *b++ |= GENMASK(bit_offset - 1, 0) & v; /* setup rest of the byte if any */ @@ -1031,7 +1044,7 @@ static inline void *nvmem_cell_prepare_write_buffer(struct nvmem_cell *cell, /* if it's not end on byte boundary */ if ((nbits + bit_offset) % BITS_PER_BYTE) { /* setup the last byte with msb bits from nvmem */ - rc = regmap_raw_read(nvmem->regmap, + rc = nvmem_reg_read(nvmem, cell->offset + cell->bytes - 1, &v, 1); *p |= GENMASK(7, (nbits + bit_offset) % BITS_PER_BYTE) & v; @@ -1054,7 +1067,7 @@ int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len) struct nvmem_device *nvmem = cell->nvmem; int rc; - if (!nvmem || !nvmem->regmap || nvmem->read_only || + if (!nvmem || nvmem->read_only || (cell->bit_offset == 0 && len != cell->bytes)) return -EINVAL; @@ -1064,7 +1077,7 @@ int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len) return PTR_ERR(buf); } - rc = regmap_raw_write(nvmem->regmap, cell->offset, buf, cell->bytes); + rc = nvmem_reg_write(nvmem, cell->offset, buf, cell->bytes); /* free the tmp buffer */ if (cell->bit_offset || cell->nbits) @@ -1094,7 +1107,7 @@ ssize_t nvmem_device_cell_read(struct nvmem_device *nvmem, int rc; ssize_t len; - if (!nvmem || !nvmem->regmap) + if (!nvmem) return -EINVAL; rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell); @@ -1124,7 +1137,7 @@ int nvmem_device_cell_write(struct nvmem_device *nvmem, struct nvmem_cell cell; int rc; - if (!nvmem || !nvmem->regmap) + if (!nvmem) return -EINVAL; rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell); @@ -1152,10 +1165,10 @@ int nvmem_device_read(struct nvmem_device *nvmem, { int rc; - if (!nvmem || !nvmem->regmap) + if (!nvmem) return -EINVAL; - rc = regmap_raw_read(nvmem->regmap, offset, buf, bytes); + rc = nvmem_reg_read(nvmem, offset, buf, bytes); if (IS_ERR_VALUE(rc)) return rc; @@ -1180,10 +1193,10 @@ int 
nvmem_device_write(struct nvmem_device *nvmem, { int rc; - if (!nvmem || !nvmem->regmap) + if (!nvmem) return -EINVAL; - rc = regmap_raw_write(nvmem->regmap, offset, buf, bytes); + rc = nvmem_reg_write(nvmem, offset, buf, bytes); if (IS_ERR_VALUE(rc)) return rc; diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index a4fcc90b0f20..cd93416d762e 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -14,6 +14,10 @@ struct nvmem_device; struct nvmem_cell_info; +typedef int (*nvmem_reg_read_t)(void *priv, unsigned int offset, + void *val, size_t bytes); +typedef int (*nvmem_reg_write_t)(void *priv, unsigned int offset, + void *val, size_t bytes); struct nvmem_config { struct device *dev; @@ -24,6 +28,12 @@ struct nvmem_config { int ncells; bool read_only; bool root_only; + nvmem_reg_read_t reg_read; + nvmem_reg_write_t reg_write; + int size; + int word_size; + int stride; + void *priv; /* To be only used by old driver/misc/eeprom drivers */ bool compat; struct device *base_dev; -- cgit v1.2.3 From 1f62ff34a90471d1b735bac2c79e894afc7c59bc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 24 Mar 2016 22:19:40 +0100 Subject: driver-core: use 'dev' argument in dev_dbg_ratelimited stub dev_dbg_ratelimited() is a macro that ignores its first argument when DEBUG is not set, which can lead to unused variable warnings: ethernet/mellanox/mlxsw/pci.c: In function 'mlxsw_pci_cqe_sdq_handle': ethernet/mellanox/mlxsw/pci.c:646:18: warning: unused variable 'pdev' [-Wunused-variable] ethernet/mellanox/mlxsw/pci.c: In function 'mlxsw_pci_cqe_rdq_handle': ethernet/mellanox/mlxsw/pci.c:671:18: warning: unused variable 'pdev' [-Wunused-variable] The macro already ensures that all its other arguments are silently ignored by the compiler without triggering a warning, through the use of the no_printk() macro, but the dev argument is not passed into that. This changes the definition to use the same trick as no_printk() with an if(0) that leads the compiler to not evaluate the side-effects but still see that 'dev' might not be unused. Signed-off-by: Arnd Bergmann Suggested-by: Andrew Lunn Fixes: 6f586e663e3b ("driver-core: Shut up dev_dbg_reatelimited() without DEBUG") Reviewed-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 002c59728dbe..07f74c246cac 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1293,8 +1293,11 @@ do { \ dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ } while (0) #else -#define dev_dbg_ratelimited(dev, fmt, ...) \ - no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) +#define dev_dbg_ratelimited(dev, fmt, ...) \ +do { \ + if (0) \ + dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ +} while (0) #endif #ifdef VERBOSE_DEBUG -- cgit v1.2.3 From 8ac0fba2da41620f4931a1007c71b0d4723eb02a Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Sun, 1 May 2016 17:50:29 -0400 Subject: isa: Decouple X86_32 dependency from the ISA Kconfig option The introduction of the ISA_BUS option blocks the compilation of ISA drivers on non-x86 platforms. The ISA_BUS configuration option should not be necessary if the X86_32 dependency can be decoupled from the ISA configuration option. This patch both removes the ISA_BUS configuration option entirely and removes the X86_32 dependency from the ISA configuration option. 
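To make the effect concrete, a minimal sketch of the kind of driver that benefits (hypothetical names throughout; the callback fields are assumed from struct isa_driver):

	/* hypothetical PC/104 card driver, now buildable on 64-bit x86 too */
	static struct isa_driver foo_isa_driver = {
		.probe	= foo_probe,	/* assumed driver callbacks */
		.remove	= foo_remove,
		.driver	= {
			.name	= "foo",
		},
	};

	static int __init foo_init(void)
	{
		return isa_register_driver(&foo_isa_driver, FOO_MAX_DEV);
	}

	static void __exit foo_exit(void)
	{
		isa_unregister_driver(&foo_isa_driver);
	}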
Acked-by: Ingo Molnar Signed-off-by: William Breathitt Gray Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 10 ++-------- drivers/base/Makefile | 2 +- include/linux/isa.h | 2 +- 3 files changed, 4 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a5977986f38b..280e5ebae2f5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2472,16 +2472,8 @@ config ISA_DMA_API Enables ISA-style DMA support for devices requiring such controllers. If unsure, say Y. -config ISA_BUS - bool "ISA bus support" - help - Enables ISA bus support for devices requiring such controllers. - -if X86_32 - config ISA bool "ISA support" - depends on ISA_BUS ---help--- Find out whether you have ISA slots on your motherboard. ISA is the name of a bus system, i.e. the way the CPU talks to the other stuff @@ -2489,6 +2481,8 @@ config ISA (MCA) or VESA. ISA is an older system, now being displaced by PCI; newer boards don't support it. If you have ISA, say Y, otherwise N. +if X86_32 + config EISA bool "EISA support" depends on ISA diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 4ebfb81cc7e9..6b2a84e7f2be 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_DMA_CMA) += dma-contiguous.o obj-y += power/ obj-$(CONFIG_HAS_DMA) += dma-mapping.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o -obj-$(CONFIG_ISA_BUS) += isa.o +obj-$(CONFIG_ISA) += isa.o obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o diff --git a/include/linux/isa.h b/include/linux/isa.h index 2a02862775eb..b0270e3814c8 100644 --- a/include/linux/isa.h +++ b/include/linux/isa.h @@ -22,7 +22,7 @@ struct isa_driver { #define to_isa_driver(x) container_of((x), struct isa_driver, driver) -#ifdef CONFIG_ISA_BUS +#ifdef CONFIG_ISA int isa_register_driver(struct isa_driver *, unsigned int); void isa_unregister_driver(struct isa_driver *); #else -- cgit v1.2.3 From 2e65060e803e046fc9b5ed0107494a452424845e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 27 Apr 2016 14:15:38 +0300 Subject: dmaengine: dw: revisit data_width property Several changes are done here: - Convert the property to be in bytes Besides that this is a common practice for such property, the use of a value in bytes is much more convenient than handling the encoded one. - Rename data_width to data-width in the device tree bindings The change leaves the support for the old format as well just in case someone will use a newer kernel with an old device tree blob. - While here, replace dwc_fast_ffs() by __ffs() Signed-off-by: Andy Shevchenko Acked-by: Viresh Kumar Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/snps-dma.txt | 7 +++- arch/arc/boot/dts/abilis_tb10x.dtsi | 2 +- arch/arm/boot/dts/spear13xx.dtsi | 4 +-- drivers/dma/dw/core.c | 42 ++++++---------------- drivers/dma/dw/platform.c | 5 ++- include/linux/platform_data/dma-dw.h | 2 +- 6 files changed, 24 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/dma/snps-dma.txt b/Documentation/devicetree/bindings/dma/snps-dma.txt index c99c1ffac199..0f5583293c9c 100644 --- a/Documentation/devicetree/bindings/dma/snps-dma.txt +++ b/Documentation/devicetree/bindings/dma/snps-dma.txt @@ -13,6 +13,11 @@ Required properties: - chan_priority: priority of channels. 
0 (default): increase from chan 0->n, 1: increase from chan n->0 - block_size: Maximum block size supported by the controller +- data-width: Maximum data width supported by hardware per AHB master + (in bytes, power of 2) + + +Deprecated properties: - data_width: Maximum data width supported by hardware per AHB master (0 - 8bits, 1 - 16bits, ..., 5 - 256bits) @@ -38,7 +43,7 @@ Example: chan_allocation_order = <1>; chan_priority = <1>; block_size = <0xfff>; - data_width = <3 3>; + data-width = <8 8>; }; DMA clients connected to the Designware DMA controller must use the format diff --git a/arch/arc/boot/dts/abilis_tb10x.dtsi b/arch/arc/boot/dts/abilis_tb10x.dtsi index cfb5052239a1..2f53bedb0cde 100644 --- a/arch/arc/boot/dts/abilis_tb10x.dtsi +++ b/arch/arc/boot/dts/abilis_tb10x.dtsi @@ -112,7 +112,7 @@ chan_allocation_order = <0>; chan_priority = <1>; block_size = <0x7ff>; - data_width = <2>; + data-width = <4>; clocks = <&ahb_clk>; clock-names = "hclk"; }; diff --git a/arch/arm/boot/dts/spear13xx.dtsi b/arch/arm/boot/dts/spear13xx.dtsi index 14594ce8c18a..449acf0d8272 100644 --- a/arch/arm/boot/dts/spear13xx.dtsi +++ b/arch/arm/boot/dts/spear13xx.dtsi @@ -117,7 +117,7 @@ chan_priority = <1>; block_size = <0xfff>; dma-masters = <2>; - data_width = <3 3>; + data-width = <8 8>; }; dma@eb000000 { @@ -133,7 +133,7 @@ chan_allocation_order = <1>; chan_priority = <1>; block_size = <0xfff>; - data_width = <3 3>; + data-width = <8 8>; }; fsmc: flash@b0000000 { diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index 78522dcf3a7d..992da255b8e6 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -162,21 +162,6 @@ static void dwc_initialize(struct dw_dma_chan *dwc) /*----------------------------------------------------------------------*/ -static inline unsigned int dwc_fast_ffs(unsigned long long v) -{ - /* - * We can be a lot more clever here, but this should take care - * of the most common optimization. 
- */ - if (!(v & 7)) - return 3; - else if (!(v & 3)) - return 2; - else if (!(v & 1)) - return 1; - return 0; -} - static inline void dwc_dump_chan_regs(struct dw_dma_chan *dwc) { dev_err(chan2dev(&dwc->chan), @@ -677,11 +662,12 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, struct dw_desc *prev; size_t xfer_count; size_t offset; + u8 m_master = dwc->m_master; unsigned int src_width; unsigned int dst_width; - unsigned int data_width; + unsigned int data_width = dw->data_width[m_master]; u32 ctllo; - u8 lms = DWC_LLP_LMS(dwc->m_master); + u8 lms = DWC_LLP_LMS(m_master); dev_vdbg(chan2dev(chan), "%s: d%pad s%pad l0x%zx f0x%lx\n", __func__, @@ -694,10 +680,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, dwc->direction = DMA_MEM_TO_MEM; - data_width = dw->data_width[dwc->m_master]; - - src_width = dst_width = min_t(unsigned int, data_width, - dwc_fast_ffs(src | dest | len)); + src_width = dst_width = __ffs(data_width | src | dest | len); ctllo = DWC_DEFAULT_CTLLO(chan) | DWC_CTLL_DST_WIDTH(dst_width) @@ -757,11 +740,12 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, struct dw_desc *prev; struct dw_desc *first; u32 ctllo; - u8 lms = DWC_LLP_LMS(dwc->m_master); + u8 m_master = dwc->m_master; + u8 lms = DWC_LLP_LMS(m_master); dma_addr_t reg; unsigned int reg_width; unsigned int mem_width; - unsigned int data_width; + unsigned int data_width = dw->data_width[m_master]; unsigned int i; struct scatterlist *sg; size_t total_len = 0; @@ -787,8 +771,6 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_M2P) : DWC_CTLL_FC(DW_DMA_FC_D_M2P); - data_width = dw->data_width[dwc->m_master]; - for_each_sg(sgl, sg, sg_len, i) { struct dw_desc *desc; u32 len, dlen, mem; @@ -796,8 +778,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, mem = sg_dma_address(sg); len = sg_dma_len(sg); - mem_width = min_t(unsigned int, - data_width, dwc_fast_ffs(mem | len)); + mem_width = __ffs(data_width | mem | len); slave_sg_todev_fill_desc: desc = dwc_desc_get(dwc); @@ -843,8 +824,6 @@ slave_sg_todev_fill_desc: ctllo |= sconfig->device_fc ? 
DWC_CTLL_FC(DW_DMA_FC_P_P2M) : DWC_CTLL_FC(DW_DMA_FC_D_P2M); - data_width = dw->data_width[dwc->m_master]; - for_each_sg(sgl, sg, sg_len, i) { struct dw_desc *desc; u32 len, dlen, mem; @@ -852,8 +831,7 @@ slave_sg_todev_fill_desc: mem = sg_dma_address(sg); len = sg_dma_len(sg); - mem_width = min_t(unsigned int, - data_width, dwc_fast_ffs(mem | len)); + mem_width = __ffs(data_width | mem | len); slave_sg_fromdev_fill_desc: desc = dwc_desc_get(dwc); @@ -1500,7 +1478,7 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) pdata->nr_masters = (dw_params >> DW_PARAMS_NR_MASTER & 3) + 1; for (i = 0; i < pdata->nr_masters; i++) { pdata->data_width[i] = - (dw_params >> DW_PARAMS_DATA_WIDTH(i) & 3) + 2; + 4 << (dw_params >> DW_PARAMS_DATA_WIDTH(i) & 3); } max_blk_size = dma_readl(dw, MAX_BLK_SIZE); diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c index e65ebe5ab88f..2420fb7267bc 100644 --- a/drivers/dma/dw/platform.c +++ b/drivers/dma/dw/platform.c @@ -138,9 +138,12 @@ dw_dma_parse_dt(struct platform_device *pdev) if (!of_property_read_u32(np, "block_size", &tmp)) pdata->block_size = tmp; - if (!of_property_read_u32_array(np, "data_width", arr, nr_masters)) { + if (!of_property_read_u32_array(np, "data-width", arr, nr_masters)) { for (tmp = 0; tmp < nr_masters; tmp++) pdata->data_width[tmp] = arr[tmp]; + } else if (!of_property_read_u32_array(np, "data_width", arr, nr_masters)) { + for (tmp = 0; tmp < nr_masters; tmp++) + pdata->data_width[tmp] = BIT(arr[tmp] & 0x07); } return pdata; diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index b881b978e486..ad768111c350 100644 --- a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -43,7 +43,7 @@ struct dw_dma_slave { * @block_size: Maximum block size supported by the controller * @nr_masters: Number of AHB masters supported by the controller * @data_width: Maximum data width supported by hardware per AHB master - * (0 - 8bits, 1 - 16bits, ..., 5 - 256bits) + * (in bytes, power of 2) */ struct dw_dma_platform_data { unsigned int nr_channels; -- cgit v1.2.3 From 161c3d04aeca8a5bfffe3902786bdf0ccd8575c0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 27 Apr 2016 14:15:39 +0300 Subject: dmaengine: dw: keep entire platform data in struct dw_dma Keep the entire platform data in the struct dw_dma. It makes the driver a bit cleaner. 
Acked-by: Viresh Kumar Signed-off-by: Andy Shevchenko Signed-off-by: Vinod Koul --- drivers/dma/dw/core.c | 30 +++++++++++++++--------------- drivers/dma/dw/platform.c | 4 ++-- drivers/dma/dw/regs.h | 5 ++--- include/linux/platform_data/dma-dw.h | 2 +- 4 files changed, 20 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index 992da255b8e6..30843a17b438 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -665,7 +665,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, u8 m_master = dwc->m_master; unsigned int src_width; unsigned int dst_width; - unsigned int data_width = dw->data_width[m_master]; + unsigned int data_width = dw->pdata->data_width[m_master]; u32 ctllo; u8 lms = DWC_LLP_LMS(m_master); @@ -745,7 +745,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, dma_addr_t reg; unsigned int reg_width; unsigned int mem_width; - unsigned int data_width = dw->data_width[m_master]; + unsigned int data_width = dw->pdata->data_width[m_master]; unsigned int i; struct scatterlist *sg; size_t total_len = 0; @@ -1444,7 +1444,6 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) struct dw_dma *dw; bool autocfg = false; unsigned int dw_params; - unsigned int max_blk_size = 0; unsigned int i; int err; @@ -1452,6 +1451,10 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) if (!dw) return -ENOMEM; + dw->pdata = devm_kzalloc(chip->dev, sizeof(*dw->pdata), GFP_KERNEL); + if (!dw->pdata) + return -ENOMEM; + dw->regs = chip->regs; chip->dw = dw; @@ -1467,11 +1470,8 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) goto err_pdata; } - pdata = devm_kzalloc(chip->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) { - err = -ENOMEM; - goto err_pdata; - } + /* Reassign the platform data pointer */ + pdata = dw->pdata; /* Get hardware configuration parameters */ pdata->nr_channels = (dw_params >> DW_PARAMS_NR_CHAN & 7) + 1; @@ -1480,7 +1480,7 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) pdata->data_width[i] = 4 << (dw_params >> DW_PARAMS_DATA_WIDTH(i) & 3); } - max_blk_size = dma_readl(dw, MAX_BLK_SIZE); + pdata->block_size = dma_readl(dw, MAX_BLK_SIZE); /* Fill platform data with the default values */ pdata->is_private = true; @@ -1490,6 +1490,11 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) } else if (pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) { err = -EINVAL; goto err_pdata; + } else { + memcpy(dw->pdata, pdata, sizeof(*dw->pdata)); + + /* Reassign the platform data pointer */ + pdata = dw->pdata; } dw->chan = devm_kcalloc(chip->dev, pdata->nr_channels, sizeof(*dw->chan), @@ -1499,11 +1504,6 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) goto err_pdata; } - /* Get hardware configuration parameters */ - dw->nr_masters = pdata->nr_masters; - for (i = 0; i < dw->nr_masters; i++) - dw->data_width[i] = pdata->data_width[i]; - /* Calculate all channel mask before DMA setup */ dw->all_chan_mask = (1 << pdata->nr_channels) - 1; @@ -1570,7 +1570,7 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) * up to 0x0a for 4095. 
*/ dwc->block_size = - (4 << ((max_blk_size >> 4 * i) & 0xf)) - 1; + (4 << ((pdata->block_size >> 4 * i) & 0xf)) - 1; dwc->nollp = (dwc_params >> DWC_PARAMS_MBLK_EN & 0x1) == 0; } else { diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c index 2420fb7267bc..0a49011633b9 100644 --- a/drivers/dma/dw/platform.c +++ b/drivers/dma/dw/platform.c @@ -47,8 +47,8 @@ static struct dma_chan *dw_dma_of_xlate(struct of_phandle_args *dma_spec, if (WARN_ON(slave.src_id >= DW_DMA_MAX_NR_REQUESTS || slave.dst_id >= DW_DMA_MAX_NR_REQUESTS || - slave.m_master >= dw->nr_masters || - slave.p_master >= dw->nr_masters)) + slave.m_master >= dw->pdata->nr_masters || + slave.p_master >= dw->pdata->nr_masters)) return NULL; dma_cap_zero(cap); diff --git a/drivers/dma/dw/regs.h b/drivers/dma/dw/regs.h index 0ab02eb23bfc..4b7bd7834046 100644 --- a/drivers/dma/dw/regs.h +++ b/drivers/dma/dw/regs.h @@ -281,9 +281,8 @@ struct dw_dma { u8 all_chan_mask; u8 in_use; - /* hardware configuration */ - unsigned char nr_masters; - unsigned char data_width[DW_DMA_MAX_NR_MASTERS]; + /* platform data */ + struct dw_dma_platform_data *pdata; }; static inline struct dw_dma_regs __iomem *__dw_regs(struct dw_dma *dw) diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index ad768111c350..d15d8ba8cc24 100644 --- a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -55,7 +55,7 @@ struct dw_dma_platform_data { #define CHAN_PRIORITY_ASCENDING 0 /* chan0 highest */ #define CHAN_PRIORITY_DESCENDING 1 /* chan7 highest */ unsigned char chan_priority; - unsigned short block_size; + unsigned int block_size; unsigned char nr_masters; unsigned char data_width[DW_DMA_MAX_NR_MASTERS]; }; -- cgit v1.2.3 From 3a14c66d43d018baed96ceb74f9ab548878c09b8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 27 Apr 2016 14:15:40 +0300 Subject: dmaengine: dw: pass platform data via struct dw_dma_chip We pass struct dw_dma_chip to dw_dma_probe() anyway, thus we may use it to pass a platform data as well. While here, constify the source of the platform data. 
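In practice a glue driver now fills in the chip structure once and hands it over; a rough sketch (hypothetical probe fragment, error handling trimmed):

	struct dw_dma_chip *chip;

	chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL);
	if (!chip)
		return -ENOMEM;

	chip->dev   = dev;
	chip->regs  = regs;	/* ioremapped controller registers */
	chip->irq   = irq;
	chip->pdata = pdata;	/* may be NULL: parameters are then read
				 * back from the DW_PARAMS register */

	return dw_dma_probe(chip);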
Acked-by: Viresh Kumar Signed-off-by: Andy Shevchenko Signed-off-by: Vinod Koul --- drivers/ata/sata_dwc_460ex.c | 2 +- drivers/dma/dw/core.c | 9 +++++---- drivers/dma/dw/pci.c | 5 +++-- drivers/dma/dw/platform.c | 5 +++-- include/linux/dma/dw.h | 5 ++++- sound/soc/intel/common/sst-firmware.c | 2 +- 6 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c index 80bdcabc293f..2cb6f7e04b5c 100644 --- a/drivers/ata/sata_dwc_460ex.c +++ b/drivers/ata/sata_dwc_460ex.c @@ -1248,7 +1248,7 @@ static int sata_dwc_probe(struct platform_device *ofdev) hsdev->dma->dev = &ofdev->dev; /* Initialize AHB DMAC */ - err = dw_dma_probe(hsdev->dma, NULL); + err = dw_dma_probe(hsdev->dma); if (err) goto error_dma_iomap; diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index 30843a17b438..edf053f73a49 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -1439,8 +1439,9 @@ EXPORT_SYMBOL(dw_dma_cyclic_free); /*----------------------------------------------------------------------*/ -int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) +int dw_dma_probe(struct dw_dma_chip *chip) { + struct dw_dma_platform_data *pdata; struct dw_dma *dw; bool autocfg = false; unsigned int dw_params; @@ -1460,7 +1461,7 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) pm_runtime_get_sync(chip->dev); - if (!pdata) { + if (!chip->pdata) { dw_params = dma_readl(dw, DW_PARAMS); dev_dbg(chip->dev, "DW_PARAMS: 0x%08x\n", dw_params); @@ -1487,11 +1488,11 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) pdata->is_memcpy = true; pdata->chan_allocation_order = CHAN_ALLOCATION_ASCENDING; pdata->chan_priority = CHAN_PRIORITY_ASCENDING; - } else if (pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) { + } else if (chip->pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) { err = -EINVAL; goto err_pdata; } else { - memcpy(dw->pdata, pdata, sizeof(*dw->pdata)); + memcpy(dw->pdata, chip->pdata, sizeof(*dw->pdata)); /* Reassign the platform data pointer */ pdata = dw->pdata; diff --git a/drivers/dma/dw/pci.c b/drivers/dma/dw/pci.c index 358f9689a3f5..0ae6c3b1d34e 100644 --- a/drivers/dma/dw/pci.c +++ b/drivers/dma/dw/pci.c @@ -17,8 +17,8 @@ static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid) { + const struct dw_dma_platform_data *pdata = (void *)pid->driver_data; struct dw_dma_chip *chip; - struct dw_dma_platform_data *pdata = (void *)pid->driver_data; int ret; ret = pcim_enable_device(pdev); @@ -49,8 +49,9 @@ static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid) chip->dev = &pdev->dev; chip->regs = pcim_iomap_table(pdev)[0]; chip->irq = pdev->irq; + chip->pdata = pdata; - ret = dw_dma_probe(chip, pdata); + ret = dw_dma_probe(chip); if (ret) return ret; diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c index 0a49011633b9..5bda0eb9f393 100644 --- a/drivers/dma/dw/platform.c +++ b/drivers/dma/dw/platform.c @@ -161,7 +161,7 @@ static int dw_probe(struct platform_device *pdev) struct dw_dma_chip *chip; struct device *dev = &pdev->dev; struct resource *mem; - struct dw_dma_platform_data *pdata; + const struct dw_dma_platform_data *pdata; int err; chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); @@ -186,6 +186,7 @@ static int dw_probe(struct platform_device *pdev) pdata = dw_dma_parse_dt(pdev); chip->dev = dev; + chip->pdata = pdata; chip->clk = devm_clk_get(chip->dev, "hclk"); if 
(IS_ERR(chip->clk)) @@ -196,7 +197,7 @@ static int dw_probe(struct platform_device *pdev) pm_runtime_enable(&pdev->dev); - err = dw_dma_probe(chip, pdata); + err = dw_dma_probe(chip); if (err) goto err_dw_dma_probe; diff --git a/include/linux/dma/dw.h b/include/linux/dma/dw.h index 71456442ebe3..f2e538aaddad 100644 --- a/include/linux/dma/dw.h +++ b/include/linux/dma/dw.h @@ -27,6 +27,7 @@ struct dw_dma; * @regs: memory mapped I/O space * @clk: hclk clock * @dw: struct dw_dma that is filed by dw_dma_probe() + * @pdata: pointer to platform data */ struct dw_dma_chip { struct device *dev; @@ -34,10 +35,12 @@ struct dw_dma_chip { void __iomem *regs; struct clk *clk; struct dw_dma *dw; + + const struct dw_dma_platform_data *pdata; }; /* Export to the platform drivers */ -int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata); +int dw_dma_probe(struct dw_dma_chip *chip); int dw_dma_remove(struct dw_dma_chip *chip); /* DMA API extensions */ diff --git a/sound/soc/intel/common/sst-firmware.c b/sound/soc/intel/common/sst-firmware.c index ef4881e7753a..25993527370b 100644 --- a/sound/soc/intel/common/sst-firmware.c +++ b/sound/soc/intel/common/sst-firmware.c @@ -203,7 +203,7 @@ static struct dw_dma_chip *dw_probe(struct device *dev, struct resource *mem, chip->dev = dev; - err = dw_dma_probe(chip, NULL); + err = dw_dma_probe(chip); if (err) return ERR_PTR(err); -- cgit v1.2.3 From 011d6f5c3e5f38a767c8f4c7e2de73dc91959cb0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 2 May 2016 12:59:19 +0200 Subject: of: include errno.h in of_graph.h When CONFIG_OF is disabled, we have to include linux/errno.h before including of_graph.h, or get build errors like in the newly added sun4i drm driver: In file included from ../drivers/gpu/drm/sun4i/sun4i_drv.c:14:0: include/linux/of_graph.h: In function 'of_graph_parse_endpoint': include/linux/of_graph.h:58:10: error: 'ENOSYS' undeclared (first use in this function) A better solution is to ensure that the header can be included by itself, so let's include linux/errno.h here to fix the error we just got, and any similar future error. Signed-off-by: Arnd Bergmann Fixes: 9026e0d122ac ("drm: Add Allwinner A10 Display Engine support") Signed-off-by: Rob Herring --- include/linux/of_graph.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h index f8bcd0e21a26..bb3a5a2cd570 100644 --- a/include/linux/of_graph.h +++ b/include/linux/of_graph.h @@ -15,6 +15,7 @@ #define __LINUX_OF_GRAPH_H #include +#include /** * struct of_endpoint - the OF graph endpoint data structure -- cgit v1.2.3 From ba0263340a2aeaf5f08e4f2b0f4c29e300828b06 Mon Sep 17 00:00:00 2001 From: Noralf Trønnes Date: Thu, 28 Apr 2016 17:18:34 +0200 Subject: fbdev: fb_defio: Export fb_deferred_io_mmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export fb_deferred_io_mmap so drivers can change vma->vm_page_prot. When the framebuffer memory is allocated using dma_alloc_writecombine() instead of vmalloc(), I get cache syncing problems on ARM. This solves it: static int drm_fbdev_cma_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) { fb_deferred_io_mmap(info, vma); vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); return 0; } Could this have been done in the core? Drivers that don't set (struct fb_ops *)->fb_mmap, gets a call to fb_pgprotect() at the end of the default fb_mmap implementation (drivers/video/fbdev/core/fbmem.c). 
This is an architecture specific function that on many platforms uses pgprot_writecombine(), but not on all. And looking at some of the fb_mmap implementations, some of them set vm_page_prot to nocache for instance, so I think the safest bet is to do this in the driver and not in the fbdev core. And we can't call fb_pgprotect() from fb_deferred_io_mmap() either because we don't have access to the file pointer that powerpc needs. Signed-off-by: Noralf Trønnes Acked-by: Tomi Valkeinen Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1461856717-6476-5-git-send-email-noralf@tronnes.org --- drivers/video/fbdev/core/fb_defio.c | 3 ++- include/linux/fb.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 57721c73177f..74b5bcac8bf2 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -164,7 +164,7 @@ static const struct address_space_operations fb_deferred_io_aops = { .set_page_dirty = fb_deferred_io_set_page_dirty, }; -static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) +int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) { vma->vm_ops = &fb_deferred_io_vm_ops; vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; @@ -173,6 +173,7 @@ static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) vma->vm_private_data = info; return 0; } +EXPORT_SYMBOL(fb_deferred_io_mmap); /* workqueue callback */ static void fb_deferred_io_work(struct work_struct *work) diff --git a/include/linux/fb.h b/include/linux/fb.h index dfe88351341f..a964d076b4dc 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -673,6 +673,7 @@ static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, } /* drivers/video/fb_defio.c */ +int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma); extern void fb_deferred_io_init(struct fb_info *info); extern void fb_deferred_io_open(struct fb_info *info, struct inode *inode, -- cgit v1.2.3 From 339e6e31d2bfb40354cbfe672b357b88a88223f2 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Sun, 1 May 2016 18:42:47 -0400 Subject: isa: Implement the module_isa_driver macro The module_isa_driver macro is a helper macro for ISA drivers which do not do anything special in module init/exit. This eliminates a lot of boilerplate code. Each module may only use this macro once, and calling it replaces module_init and module_exit. Signed-off-by: William Breathitt Gray Signed-off-by: Greg Kroah-Hartman --- include/linux/isa.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/isa.h b/include/linux/isa.h index b0270e3814c8..e394917d18c2 100644 --- a/include/linux/isa.h +++ b/include/linux/isa.h @@ -36,4 +36,25 @@ static inline void isa_unregister_driver(struct isa_driver *d) } #endif +/** + * module_isa_driver() - Helper macro for registering a ISA driver + * @__isa_driver: isa_driver struct + * @__num_isa_dev: number of devices to register + * + * Helper macro for ISA drivers which do not do anything special in module + * init/exit. This eliminates a lot of boilerplate code. Each module may only + * use this macro once, and calling it replaces module_init and module_exit. 
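+ *
+ * Example (hypothetical driver object; FOO_MAX_DEV is the number of
+ * devices the driver can handle):
+ *
+ *	static struct isa_driver foo_isa_driver = { ... };
+ *
+ *	module_isa_driver(foo_isa_driver, FOO_MAX_DEV);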
+ */ +#define module_isa_driver(__isa_driver, __num_isa_dev) \ +static int __init __isa_driver##_init(void) \ +{ \ + return isa_register_driver(&(__isa_driver), __num_isa_dev); \ +} \ +module_init(__isa_driver##_init); \ +static void __exit __isa_driver##_exit(void) \ +{ \ + isa_unregister_driver(&(__isa_driver)); \ +} \ +module_exit(__isa_driver##_exit); + #endif /* __LINUX_ISA_H */ -- cgit v1.2.3 From d9a9c6172d1cec51851e9015b6c4379635c31f1a Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Sun, 1 May 2016 18:43:10 -0400 Subject: isa: Implement the max_num_isa_dev macro max_num_isa_dev is a macro to determine the maximum possible number of ISA devices which may be registered in the I/O port address space given the address extent of the ISA devices. The highest base address possible for an ISA device is 0x3FF; this results in 1024 possible base addresses. Dividing the number of possible base addresses by the address extent taken by each device results in the maximum number of devices on a system. Signed-off-by: William Breathitt Gray Signed-off-by: Greg Kroah-Hartman --- include/linux/isa.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/isa.h b/include/linux/isa.h index e394917d18c2..5ab85281230b 100644 --- a/include/linux/isa.h +++ b/include/linux/isa.h @@ -57,4 +57,15 @@ static void __exit __isa_driver##_exit(void) \ } \ module_exit(__isa_driver##_exit); +/** + * max_num_isa_dev() - Maximum possible number registered of an ISA device + * @__ida_dev_ext: ISA device address extent + * + * The highest base address possible for an ISA device is 0x3FF; this results in + * 1024 possible base addresses. Dividing the number of possible base addresses + * by the address extent taken by each device results in the maximum number of + * devices on a system. + */ +#define max_num_isa_dev(__isa_dev_ext) (1024 / __isa_dev_ext) + #endif /* __LINUX_ISA_H */ -- cgit v1.2.3 From 4d31c6109a24892df461b6a98842935e80159a5e Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 2 May 2016 14:09:10 -0500 Subject: PCI: imx6: Implement reset sequence for i.MX6+ I.MX6+ has a dedicated bit for resetting PCIe core, which should be used instead of a regular reset sequence since using the latter will hang the SoC. This commit is based on c34068d48273e24d392d9a49a38be807954420ed from http://git.freescale.com/git/cgit.cgi/imx/linux-2.6-imx.git Tested-by: Gary Bisson Signed-off-by: Andrey Smirnov Signed-off-by: Bjorn Helgaas Reviewed-by: Fabio Estevam --- .../devicetree/bindings/pci/fsl,imx6q-pcie.txt | 2 +- drivers/pci/host/pci-imx6.c | 23 ++++++++++++++++++++-- include/linux/mfd/syscon/imx6q-iomuxc-gpr.h | 1 + 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt index d742f917505a..83aeb1f5a645 100644 --- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt +++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt @@ -4,7 +4,7 @@ This PCIe host controller is based on the Synopsis Designware PCIe IP and thus inherits all the common properties defined in designware-pcie.txt. Required properties: -- compatible: "fsl,imx6q-pcie", "fsl,imx6sx-pcie" +- compatible: "fsl,imx6q-pcie", "fsl,imx6sx-pcie", "fsl,imx6qp-pcie" - reg: base address and length of the PCIe controller - interrupts: A list of interrupt outputs of the controller. 
Must contain an entry for each entry in the interrupt-names property. diff --git a/drivers/pci/host/pci-imx6.c b/drivers/pci/host/pci-imx6.c index 6b5d8c2fe2f2..b741a36a67f3 100644 --- a/drivers/pci/host/pci-imx6.c +++ b/drivers/pci/host/pci-imx6.c @@ -34,7 +34,8 @@ enum imx6_pcie_variants { IMX6Q, - IMX6SX + IMX6SX, + IMX6QP, }; struct imx6_pcie { @@ -256,6 +257,11 @@ static int imx6_pcie_assert_core_reset(struct pcie_port *pp) IMX6SX_GPR5_PCIE_BTNRST_RESET, IMX6SX_GPR5_PCIE_BTNRST_RESET); break; + case IMX6QP: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, + IMX6Q_GPR1_PCIE_SW_RST, + IMX6Q_GPR1_PCIE_SW_RST); + break; case IMX6Q: /* * If the bootloader already enabled the link we need some @@ -310,6 +316,7 @@ static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie) regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, IMX6SX_GPR12_PCIE_TEST_POWERDOWN, 0); break; + case IMX6QP: /* FALLTHROUGH */ case IMX6Q: /* power up core phy and enable ref clock */ regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, @@ -370,9 +377,20 @@ static int imx6_pcie_deassert_core_reset(struct pcie_port *pp) !imx6_pcie->gpio_active_high); } - if (imx6_pcie->variant == IMX6SX) + switch (imx6_pcie->variant) { + case IMX6SX: regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR5, IMX6SX_GPR5_PCIE_BTNRST_RESET, 0); + break; + case IMX6QP: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, + IMX6Q_GPR1_PCIE_SW_RST, 0); + + usleep_range(200, 500); + break; + case IMX6Q: /* Nothing to do */ + break; + } return 0; @@ -718,6 +736,7 @@ static void imx6_pcie_shutdown(struct platform_device *pdev) static const struct of_device_id imx6_pcie_of_match[] = { { .compatible = "fsl,imx6q-pcie", .data = (void *)IMX6Q, }, { .compatible = "fsl,imx6sx-pcie", .data = (void *)IMX6SX, }, + { .compatible = "fsl,imx6qp-pcie", .data = (void *)IMX6QP, }, {}, }; MODULE_DEVICE_TABLE(of, imx6_pcie_of_match); diff --git a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h index 238c8db953eb..5b08e3c5325f 100644 --- a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h +++ b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h @@ -95,6 +95,7 @@ #define IMX6Q_GPR0_DMAREQ_MUX_SEL0_IOMUX BIT(0) #define IMX6Q_GPR1_PCIE_REQ_MASK (0x3 << 30) +#define IMX6Q_GPR1_PCIE_SW_RST BIT(29) #define IMX6Q_GPR1_PCIE_EXIT_L1 BIT(28) #define IMX6Q_GPR1_PCIE_RDY_L23 BIT(27) #define IMX6Q_GPR1_PCIE_ENTER_L1 BIT(26) -- cgit v1.2.3 From dcb0b5575d24a32f51a3f1003312fb94ed4e214a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 2 May 2016 21:30:04 -0400 Subject: tracing: Remove TRACE_EVENT_FL_USE_CALL_FILTER logic Nothing sets TRACE_EVENT_FL_USE_CALL_FILTER anymore. Remove it. 
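As a quick illustration of the result (not part of the patch): with the call-level flag gone, filter state lives only in struct trace_event_file, so a filter check reduces to the per-file flag. The helper name below is made up for the example.

	#include <linux/trace_events.h>

	/* sketch only: per-file filter state after this change */
	static inline bool example_event_is_filtered(struct trace_event_file *file)
	{
		return file->flags & EVENT_FILE_FL_FILTERED;
	}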
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 3 -- kernel/trace/trace_events_filter.c | 71 ++++++-------------------------------- 2 files changed, 10 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 3111a1efdad6..ba6456302e34 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -214,7 +214,6 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, TRACE_EVENT_FL_WAS_ENABLED_BIT, - TRACE_EVENT_FL_USE_CALL_FILTER_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, @@ -229,7 +228,6 @@ enum { * WAS_ENABLED - Set and stays set when an event was ever enabled * (used for module unloading, if a module event is enabled, * it is best to clear the buffers that used it). - * USE_CALL_FILTER - For trace internal events, don't use file filter * TRACEPOINT - Event is a tracepoint * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe @@ -240,7 +238,6 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), - TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b3f5051cd4e9..d1d27bf37a19 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -689,10 +689,7 @@ static void append_filter_err(struct filter_parse_state *ps, static inline struct event_filter *event_filter(struct trace_event_file *file) { - if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - return file->event_call->filter; - else - return file->filter; + return file->filter; } /* caller must hold event_mutex */ @@ -826,12 +823,7 @@ static void __free_preds(struct event_filter *filter) static void filter_disable(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags &= ~TRACE_EVENT_FL_FILTERED; - else - file->flags &= ~EVENT_FILE_FL_FILTERED; + file->flags &= ~EVENT_FILE_FL_FILTERED; } static void __free_filter(struct event_filter *filter) @@ -883,13 +875,8 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) static inline void __remove_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - filter_disable(file); - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - remove_filter_string(call->filter); - else - remove_filter_string(file->filter); + remove_filter_string(file->filter); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, @@ -906,15 +893,8 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, static inline void __free_subsystem_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { - __free_filter(call->filter); - call->filter = NULL; - } else { - __free_filter(file->filter); - file->filter = NULL; - } + __free_filter(file->filter); + file->filter = NULL; } static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, @@ -1718,69 +1698,38 @@ 
fail: static inline void event_set_filtered_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_FILTERED; - else - file->flags |= EVENT_FILE_FL_FILTERED; + file->flags |= EVENT_FILE_FL_FILTERED; } static inline void event_set_filter(struct trace_event_file *file, struct event_filter *filter) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - rcu_assign_pointer(call->filter, filter); - else - rcu_assign_pointer(file->filter, filter); + rcu_assign_pointer(file->filter, filter); } static inline void event_clear_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - RCU_INIT_POINTER(call->filter, NULL); - else - RCU_INIT_POINTER(file->filter, NULL); + RCU_INIT_POINTER(file->filter, NULL); } static inline void event_set_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags |= EVENT_FILE_FL_NO_SET_FILTER; + file->flags |= EVENT_FILE_FL_NO_SET_FILTER; } static inline void event_clear_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; + file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; } static inline bool event_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - if (file->flags & EVENT_FILE_FL_NO_SET_FILTER) return true; - if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && - (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) - return true; - return false; } -- cgit v1.2.3 From 502d6df11ae394301470703fa6e485a0dc133401 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:32:54 +0100 Subject: irqchip/gic-v2: Parse and export virtual GIC information For now, the firmware tables are parsed twice: once in the GIC drivers, a second time when initializing the vGIC. This means code duplication and makes it more tedious to add support for another firmware table (like ACPI). Introduce a new structure and set of helpers to get/set the virtual GIC information. Also fill up the structure for GICv2.
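To sketch the consumer side, a vGIC probe could fetch the exported data roughly as follows. Only gic_get_kvm_info() and the gic_kvm_info fields come from this patch; the probe function itself is hypothetical.

	#include <linux/irqchip/arm-gic-common.h>

	static int example_vgic_v2_probe(void)
	{
		const struct gic_kvm_info *info = gic_get_kvm_info();

		/* nothing registered, or not a GICv2 */
		if (!info || info->type != GIC_V2)
			return -ENODEV;

		/* the maintenance interrupt is mandatory for the vGIC */
		if (!info->maint_irq)
			return -ENXIO;

		/* info->vctrl and info->vcpu describe the GICH/GICV regions */
		return 0;
	}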
Signed-off-by: Julien Grall Signed-off-by: Christoffer Dall --- drivers/irqchip/irq-gic-common.c | 13 ++++++ drivers/irqchip/irq-gic-common.h | 3 ++ drivers/irqchip/irq-gic.c | 76 +++++++++++++++++++++++++++++++++- include/linux/irqchip/arm-gic-common.h | 33 +++++++++++++++ 4 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 include/linux/irqchip/arm-gic-common.h (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-common.c b/drivers/irqchip/irq-gic-common.c index f174ce0ca361..2e9443be2b14 100644 --- a/drivers/irqchip/irq-gic-common.c +++ b/drivers/irqchip/irq-gic-common.c @@ -21,6 +21,19 @@ #include "irq-gic-common.h" +static const struct gic_kvm_info *gic_kvm_info; + +const struct gic_kvm_info *gic_get_kvm_info(void) +{ + return gic_kvm_info; +} + +void gic_set_kvm_info(const struct gic_kvm_info *info) +{ + BUG_ON(gic_kvm_info != NULL); + gic_kvm_info = info; +} + void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks, void *data) { diff --git a/drivers/irqchip/irq-gic-common.h b/drivers/irqchip/irq-gic-common.h index fff697db8e22..205e5fddf6da 100644 --- a/drivers/irqchip/irq-gic-common.h +++ b/drivers/irqchip/irq-gic-common.h @@ -19,6 +19,7 @@ #include #include +#include struct gic_quirk { const char *desc; @@ -35,4 +36,6 @@ void gic_cpu_config(void __iomem *base, void (*sync_access)(void)); void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks, void *data); +void gic_set_kvm_info(const struct gic_kvm_info *info); + #endif /* _IRQ_GIC_COMMON_H */ diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 7a73786596cd..3f1d9fd3a462 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -102,6 +102,8 @@ static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE; static struct gic_chip_data gic_data[CONFIG_ARM_GIC_MAX_NR] __read_mostly; +static struct gic_kvm_info gic_v2_kvm_info; + #ifdef CONFIG_GIC_NON_BANKED static void __iomem *gic_get_percpu_base(union gic_base *base) { @@ -1189,6 +1191,29 @@ static bool gic_check_eoimode(struct device_node *node, void __iomem **base) return true; } +static void __init gic_of_setup_kvm_info(struct device_node *node) +{ + int ret; + struct resource *vctrl_res = &gic_v2_kvm_info.vctrl; + struct resource *vcpu_res = &gic_v2_kvm_info.vcpu; + + gic_v2_kvm_info.type = GIC_V2; + + gic_v2_kvm_info.maint_irq = irq_of_parse_and_map(node, 0); + if (!gic_v2_kvm_info.maint_irq) + return; + + ret = of_address_to_resource(node, 2, vctrl_res); + if (ret) + return; + + ret = of_address_to_resource(node, 3, vcpu_res); + if (ret) + return; + + gic_set_kvm_info(&gic_v2_kvm_info); +} + int __init gic_of_init(struct device_node *node, struct device_node *parent) { @@ -1218,8 +1243,10 @@ gic_of_init(struct device_node *node, struct device_node *parent) __gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, &node->fwnode); - if (!gic_cnt) + if (!gic_cnt) { gic_init_physaddr(node); + gic_of_setup_kvm_info(node); + } if (parent) { irq = irq_of_parse_and_map(node, 0); @@ -1248,6 +1275,10 @@ IRQCHIP_DECLARE(pl390, "arm,pl390", gic_of_init); static struct { phys_addr_t cpu_phys_base; + u32 maint_irq; + int maint_irq_mode; + phys_addr_t vctrl_base; + phys_addr_t vcpu_base; } acpi_data __initdata; static int __init @@ -1272,6 +1303,12 @@ gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header, return -EINVAL; acpi_data.cpu_phys_base = gic_cpu_base; + acpi_data.maint_irq = processor->vgic_interrupt; + acpi_data.maint_irq_mode = (processor->flags & ACPI_MADT_VGIC_IRQ_MODE) ? 
+ ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE; + acpi_data.vctrl_base = processor->gich_base_address; + acpi_data.vcpu_base = processor->gicv_base_address; + cpu_base_assigned = 1; return 0; } @@ -1302,6 +1339,41 @@ static bool __init gic_validate_dist(struct acpi_subtable_header *header, #define ACPI_GICV2_DIST_MEM_SIZE (SZ_4K) #define ACPI_GIC_CPU_IF_MEM_SIZE (SZ_8K) +#define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K) +#define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K) + +static void __init gic_acpi_setup_kvm_info(void) +{ + int irq; + struct resource *vctrl_res = &gic_v2_kvm_info.vctrl; + struct resource *vcpu_res = &gic_v2_kvm_info.vcpu; + + gic_v2_kvm_info.type = GIC_V2; + + if (!acpi_data.vctrl_base) + return; + + vctrl_res->flags = IORESOURCE_MEM; + vctrl_res->start = acpi_data.vctrl_base; + vctrl_res->end = vctrl_res->start + ACPI_GICV2_VCTRL_MEM_SIZE - 1; + + if (!acpi_data.vcpu_base) + return; + + vcpu_res->flags = IORESOURCE_MEM; + vcpu_res->start = acpi_data.vcpu_base; + vcpu_res->end = vcpu_res->start + ACPI_GICV2_VCPU_MEM_SIZE - 1; + + irq = acpi_register_gsi(NULL, acpi_data.maint_irq, + acpi_data.maint_irq_mode, + ACPI_ACTIVE_HIGH); + if (irq <= 0) + return; + + gic_v2_kvm_info.maint_irq = irq; + + gic_set_kvm_info(&gic_v2_kvm_info); +} static int __init gic_v2_acpi_init(struct acpi_subtable_header *header, const unsigned long end) @@ -1359,6 +1431,8 @@ static int __init gic_v2_acpi_init(struct acpi_subtable_header *header, if (IS_ENABLED(CONFIG_ARM_GIC_V2M)) gicv2m_init(NULL, gic_data[0].domain); + gic_acpi_setup_kvm_info(); + return 0; } IRQCHIP_ACPI_DECLARE(gic_v2, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h new file mode 100644 index 000000000000..ef34f6f35e91 --- /dev/null +++ b/include/linux/irqchip/arm-gic-common.h @@ -0,0 +1,33 @@ +/* + * include/linux/irqchip/arm-gic-common.h + * + * Copyright (C) 2016 ARM Limited, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __LINUX_IRQCHIP_ARM_GIC_COMMON_H +#define __LINUX_IRQCHIP_ARM_GIC_COMMON_H + +#include +#include + +enum gic_type { + GIC_V2, +}; + +struct gic_kvm_info { + /* GIC type */ + enum gic_type type; + /* Virtual CPU interface */ + struct resource vcpu; + /* Interrupt number */ + unsigned int maint_irq; + /* Virtual control interface */ + struct resource vctrl; +}; + +const struct gic_kvm_info *gic_get_kvm_info(void); + +#endif /* __LINUX_IRQCHIP_ARM_GIC_COMMON_H */ -- cgit v1.2.3 From 1839e576968f34b9a31da9f0033f8de12a1c9de6 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:32:57 +0100 Subject: irqchip/gic-v3: Parse and export virtual GIC information Fill up the recently introduced gic_kvm_info with the hardware information used for virtualization. 
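As with the GICv2 case, a consumer sketch (the probe function is hypothetical, only the helper and fields come from this series); note that GICv3 exports no GICH region and the GICV region is optional:

	static int example_vgic_v3_probe(void)
	{
		const struct gic_kvm_info *info = gic_get_kvm_info();

		if (!info || info->type != GIC_V3 || !info->maint_irq)
			return -ENODEV;

		/* a zero info->vcpu.start means no GICV region; guests are
		 * then limited to system-register access */
		return 0;
	}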
Signed-off-by: Julien Grall Cc: Thomas Gleixner Cc: Jason Cooper Cc: Marc Zyngier Signed-off-by: Christoffer Dall --- drivers/irqchip/irq-gic-v3.c | 114 ++++++++++++++++++++++++++++++++- include/linux/irqchip/arm-gic-common.h | 1 + 2 files changed, 114 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 6a9f5ff161a4..05a856073714 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -58,6 +59,8 @@ struct gic_chip_data { static struct gic_chip_data gic_data __read_mostly; static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE; +static struct gic_kvm_info gic_v3_kvm_info; + #define gic_data_rdist() (this_cpu_ptr(gic_data.rdists.rdist)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_sgi_base() (gic_data_rdist_rd_base() + SZ_64K) @@ -903,6 +906,30 @@ static int __init gic_validate_dist_version(void __iomem *dist_base) return 0; } +static void __init gic_of_setup_kvm_info(struct device_node *node) +{ + int ret; + struct resource r; + u32 gicv_idx; + + gic_v3_kvm_info.type = GIC_V3; + + gic_v3_kvm_info.maint_irq = irq_of_parse_and_map(node, 0); + if (!gic_v3_kvm_info.maint_irq) + return; + + if (of_property_read_u32(node, "#redistributor-regions", + &gicv_idx)) + gicv_idx = 1; + + gicv_idx += 3; /* Also skip GICD, GICC, GICH */ + ret = of_address_to_resource(node, gicv_idx, &r); + if (!ret) + gic_v3_kvm_info.vcpu = r; + + gic_set_kvm_info(&gic_v3_kvm_info); +} + static int __init gic_of_init(struct device_node *node, struct device_node *parent) { void __iomem *dist_base; @@ -954,8 +981,10 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions, redist_stride, &node->fwnode); - if (!err) + if (!err) { + gic_of_setup_kvm_info(node); return 0; + } out_unmap_rdist: for (i = 0; i < nr_redist_regions; i++) @@ -976,6 +1005,9 @@ static struct struct redist_region *redist_regs; u32 nr_redist_regions; bool single_redist; + u32 maint_irq; + int maint_irq_mode; + phys_addr_t vcpu_base; } acpi_data __initdata; static void __init @@ -1112,7 +1144,85 @@ static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, return true; } +static int __init gic_acpi_parse_virt_madt_gicc(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_interrupt *gicc = + (struct acpi_madt_generic_interrupt *)header; + int maint_irq_mode; + static int first_madt = true; + + /* Skip unusable CPUs */ + if (!(gicc->flags & ACPI_MADT_ENABLED)) + return 0; + + maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ? 
+ ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE; + + if (first_madt) { + first_madt = false; + + acpi_data.maint_irq = gicc->vgic_interrupt; + acpi_data.maint_irq_mode = maint_irq_mode; + acpi_data.vcpu_base = gicc->gicv_base_address; + + return 0; + } + + /* + * The maintenance interrupt and GICV should be the same for every CPU + */ + if ((acpi_data.maint_irq != gicc->vgic_interrupt) || + (acpi_data.maint_irq_mode != maint_irq_mode) || + (acpi_data.vcpu_base != gicc->gicv_base_address)) + return -EINVAL; + + return 0; +} + +static bool __init gic_acpi_collect_virt_info(void) +{ + int count; + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, + gic_acpi_parse_virt_madt_gicc, 0); + + return (count > 0); +} + #define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K) +#define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K) +#define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K) + +static void __init gic_acpi_setup_kvm_info(void) +{ + int irq; + + if (!gic_acpi_collect_virt_info()) { + pr_warn("Unable to get hardware information used for virtualization\n"); + return; + } + + gic_v3_kvm_info.type = GIC_V3; + + irq = acpi_register_gsi(NULL, acpi_data.maint_irq, + acpi_data.maint_irq_mode, + ACPI_ACTIVE_HIGH); + if (irq <= 0) + return; + + gic_v3_kvm_info.maint_irq = irq; + + if (acpi_data.vcpu_base) { + struct resource *vcpu = &gic_v3_kvm_info.vcpu; + + vcpu->flags = IORESOURCE_MEM; + vcpu->start = acpi_data.vcpu_base; + vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1; + } + + gic_set_kvm_info(&gic_v3_kvm_info); +} static int __init gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) @@ -1161,6 +1271,8 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) goto out_fwhandle_free; acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle); + gic_acpi_setup_kvm_info(); + return 0; out_fwhandle_free: diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h index ef34f6f35e91..c647b0547bcd 100644 --- a/include/linux/irqchip/arm-gic-common.h +++ b/include/linux/irqchip/arm-gic-common.h @@ -15,6 +15,7 @@ enum gic_type { GIC_V2, + GIC_V3, }; struct gic_kvm_info { -- cgit v1.2.3 From 10126ac14d36e74b2705802dc915b0b18463a51f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 2 May 2016 15:10:31 -0500 Subject: PCI: Add Downstream Port Containment portdrv service type Add the Downstream Port Containment (PCIE_PORT_SERVICE_DPC) portdrv service type, available if the device has the DPC extended capability. 
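For illustration, a service driver would bind to the new service type roughly as below. The skeleton is hypothetical; only PCIE_PORT_SERVICE_DPC is defined by this patch, and such a driver would be registered with pcie_port_service_register().

	#include <linux/pcieport_if.h>

	static int example_dpc_probe(struct pcie_device *dev)
	{
		/* placeholder: request dev->irq and enable DPC here */
		return 0;
	}

	static void example_dpc_remove(struct pcie_device *dev)
	{
	}

	static struct pcie_port_service_driver example_dpc_driver = {
		.name		= "example_dpc",
		.port_type	= PCIE_ANY_PORT,
		.service	= PCIE_PORT_SERVICE_DPC,
		.probe		= example_dpc_probe,
		.remove		= example_dpc_remove,
	};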
[bhelgaas: split to separate patch, changelog] Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/portdrv.h | 2 +- drivers/pci/pcie/portdrv_acpi.c | 2 +- drivers/pci/pcie/portdrv_core.c | 4 +++- include/linux/pcieport_if.h | 2 ++ include/uapi/linux/pci_regs.h | 3 ++- 5 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 63cb2ef9c5ae..7d82f6d47e68 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -11,7 +11,7 @@ #include -#define PCIE_PORT_DEVICE_MAXSERVICES 4 +#define PCIE_PORT_DEVICE_MAXSERVICES 5 /* * According to the PCI Express Base Specification 2.0, the indices of * the MSI-X table entries used by port services must not exceed 31 diff --git a/drivers/pci/pcie/portdrv_acpi.c b/drivers/pci/pcie/portdrv_acpi.c index b4d2894ee3fc..44296eb729d3 100644 --- a/drivers/pci/pcie/portdrv_acpi.c +++ b/drivers/pci/pcie/portdrv_acpi.c @@ -51,7 +51,7 @@ int pcie_port_acpi_setup(struct pci_dev *port, int *srv_mask) flags = root->osc_control_set; - *srv_mask = PCIE_PORT_SERVICE_VC; + *srv_mask = PCIE_PORT_SERVICE_VC | PCIE_PORT_SERVICE_DPC; if (flags & OSC_PCI_EXPRESS_NATIVE_HP_CONTROL) *srv_mask |= PCIE_PORT_SERVICE_HP; if (flags & OSC_PCI_EXPRESS_PME_CONTROL) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 94d3b82415c1..2ab0f424a378 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -262,7 +262,7 @@ static int get_port_device_capability(struct pci_dev *dev) return 0; cap_mask = PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP - | PCIE_PORT_SERVICE_VC; + | PCIE_PORT_SERVICE_VC | PCIE_PORT_SERVICE_DPC; if (pci_aer_available()) cap_mask |= PCIE_PORT_SERVICE_AER; @@ -311,6 +311,8 @@ static int get_port_device_capability(struct pci_dev *dev) */ pcie_pme_interrupt_enable(dev, false); } + if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DPC)) + services |= PCIE_PORT_SERVICE_DPC; return services; } diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 4f1089f2cc98..afcd130ab3a9 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -21,6 +21,8 @@ #define PCIE_PORT_SERVICE_HP (1 << PCIE_PORT_SERVICE_HP_SHIFT) #define PCIE_PORT_SERVICE_VC_SHIFT 3 /* Virtual Channel */ #define PCIE_PORT_SERVICE_VC (1 << PCIE_PORT_SERVICE_VC_SHIFT) +#define PCIE_PORT_SERVICE_DPC_SHIFT 4 /* Downstream Port Containment */ +#define PCIE_PORT_SERVICE_DPC (1 << PCIE_PORT_SERVICE_DPC_SHIFT) struct pcie_device { int irq; /* Service IRQ/MSI/MSI-X Vector */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 1becea86c73c..61e95c142547 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -670,7 +670,8 @@ #define PCI_EXT_CAP_ID_SECPCI 0x19 /* Secondary PCIe Capability */ #define PCI_EXT_CAP_ID_PMUX 0x1A /* Protocol Multiplexing */ #define PCI_EXT_CAP_ID_PASID 0x1B /* Process Address Space ID */ -#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PASID +#define PCI_EXT_CAP_ID_DPC 0x1D /* Downstream Port Containment */ +#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_DPC #define PCI_EXT_CAP_DSN_SIZEOF 12 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40 -- cgit v1.2.3 From e511267bc25e18926826e7cccdf7872bcbb4776a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 26 Apr 2016 11:38:20 +0100 Subject: io-64-nonatomic: Add relaxed accessor variants Whilst commit 9439eb3ab9d1 ("asm-generic: io: implement relaxed accessor macros as conditional 
wrappers") makes the *_relaxed forms of I/O accessors universally available to drivers, in cases where writeq() is implemented via the io-64-nonatomic helpers, writeq_relaxed() will end up falling back to writel() regardless of whether writel_relaxed() is available (identically for s/write/read/). Add corresponding relaxed forms of the nonatomic helpers to delegate to the equivalent 32-bit accessors as appropriate. We also need to fix io.h to avoid defining default relaxed variants if the basic accessors themselves don't exist. CC: Christoph Hellwig CC: Darren Hart CC: Hitoshi Mitake Acked-by: Arnd Bergmann Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- include/asm-generic/io.h | 4 ++-- include/linux/io-64-nonatomic-hi-lo.h | 25 +++++++++++++++++++++++++ include/linux/io-64-nonatomic-lo-hi.h | 25 +++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index eed3bbe88c8a..002b81f6f2bc 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -191,7 +191,7 @@ static inline void writeq(u64 value, volatile void __iomem *addr) #define readl_relaxed readl #endif -#ifndef readq_relaxed +#if defined(readq) && !defined(readq_relaxed) #define readq_relaxed readq #endif @@ -207,7 +207,7 @@ static inline void writeq(u64 value, volatile void __iomem *addr) #define writel_relaxed writel #endif -#ifndef writeq_relaxed +#if defined(writeq) && !defined(writeq_relaxed) #define writeq_relaxed writeq #endif diff --git a/include/linux/io-64-nonatomic-hi-lo.h b/include/linux/io-64-nonatomic-hi-lo.h index 11d7e840d913..defcc4644ce3 100644 --- a/include/linux/io-64-nonatomic-hi-lo.h +++ b/include/linux/io-64-nonatomic-hi-lo.h @@ -21,6 +21,23 @@ static inline void hi_lo_writeq(__u64 val, volatile void __iomem *addr) writel(val, addr); } +static inline __u64 hi_lo_readq_relaxed(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 low, high; + + high = readl_relaxed(p + 1); + low = readl_relaxed(p); + + return low + ((u64)high << 32); +} + +static inline void hi_lo_writeq_relaxed(__u64 val, volatile void __iomem *addr) +{ + writel_relaxed(val >> 32, addr + 4); + writel_relaxed(val, addr); +} + #ifndef readq #define readq hi_lo_readq #endif @@ -29,4 +46,12 @@ static inline void hi_lo_writeq(__u64 val, volatile void __iomem *addr) #define writeq hi_lo_writeq #endif +#ifndef readq_relaxed +#define readq_relaxed hi_lo_readq_relaxed +#endif + +#ifndef writeq_relaxed +#define writeq_relaxed hi_lo_writeq_relaxed +#endif + #endif /* _LINUX_IO_64_NONATOMIC_HI_LO_H_ */ diff --git a/include/linux/io-64-nonatomic-lo-hi.h b/include/linux/io-64-nonatomic-lo-hi.h index 1a4315f97360..084461a4e5ab 100644 --- a/include/linux/io-64-nonatomic-lo-hi.h +++ b/include/linux/io-64-nonatomic-lo-hi.h @@ -21,6 +21,23 @@ static inline void lo_hi_writeq(__u64 val, volatile void __iomem *addr) writel(val >> 32, addr + 4); } +static inline __u64 lo_hi_readq_relaxed(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 low, high; + + low = readl_relaxed(p); + high = readl_relaxed(p + 1); + + return low + ((u64)high << 32); +} + +static inline void lo_hi_writeq_relaxed(__u64 val, volatile void __iomem *addr) +{ + writel_relaxed(val, addr); + writel_relaxed(val >> 32, addr + 4); +} + #ifndef readq #define readq lo_hi_readq #endif @@ -29,4 +46,12 @@ static inline void lo_hi_writeq(__u64 val, volatile void __iomem *addr) #define writeq lo_hi_writeq #endif 
+#ifndef readq_relaxed +#define readq_relaxed lo_hi_readq_relaxed +#endif + +#ifndef writeq_relaxed +#define writeq_relaxed lo_hi_writeq_relaxed +#endif + #endif /* _LINUX_IO_64_NONATOMIC_LO_HI_H_ */ -- cgit v1.2.3 From 6fb650d43da3e7054984dc548eaa88765a94d49f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 29 Apr 2016 15:25:17 -0400 Subject: USB: leave LPM alone if possible when binding/unbinding interface drivers When a USB driver is bound to an interface (either through probing or by claiming it) or is unbound from an interface, the USB core always disables Link Power Management during the transition and then re-enables it afterward. The reason is because the driver might want to prevent hub-initiated link power transitions, in which case the HCD would have to recalculate the various LPM parameters. This recalculation takes place when LPM is re-enabled and the new parameters are sent to the device and its parent hub. However, if the driver does not want to prevent hub-initiated link power transitions then none of this work is necessary. The parameters don't need to be recalculated, and LPM doesn't need to be disabled and re-enabled. It turns out that disabling and enabling LPM can be time-consuming, enough so that it interferes with user programs that want to claim and release interfaces rapidly via usbfs. Since the usbfs kernel driver doesn't set the disable_hub_initiated_lpm flag, we can speed things up and get the user programs to work by leaving LPM alone whenever the flag isn't set. And while we're improving the way disable_hub_initiated_lpm gets used, let's also fix its kerneldoc. Signed-off-by: Alan Stern Tested-by: Matthew Giassa CC: Mathias Nyman CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/driver.c | 40 +++++++++++++++++++++++----------------- include/linux/usb.h | 2 +- 2 files changed, 24 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index 2057d91d8336..dadd1e8dfe09 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -284,7 +284,7 @@ static int usb_probe_interface(struct device *dev) struct usb_device *udev = interface_to_usbdev(intf); const struct usb_device_id *id; int error = -ENODEV; - int lpm_disable_error; + int lpm_disable_error = -ENODEV; dev_dbg(dev, "%s\n", __func__); @@ -336,12 +336,14 @@ static int usb_probe_interface(struct device *dev) * setting during probe, that should also be fine. usb_set_interface() * will attempt to disable LPM, and fail if it can't disable it. 
*/ - lpm_disable_error = usb_unlocked_disable_lpm(udev); - if (lpm_disable_error && driver->disable_hub_initiated_lpm) { - dev_err(&intf->dev, "%s Failed to disable LPM for driver %s\n.", - __func__, driver->name); - error = lpm_disable_error; - goto err; + if (driver->disable_hub_initiated_lpm) { + lpm_disable_error = usb_unlocked_disable_lpm(udev); + if (lpm_disable_error) { + dev_err(&intf->dev, "%s Failed to disable LPM for driver %s\n.", + __func__, driver->name); + error = lpm_disable_error; + goto err; + } } /* Carry out a deferred switch to altsetting 0 */ @@ -391,7 +393,8 @@ static int usb_unbind_interface(struct device *dev) struct usb_interface *intf = to_usb_interface(dev); struct usb_host_endpoint *ep, **eps = NULL; struct usb_device *udev; - int i, j, error, r, lpm_disable_error; + int i, j, error, r; + int lpm_disable_error = -ENODEV; intf->condition = USB_INTERFACE_UNBINDING; @@ -399,12 +402,13 @@ static int usb_unbind_interface(struct device *dev) udev = interface_to_usbdev(intf); error = usb_autoresume_device(udev); - /* Hub-initiated LPM policy may change, so attempt to disable LPM until + /* If hub-initiated LPM policy may change, attempt to disable LPM until * the driver is unbound. If LPM isn't disabled, that's fine because it * wouldn't be enabled unless all the bound interfaces supported * hub-initiated LPM. */ - lpm_disable_error = usb_unlocked_disable_lpm(udev); + if (driver->disable_hub_initiated_lpm) + lpm_disable_error = usb_unlocked_disable_lpm(udev); /* * Terminate all URBs for this interface unless the driver @@ -505,7 +509,7 @@ int usb_driver_claim_interface(struct usb_driver *driver, struct device *dev; struct usb_device *udev; int retval = 0; - int lpm_disable_error; + int lpm_disable_error = -ENODEV; if (!iface) return -ENODEV; @@ -526,12 +530,14 @@ int usb_driver_claim_interface(struct usb_driver *driver, iface->condition = USB_INTERFACE_BOUND; - /* Disable LPM until this driver is bound. */ - lpm_disable_error = usb_unlocked_disable_lpm(udev); - if (lpm_disable_error && driver->disable_hub_initiated_lpm) { - dev_err(&iface->dev, "%s Failed to disable LPM for driver %s\n.", - __func__, driver->name); - return -ENOMEM; + /* See the comment about disabling LPM in usb_probe_interface(). */ + if (driver->disable_hub_initiated_lpm) { + lpm_disable_error = usb_unlocked_disable_lpm(udev); + if (lpm_disable_error) { + dev_err(&iface->dev, "%s Failed to disable LPM for driver %s\n.", + __func__, driver->name); + return -ENOMEM; + } } /* Claimed interfaces are initially inactive (suspended) and diff --git a/include/linux/usb.h b/include/linux/usb.h index 01b6c61cf9bb..eba1f10e8cfd 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1068,7 +1068,7 @@ struct usbdrv_wrap { * for interfaces bound to this driver. * @soft_unbind: if set to 1, the USB core will not kill URBs and disable * endpoints before calling the driver's disconnect method. - * @disable_hub_initiated_lpm: if set to 0, the USB core will not allow hubs + * @disable_hub_initiated_lpm: if set to 1, the USB core will not allow hubs * to initiate lower power link state transitions when an idle timeout * occurs. Device-initiated USB 3.0 link PM will still be allowed. 
* -- cgit v1.2.3 From 8e996a2874bbbed30e8dfe881453825fc6b7654e Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 3 May 2016 11:33:37 -0600 Subject: stm class: Support devices that override software assigned masters Some STM devices adjust software-assigned master numbers depending on the trace source and its runtime state and whatnot. This patch adds a sysfs attribute to inform the trace-side software that master numbers assigned to software sources will not match those in the STP stream, so that, for example, master/channel allocation policy can be adjusted accordingly. Signed-off-by: Alexander Shishkin Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-class-stm | 10 ++++++++++ drivers/hwtracing/stm/core.c | 15 +++++++++++++++ include/linux/stm.h | 3 +++ 3 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-stm b/Documentation/ABI/testing/sysfs-class-stm index c9aa4f3fc9a7..77ed3da0f68e 100644 --- a/Documentation/ABI/testing/sysfs-class-stm +++ b/Documentation/ABI/testing/sysfs-class-stm @@ -12,3 +12,13 @@ KernelVersion: 4.3 Contact: Alexander Shishkin Description: Shows the number of channels per master on this STM device. + +What: /sys/class/stm/<stm_device_name>/hw_override +Date: March 2016 +KernelVersion: 4.7 +Contact: Alexander Shishkin +Description: + Reads as 0 if master numbers in the STP stream produced by + this stm device will match the master numbers assigned by + the software or 1 if the stm hardware overrides software + assigned masters. diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index 2591442e2c5b..ff31108b066f 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -67,9 +67,24 @@ static ssize_t channels_show(struct device *dev, static DEVICE_ATTR_RO(channels); +static ssize_t hw_override_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct stm_device *stm = to_stm_device(dev); + int ret; + + ret = sprintf(buf, "%u\n", stm->data->hw_override); + + return ret; +} + +static DEVICE_ATTR_RO(hw_override); + static struct attribute *stm_attrs[] = { &dev_attr_masters.attr, &dev_attr_channels.attr, + &dev_attr_hw_override.attr, NULL, }; diff --git a/include/linux/stm.h b/include/linux/stm.h index 1a79ed8e43da..8369d8a8cabd 100644 --- a/include/linux/stm.h +++ b/include/linux/stm.h @@ -50,6 +50,8 @@ struct stm_device; * @sw_end: last STP master available to software * @sw_nchannels: number of STP channels per master * @sw_mmiosz: size of one channel's IO space, for mmap, optional + * @hw_override: masters in the STP stream will not match the ones + * assigned by software, but are up to the STM hardware * @packet: callback that sends an STP packet * @mmio_addr: mmap callback, optional * @link: called when a new stm_source gets linked to us, optional @@ -85,6 +87,7 @@ struct stm_data { unsigned int sw_end; unsigned int sw_nchannels; unsigned int sw_mmiosz; + unsigned int hw_override; ssize_t (*packet)(struct stm_data *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, -- cgit v1.2.3 From 237483aa5cf43105d148d3f03b29eed47c3e6cf9 Mon Sep 17 00:00:00 2001 From: Pratik Patel Date: Tue, 3 May 2016 11:33:40 -0600 Subject: coresight: stm: adding driver for CoreSight STM component This driver adds support for the STM CoreSight IP block, allowing any system component (HW or SW) to log and aggregate messages via a single entity. The CoreSight STM exposes an application-defined number of channels called stimulus ports.
Configuration is done using entries in sysfs and channels made available to userspace via configfs. Signed-off-by: Pratik Patel Signed-off-by: Mathieu Poirier Reviewed-by: Michael Williams Signed-off-by: Chunyan Zhang Signed-off-by: Greg Kroah-Hartman --- .../ABI/testing/sysfs-bus-coresight-devices-stm | 53 ++ Documentation/trace/coresight.txt | 37 +- drivers/hwtracing/coresight/Kconfig | 11 + drivers/hwtracing/coresight/Makefile | 1 + drivers/hwtracing/coresight/coresight-stm.c | 920 +++++++++++++++++++++ include/linux/coresight-stm.h | 6 + include/uapi/linux/coresight-stm.h | 21 + 7 files changed, 1047 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-bus-coresight-devices-stm create mode 100644 drivers/hwtracing/coresight/coresight-stm.c create mode 100644 include/linux/coresight-stm.h create mode 100644 include/uapi/linux/coresight-stm.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm b/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm new file mode 100644 index 000000000000..1dffabe7f48d --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm @@ -0,0 +1,53 @@ +What: /sys/bus/coresight/devices/.stm/enable_source +Date: April 2016 +KernelVersion: 4.7 +Contact: Mathieu Poirier +Description: (RW) Enable/disable tracing on this specific trace macrocell. + Enabling the trace macrocell implies it has been configured + properly and a sink has been identified for it. The path + of coresight components linking the source to the sink is + configured and managed automatically by the coresight framework. + +What: /sys/bus/coresight/devices/.stm/hwevent_enable +Date: April 2016 +KernelVersion: 4.7 +Contact: Mathieu Poirier +Description: (RW) Provides access to the HW event enable register, used in + conjunction with HW event bank select register. + +What: /sys/bus/coresight/devices/.stm/hwevent_select +Date: April 2016 +KernelVersion: 4.7 +Contact: Mathieu Poirier +Description: (RW) Gives access to the HW event block select register + (STMHEBSR) in order to configure up to 256 channels. Used in + conjunction with "hwevent_enable" register as described above. + +What: /sys/bus/coresight/devices/.stm/port_enable +Date: April 2016 +KernelVersion: 4.7 +Contact: Mathieu Poirier +Description: (RW) Provides access to the stimulus port enable register + (STMSPER). Used in conjunction with "port_select" described + below. + +What: /sys/bus/coresight/devices/.stm/port_select +Date: April 2016 +KernelVersion: 4.7 +Contact: Mathieu Poirier +Description: (RW) Used to determine which bank of stimulus port bit in + register STMSPER (see above) apply to. + +What: /sys/bus/coresight/devices/.stm/status +Date: April 2016 +KernelVersion: 4.7 +Contact: Mathieu Poirier +Description: (R) List various control and status registers. The specific + layout and content is driver specific. + +What: /sys/bus/coresight/devices/.stm/traceid +Date: April 2016 +KernelVersion: 4.7 +Contact: Mathieu Poirier +Description: (RW) Holds the trace ID that will appear in the trace stream + coming from this trace entity. diff --git a/Documentation/trace/coresight.txt b/Documentation/trace/coresight.txt index 0a5c3290e732..a33c88cd5d1d 100644 --- a/Documentation/trace/coresight.txt +++ b/Documentation/trace/coresight.txt @@ -190,8 +190,8 @@ expected to be accessed and controlled using those entries. Last but not least, "struct module *owner" is expected to be set to reflect the information carried in "THIS_MODULE". 
-How to use ----------- +How to use the tracer modules +----------------------------- Before trace collection can start, a coresight sink needs to be identify. There is no limit on the amount of sinks (nor sources) that can be enabled at @@ -297,3 +297,36 @@ Info Tracing enabled Instruction 13570831 0x8026B584 E28DD00C false ADD sp,sp,#0xc Instruction 0 0x8026B588 E8BD8000 true LDM sp!,{pc} Timestamp Timestamp: 17107041535 + +How to use the STM module +------------------------- + +Using the System Trace Macrocell module is the same as the tracers - the only +difference is that clients are driving the trace capture rather +than the program flow through the code. + +As with any other CoreSight component, specifics about the STM tracer can be +found in sysfs with more information on each entry being found in [1]: + +root@genericarmv8:~# ls /sys/bus/coresight/devices/20100000.stm +enable_source hwevent_select port_enable subsystem uevent +hwevent_enable mgmt port_select traceid +root@genericarmv8:~# + +Like any other source a sink needs to be identified and the STM enabled before +being used: + +root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20010000.etf/enable_sink +root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20100000.stm/enable_source + +From there user space applications can request and use channels using the devfs +interface provided for that purpose by the generic STM API: + +root@genericarmv8:~# ls -l /dev/20100000.stm +crw------- 1 root root 10, 61 Jan 3 18:11 /dev/20100000.stm +root@genericarmv8:~# + +Details on how to use the generic STM API can be found here [2]. + +[1]. Documentation/ABI/testing/sysfs-bus-coresight-devices-stm +[2]. Documentation/trace/stm.txt diff --git a/drivers/hwtracing/coresight/Kconfig b/drivers/hwtracing/coresight/Kconfig index db0541031c72..130cb2114059 100644 --- a/drivers/hwtracing/coresight/Kconfig +++ b/drivers/hwtracing/coresight/Kconfig @@ -78,4 +78,15 @@ config CORESIGHT_QCOM_REPLICATOR programmable ATB replicator sends the ATB trace stream from the ETB/ETF to the TPIUi and ETR. +config CORESIGHT_STM + bool "CoreSight System Trace Macrocell driver" + depends on (ARM && !(CPU_32v3 || CPU_32v4 || CPU_32v4T)) || ARM64 + select CORESIGHT_LINKS_AND_SINKS + select STM + help + This driver provides support for hardware assisted software + instrumentation based tracing. This is primarily used for + logging useful software events or data coming from various entities + in the system, possibly running different OSs + endif diff --git a/drivers/hwtracing/coresight/Makefile b/drivers/hwtracing/coresight/Makefile index 1d0e32c7dbe4..c6f84b57f52a 100644 --- a/drivers/hwtracing/coresight/Makefile +++ b/drivers/hwtracing/coresight/Makefile @@ -13,3 +13,4 @@ obj-$(CONFIG_CORESIGHT_SOURCE_ETM3X) += coresight-etm3x.o coresight-etm-cp14.o \ obj-$(CONFIG_CORESIGHT_SOURCE_ETM4X) += coresight-etm4x.o \ coresight-etm4x-sysfs.o obj-$(CONFIG_CORESIGHT_QCOM_REPLICATOR) += coresight-replicator-qcom.o +obj-$(CONFIG_CORESIGHT_STM) += coresight-stm.o diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c new file mode 100644 index 000000000000..73be58a11e4f --- /dev/null +++ b/drivers/hwtracing/coresight/coresight-stm.c @@ -0,0 +1,920 @@ +/* Copyright (c) 2015-2016, The Linux Foundation. All rights reserved. 
+ * + * Description: CoreSight System Trace Macrocell driver + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Initial implementation by Pratik Patel + * (C) 2014-2015 Pratik Patel + * + * Serious refactoring, code cleanup and upgrading to the Coresight upstream + * framework by Mathieu Poirier + * (C) 2015-2016 Mathieu Poirier + * + * Guaranteed timing and support for various packet type coming from the + * generic STM API by Chunyan Zhang + * (C) 2015-2016 Chunyan Zhang + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "coresight-priv.h" + +#define STMDMASTARTR 0xc04 +#define STMDMASTOPR 0xc08 +#define STMDMASTATR 0xc0c +#define STMDMACTLR 0xc10 +#define STMDMAIDR 0xcfc +#define STMHEER 0xd00 +#define STMHETER 0xd20 +#define STMHEBSR 0xd60 +#define STMHEMCR 0xd64 +#define STMHEMASTR 0xdf4 +#define STMHEFEAT1R 0xdf8 +#define STMHEIDR 0xdfc +#define STMSPER 0xe00 +#define STMSPTER 0xe20 +#define STMPRIVMASKR 0xe40 +#define STMSPSCR 0xe60 +#define STMSPMSCR 0xe64 +#define STMSPOVERRIDER 0xe68 +#define STMSPMOVERRIDER 0xe6c +#define STMSPTRIGCSR 0xe70 +#define STMTCSR 0xe80 +#define STMTSSTIMR 0xe84 +#define STMTSFREQR 0xe8c +#define STMSYNCR 0xe90 +#define STMAUXCR 0xe94 +#define STMSPFEAT1R 0xea0 +#define STMSPFEAT2R 0xea4 +#define STMSPFEAT3R 0xea8 +#define STMITTRIGGER 0xee8 +#define STMITATBDATA0 0xeec +#define STMITATBCTR2 0xef0 +#define STMITATBID 0xef4 +#define STMITATBCTR0 0xef8 + +#define STM_32_CHANNEL 32 +#define BYTES_PER_CHANNEL 256 +#define STM_TRACE_BUF_SIZE 4096 +#define STM_SW_MASTER_END 127 + +/* Register bit definition */ +#define STMTCSR_BUSY_BIT 23 +/* Reserve the first 10 channels for kernel usage */ +#define STM_CHANNEL_OFFSET 0 + +enum stm_pkt_type { + STM_PKT_TYPE_DATA = 0x98, + STM_PKT_TYPE_FLAG = 0xE8, + STM_PKT_TYPE_TRIG = 0xF8, +}; + +#define stm_channel_addr(drvdata, ch) (drvdata->chs.base + \ + (ch * BYTES_PER_CHANNEL)) +#define stm_channel_off(type, opts) (type & ~opts) + +static int boot_nr_channel; + +/* + * Not really modular but using module_param is the easiest way to + * remain consistent with existing use cases for now. + */ +module_param_named( + boot_nr_channel, boot_nr_channel, int, S_IRUGO +); + +/** + * struct channel_space - central management entity for extended ports + * @base: memory mapped base address where channels start. + * @guaraneed: is the channel delivery guaranteed. + */ +struct channel_space { + void __iomem *base; + unsigned long *guaranteed; +}; + +/** + * struct stm_drvdata - specifics associated to an STM component + * @base: memory mapped base address for this component. + * @dev: the device entity associated to this component. + * @atclk: optional clock for the core parts of the STM. + * @csdev: component vitals needed by the framework. + * @spinlock: only one at a time pls. + * @chs: the channels accociated to this STM. + * @stm: structure associated to the generic STM interface. + * @mode: this tracer's mode, i.e sysFS, or disabled. + * @traceid: value of the current ID for this component. 
+ * @write_bytes: Maximus bytes this STM can write at a time. + * @stmsper: settings for register STMSPER. + * @stmspscr: settings for register STMSPSCR. + * @numsp: the total number of stimulus port support by this STM. + * @stmheer: settings for register STMHEER. + * @stmheter: settings for register STMHETER. + * @stmhebsr: settings for register STMHEBSR. + */ +struct stm_drvdata { + void __iomem *base; + struct device *dev; + struct clk *atclk; + struct coresight_device *csdev; + spinlock_t spinlock; + struct channel_space chs; + struct stm_data stm; + local_t mode; + u8 traceid; + u32 write_bytes; + u32 stmsper; + u32 stmspscr; + u32 numsp; + u32 stmheer; + u32 stmheter; + u32 stmhebsr; +}; + +static void stm_hwevent_enable_hw(struct stm_drvdata *drvdata) +{ + CS_UNLOCK(drvdata->base); + + writel_relaxed(drvdata->stmhebsr, drvdata->base + STMHEBSR); + writel_relaxed(drvdata->stmheter, drvdata->base + STMHETER); + writel_relaxed(drvdata->stmheer, drvdata->base + STMHEER); + writel_relaxed(0x01 | /* Enable HW event tracing */ + 0x04, /* Error detection on event tracing */ + drvdata->base + STMHEMCR); + + CS_LOCK(drvdata->base); +} + +static void stm_port_enable_hw(struct stm_drvdata *drvdata) +{ + CS_UNLOCK(drvdata->base); + /* ATB trigger enable on direct writes to TRIG locations */ + writel_relaxed(0x10, + drvdata->base + STMSPTRIGCSR); + writel_relaxed(drvdata->stmspscr, drvdata->base + STMSPSCR); + writel_relaxed(drvdata->stmsper, drvdata->base + STMSPER); + + CS_LOCK(drvdata->base); +} + +static void stm_enable_hw(struct stm_drvdata *drvdata) +{ + if (drvdata->stmheer) + stm_hwevent_enable_hw(drvdata); + + stm_port_enable_hw(drvdata); + + CS_UNLOCK(drvdata->base); + + /* 4096 byte between synchronisation packets */ + writel_relaxed(0xFFF, drvdata->base + STMSYNCR); + writel_relaxed((drvdata->traceid << 16 | /* trace id */ + 0x02 | /* timestamp enable */ + 0x01), /* global STM enable */ + drvdata->base + STMTCSR); + + CS_LOCK(drvdata->base); +} + +static int stm_enable(struct coresight_device *csdev, + struct perf_event_attr *attr, u32 mode) +{ + u32 val; + struct stm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); + + if (mode != CS_MODE_SYSFS) + return -EINVAL; + + val = local_cmpxchg(&drvdata->mode, CS_MODE_DISABLED, mode); + + /* Someone is already using the tracer */ + if (val) + return -EBUSY; + + pm_runtime_get_sync(drvdata->dev); + + spin_lock(&drvdata->spinlock); + stm_enable_hw(drvdata); + spin_unlock(&drvdata->spinlock); + + dev_info(drvdata->dev, "STM tracing enabled\n"); + return 0; +} + +static void stm_hwevent_disable_hw(struct stm_drvdata *drvdata) +{ + CS_UNLOCK(drvdata->base); + + writel_relaxed(0x0, drvdata->base + STMHEMCR); + writel_relaxed(0x0, drvdata->base + STMHEER); + writel_relaxed(0x0, drvdata->base + STMHETER); + + CS_LOCK(drvdata->base); +} + +static void stm_port_disable_hw(struct stm_drvdata *drvdata) +{ + CS_UNLOCK(drvdata->base); + + writel_relaxed(0x0, drvdata->base + STMSPER); + writel_relaxed(0x0, drvdata->base + STMSPTRIGCSR); + + CS_LOCK(drvdata->base); +} + +static void stm_disable_hw(struct stm_drvdata *drvdata) +{ + u32 val; + + CS_UNLOCK(drvdata->base); + + val = readl_relaxed(drvdata->base + STMTCSR); + val &= ~0x1; /* clear global STM enable [0] */ + writel_relaxed(val, drvdata->base + STMTCSR); + + CS_LOCK(drvdata->base); + + stm_port_disable_hw(drvdata); + if (drvdata->stmheer) + stm_hwevent_disable_hw(drvdata); +} + +static void stm_disable(struct coresight_device *csdev) +{ + struct stm_drvdata *drvdata = 
dev_get_drvdata(csdev->dev.parent); + + /* + * For as long as the tracer isn't disabled another entity can't + * change its status. As such we can read the status here without + * fearing it will change under us. + */ + if (local_read(&drvdata->mode) == CS_MODE_SYSFS) { + spin_lock(&drvdata->spinlock); + stm_disable_hw(drvdata); + spin_unlock(&drvdata->spinlock); + + /* Wait until the engine has completely stopped */ + coresight_timeout(drvdata, STMTCSR, STMTCSR_BUSY_BIT, 0); + + pm_runtime_put(drvdata->dev); + + local_set(&drvdata->mode, CS_MODE_DISABLED); + dev_info(drvdata->dev, "STM tracing disabled\n"); + } +} + +static int stm_trace_id(struct coresight_device *csdev) +{ + struct stm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); + + return drvdata->traceid; +} + +static const struct coresight_ops_source stm_source_ops = { + .trace_id = stm_trace_id, + .enable = stm_enable, + .disable = stm_disable, +}; + +static const struct coresight_ops stm_cs_ops = { + .source_ops = &stm_source_ops, +}; + +static inline bool stm_addr_unaligned(const void *addr, u8 write_bytes) +{ + return ((unsigned long)addr & (write_bytes - 1)); +} + +static void stm_send(void *addr, const void *data, u32 size, u8 write_bytes) +{ + u8 paload[8]; + + if (stm_addr_unaligned(data, write_bytes)) { + memcpy(paload, data, size); + data = paload; + } + + /* now we are 64bit/32bit aligned */ + switch (size) { +#ifdef CONFIG_64BIT + case 8: + writeq_relaxed(*(u64 *)data, addr); + break; +#endif + case 4: + writel_relaxed(*(u32 *)data, addr); + break; + case 2: + writew_relaxed(*(u16 *)data, addr); + break; + case 1: + writeb_relaxed(*(u8 *)data, addr); + break; + default: + break; + } +} + +static int stm_generic_link(struct stm_data *stm_data, + unsigned int master, unsigned int channel) +{ + struct stm_drvdata *drvdata = container_of(stm_data, + struct stm_drvdata, stm); + if (!drvdata || !drvdata->csdev) + return -EINVAL; + + return coresight_enable(drvdata->csdev); +} + +static void stm_generic_unlink(struct stm_data *stm_data, + unsigned int master, unsigned int channel) +{ + struct stm_drvdata *drvdata = container_of(stm_data, + struct stm_drvdata, stm); + if (!drvdata || !drvdata->csdev) + return; + + stm_disable(drvdata->csdev); +} + +static long stm_generic_set_options(struct stm_data *stm_data, + unsigned int master, + unsigned int channel, + unsigned int nr_chans, + unsigned long options) +{ + struct stm_drvdata *drvdata = container_of(stm_data, + struct stm_drvdata, stm); + if (!(drvdata && local_read(&drvdata->mode))) + return -EINVAL; + + if (channel >= drvdata->numsp) + return -EINVAL; + + switch (options) { + case STM_OPTION_GUARANTEED: + set_bit(channel, drvdata->chs.guaranteed); + break; + + case STM_OPTION_INVARIANT: + clear_bit(channel, drvdata->chs.guaranteed); + break; + + default: + return -EINVAL; + } + + return 0; +} + +static ssize_t stm_generic_packet(struct stm_data *stm_data, + unsigned int master, + unsigned int channel, + unsigned int packet, + unsigned int flags, + unsigned int size, + const unsigned char *payload) +{ + unsigned long ch_addr; + struct stm_drvdata *drvdata = container_of(stm_data, + struct stm_drvdata, stm); + + if (!(drvdata && local_read(&drvdata->mode))) + return 0; + + if (channel >= drvdata->numsp) + return 0; + + ch_addr = (unsigned long)stm_channel_addr(drvdata, channel); + + flags = (flags == STP_PACKET_TIMESTAMPED) ? STM_FLAG_TIMESTAMPED : 0; + flags |= test_bit(channel, drvdata->chs.guaranteed) ? 
+ STM_FLAG_GUARANTEED : 0; + + if (size > drvdata->write_bytes) + size = drvdata->write_bytes; + else + size = rounddown_pow_of_two(size); + + switch (packet) { + case STP_PACKET_FLAG: + ch_addr |= stm_channel_off(STM_PKT_TYPE_FLAG, flags); + + /* + * The generic STM core sets a size of '0' on flag packets. + * As such send a flag packet of size '1' and tell the + * core we did so. + */ + stm_send((void *)ch_addr, payload, 1, drvdata->write_bytes); + size = 1; + break; + + case STP_PACKET_DATA: + ch_addr |= stm_channel_off(STM_PKT_TYPE_DATA, flags); + stm_send((void *)ch_addr, payload, size, + drvdata->write_bytes); + break; + + default: + return -ENOTSUPP; + } + + return size; +} + +static ssize_t hwevent_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent); + unsigned long val = drvdata->stmheer; + + return scnprintf(buf, PAGE_SIZE, "%#lx\n", val); +} + +static ssize_t hwevent_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent); + unsigned long val; + int ret = 0; + + ret = kstrtoul(buf, 16, &val); + if (ret) + return -EINVAL; + + drvdata->stmheer = val; + /* HW event enable and trigger go hand in hand */ + drvdata->stmheter = val; + + return size; +} +static DEVICE_ATTR_RW(hwevent_enable); + +static ssize_t hwevent_select_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent); + unsigned long val = drvdata->stmhebsr; + + return scnprintf(buf, PAGE_SIZE, "%#lx\n", val); +} + +static ssize_t hwevent_select_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent); + unsigned long val; + int ret = 0; + + ret = kstrtoul(buf, 16, &val); + if (ret) + return -EINVAL; + + drvdata->stmhebsr = val; + + return size; +} +static DEVICE_ATTR_RW(hwevent_select); + +static ssize_t port_select_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent); + unsigned long val; + + if (!local_read(&drvdata->mode)) { + val = drvdata->stmspscr; + } else { + spin_lock(&drvdata->spinlock); + val = readl_relaxed(drvdata->base + STMSPSCR); + spin_unlock(&drvdata->spinlock); + } + + return scnprintf(buf, PAGE_SIZE, "%#lx\n", val); +} + +static ssize_t port_select_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent); + unsigned long val, stmsper; + int ret = 0; + + ret = kstrtoul(buf, 16, &val); + if (ret) + return ret; + + spin_lock(&drvdata->spinlock); + drvdata->stmspscr = val; + + if (local_read(&drvdata->mode)) { + CS_UNLOCK(drvdata->base); + /* Process as per ARM's TRM recommendation */ + stmsper = readl_relaxed(drvdata->base + STMSPER); + writel_relaxed(0x0, drvdata->base + STMSPER); + writel_relaxed(drvdata->stmspscr, drvdata->base + STMSPSCR); + writel_relaxed(stmsper, drvdata->base + STMSPER); + CS_LOCK(drvdata->base); + } + spin_unlock(&drvdata->spinlock); + + return size; +} +static DEVICE_ATTR_RW(port_select); + +static ssize_t port_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent); + unsigned long val; + + if (!local_read(&drvdata->mode)) { + val = drvdata->stmsper; + } else { 
+ spin_lock(&drvdata->spinlock); + val = readl_relaxed(drvdata->base + STMSPER); + spin_unlock(&drvdata->spinlock); + } + + return scnprintf(buf, PAGE_SIZE, "%#lx\n", val); +} + +static ssize_t port_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent); + unsigned long val; + int ret = 0; + + ret = kstrtoul(buf, 16, &val); + if (ret) + return ret; + + spin_lock(&drvdata->spinlock); + drvdata->stmsper = val; + + if (local_read(&drvdata->mode)) { + CS_UNLOCK(drvdata->base); + writel_relaxed(drvdata->stmsper, drvdata->base + STMSPER); + CS_LOCK(drvdata->base); + } + spin_unlock(&drvdata->spinlock); + + return size; +} +static DEVICE_ATTR_RW(port_enable); + +static ssize_t traceid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned long val; + struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent); + + val = drvdata->traceid; + return sprintf(buf, "%#lx\n", val); +} + +static ssize_t traceid_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + int ret; + unsigned long val; + struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent); + + ret = kstrtoul(buf, 16, &val); + if (ret) + return ret; + + /* traceid field is 7bit wide on STM32 */ + drvdata->traceid = val & 0x7f; + return size; +} +static DEVICE_ATTR_RW(traceid); + +#define coresight_stm_simple_func(name, offset) \ + coresight_simple_func(struct stm_drvdata, name, offset) + +coresight_stm_simple_func(tcsr, STMTCSR); +coresight_stm_simple_func(tsfreqr, STMTSFREQR); +coresight_stm_simple_func(syncr, STMSYNCR); +coresight_stm_simple_func(sper, STMSPER); +coresight_stm_simple_func(spter, STMSPTER); +coresight_stm_simple_func(privmaskr, STMPRIVMASKR); +coresight_stm_simple_func(spscr, STMSPSCR); +coresight_stm_simple_func(spmscr, STMSPMSCR); +coresight_stm_simple_func(spfeat1r, STMSPFEAT1R); +coresight_stm_simple_func(spfeat2r, STMSPFEAT2R); +coresight_stm_simple_func(spfeat3r, STMSPFEAT3R); +coresight_stm_simple_func(devid, CORESIGHT_DEVID); + +static struct attribute *coresight_stm_attrs[] = { + &dev_attr_hwevent_enable.attr, + &dev_attr_hwevent_select.attr, + &dev_attr_port_enable.attr, + &dev_attr_port_select.attr, + &dev_attr_traceid.attr, + NULL, +}; + +static struct attribute *coresight_stm_mgmt_attrs[] = { + &dev_attr_tcsr.attr, + &dev_attr_tsfreqr.attr, + &dev_attr_syncr.attr, + &dev_attr_sper.attr, + &dev_attr_spter.attr, + &dev_attr_privmaskr.attr, + &dev_attr_spscr.attr, + &dev_attr_spmscr.attr, + &dev_attr_spfeat1r.attr, + &dev_attr_spfeat2r.attr, + &dev_attr_spfeat3r.attr, + &dev_attr_devid.attr, + NULL, +}; + +static const struct attribute_group coresight_stm_group = { + .attrs = coresight_stm_attrs, +}; + +static const struct attribute_group coresight_stm_mgmt_group = { + .attrs = coresight_stm_mgmt_attrs, + .name = "mgmt", +}; + +static const struct attribute_group *coresight_stm_groups[] = { + &coresight_stm_group, + &coresight_stm_mgmt_group, + NULL, +}; + +static int stm_get_resource_byname(struct device_node *np, + char *ch_base, struct resource *res) +{ + const char *name = NULL; + int index = 0, found = 0; + + while (!of_property_read_string_index(np, "reg-names", index, &name)) { + if (strcmp(ch_base, name)) { + index++; + continue; + } + + /* We have a match and @index is where it's at */ + found = 1; + break; + } + + if (!found) + return -EINVAL; + + return of_address_to_resource(np, index, res); +} + +static u32 
stm_fundamental_data_size(struct stm_drvdata *drvdata)
+{
+	u32 stmspfeat2r;
+
+	if (!IS_ENABLED(CONFIG_64BIT))
+		return 4;
+
+	stmspfeat2r = readl_relaxed(drvdata->base + STMSPFEAT2R);
+
+	/*
+	 * bit[15:12] represents the fundamental data size
+	 * 0 - 32-bit data
+	 * 1 - 64-bit data
+	 */
+	return BMVAL(stmspfeat2r, 12, 15) ? 8 : 4;
+}
+
+static u32 stm_num_stimulus_port(struct stm_drvdata *drvdata)
+{
+	u32 numsp;
+
+	numsp = readl_relaxed(drvdata->base + CORESIGHT_DEVID);
+	/*
+	 * NUMSP in STMDEVID is 17 bits long and if equal to 0x0,
+	 * 32 stimulus ports are supported.
+	 */
+	numsp &= 0x1ffff;
+	if (!numsp)
+		numsp = STM_32_CHANNEL;
+	return numsp;
+}
+
+static void stm_init_default_data(struct stm_drvdata *drvdata)
+{
+	/* Don't use port selection */
+	drvdata->stmspscr = 0x0;
+	/*
+	 * Enable all channels regardless of their number. When port
+	 * selection isn't used (see above) STMSPER applies to all
+	 * 32 channel groups available, hence setting all 32 bits to 1
+	 */
+	drvdata->stmsper = ~0x0;
+
+	/*
+	 * The trace ID value for *ETM* tracers starts at CPU_ID * 2 + 0x10 and
+	 * anything equal to or higher than 0x70 is reserved. Since 0x00 is
+	 * also reserved the STM trace ID needs to be higher than 0x00 and
+	 * lower than 0x10.
+	 */
+	drvdata->traceid = 0x1;
+
+	/* Set invariant transaction timing on all channels */
+	bitmap_clear(drvdata->chs.guaranteed, 0, drvdata->numsp);
+}
+
+static void stm_init_generic_data(struct stm_drvdata *drvdata)
+{
+	drvdata->stm.name = dev_name(drvdata->dev);
+
+	/*
+	 * MasterIDs are assigned at HW design phase. As such the core is
+	 * using a single master for interaction with this device.
+	 */
+	drvdata->stm.sw_start = 1;
+	drvdata->stm.sw_end = 1;
+	drvdata->stm.hw_override = true;
+	drvdata->stm.sw_nchannels = drvdata->numsp;
+	drvdata->stm.packet = stm_generic_packet;
+	drvdata->stm.link = stm_generic_link;
+	drvdata->stm.unlink = stm_generic_unlink;
+	drvdata->stm.set_options = stm_generic_set_options;
+}
+
+static int stm_probe(struct amba_device *adev, const struct amba_id *id)
+{
+	int ret;
+	void __iomem *base;
+	unsigned long *guaranteed;
+	struct device *dev = &adev->dev;
+	struct coresight_platform_data *pdata = NULL;
+	struct stm_drvdata *drvdata;
+	struct resource *res = &adev->res;
+	struct resource ch_res;
+	size_t res_size, bitmap_size;
+	struct coresight_desc *desc;
+	struct device_node *np = adev->dev.of_node;
+
+	if (np) {
+		pdata = of_get_coresight_platform_data(dev, np);
+		if (IS_ERR(pdata))
+			return PTR_ERR(pdata);
+		adev->dev.platform_data = pdata;
+	}
+	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
+	if (!drvdata)
+		return -ENOMEM;
+
+	drvdata->dev = &adev->dev;
+	drvdata->atclk = devm_clk_get(&adev->dev, "atclk"); /* optional */
+	if (!IS_ERR(drvdata->atclk)) {
+		ret = clk_prepare_enable(drvdata->atclk);
+		if (ret)
+			return ret;
+	}
+	dev_set_drvdata(dev, drvdata);
+
+	base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+	drvdata->base = base;
+
+	ret = stm_get_resource_byname(np, "stm-stimulus-base", &ch_res);
+	if (ret)
+		return ret;
+
+	base = devm_ioremap_resource(dev, &ch_res);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+	drvdata->chs.base = base;
+
+	drvdata->write_bytes = stm_fundamental_data_size(drvdata);
+
+	if (boot_nr_channel) {
+		drvdata->numsp = boot_nr_channel;
+		res_size = min((resource_size_t)(boot_nr_channel *
+				  BYTES_PER_CHANNEL), resource_size(res));
+	} else {
+		drvdata->numsp = stm_num_stimulus_port(drvdata);
+		res_size = min((resource_size_t)(drvdata->numsp *
+				  BYTES_PER_CHANNEL), resource_size(res));
+	}
+	bitmap_size = BITS_TO_LONGS(drvdata->numsp) * sizeof(long);
+
+	guaranteed = devm_kzalloc(dev, bitmap_size, GFP_KERNEL);
+	if (!guaranteed)
+		return -ENOMEM;
+	drvdata->chs.guaranteed = guaranteed;
+
+	spin_lock_init(&drvdata->spinlock);
+
+	stm_init_default_data(drvdata);
+	stm_init_generic_data(drvdata);
+
+	if (stm_register_device(dev, &drvdata->stm, THIS_MODULE)) {
+		dev_info(dev,
+			 "stm_register_device failed, probing deferred\n");
+		return -EPROBE_DEFER;
+	}
+
+	desc = devm_kzalloc(dev, sizeof(*desc), GFP_KERNEL);
+	if (!desc) {
+		ret = -ENOMEM;
+		goto stm_unregister;
+	}
+
+	desc->type = CORESIGHT_DEV_TYPE_SOURCE;
+	desc->subtype.source_subtype = CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE;
+	desc->ops = &stm_cs_ops;
+	desc->pdata = pdata;
+	desc->dev = dev;
+	desc->groups = coresight_stm_groups;
+	drvdata->csdev = coresight_register(desc);
+	if (IS_ERR(drvdata->csdev)) {
+		ret = PTR_ERR(drvdata->csdev);
+		goto stm_unregister;
+	}
+
+	pm_runtime_put(&adev->dev);
+
+	dev_info(dev, "%s initialized\n", (char *)id->data);
+	return 0;
+
+stm_unregister:
+	stm_unregister_device(&drvdata->stm);
+	return ret;
+}
+
+#ifdef CONFIG_PM
+static int stm_runtime_suspend(struct device *dev)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev);
+
+	if (drvdata && !IS_ERR(drvdata->atclk))
+		clk_disable_unprepare(drvdata->atclk);
+
+	return 0;
+}
+
+static int stm_runtime_resume(struct device *dev)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev);
+
+	if (drvdata && !IS_ERR(drvdata->atclk))
+		clk_prepare_enable(drvdata->atclk);
+
+	return 0;
+}
+#endif
+
+static const struct dev_pm_ops stm_dev_pm_ops = {
+	SET_RUNTIME_PM_OPS(stm_runtime_suspend, stm_runtime_resume, NULL)
+};
+
+static struct amba_id stm_ids[] = {
+	{
+		.id	= 0x0003b962,
+		.mask	= 0x0003ffff,
+		.data	= "STM32",
+	},
+	{ 0, 0},
+};
+
+static struct amba_driver stm_driver = {
+	.drv = {
+		.name	= "coresight-stm",
+		.owner	= THIS_MODULE,
+		.pm	= &stm_dev_pm_ops,
+		.suppress_bind_attrs = true,
+	},
+	.probe		= stm_probe,
+	.id_table	= stm_ids,
+};
+
+builtin_amba_driver(stm_driver);
diff --git a/include/linux/coresight-stm.h b/include/linux/coresight-stm.h
new file mode 100644
index 000000000000..a978bb85599a
--- /dev/null
+++ b/include/linux/coresight-stm.h
@@ -0,0 +1,6 @@
+#ifndef __LINUX_CORESIGHT_STM_H_
+#define __LINUX_CORESIGHT_STM_H_
+
+#include <uapi/linux/coresight-stm.h>
+
+#endif
diff --git a/include/uapi/linux/coresight-stm.h b/include/uapi/linux/coresight-stm.h
new file mode 100644
index 000000000000..7e4272cf1fb2
--- /dev/null
+++ b/include/uapi/linux/coresight-stm.h
@@ -0,0 +1,21 @@
+#ifndef __UAPI_CORESIGHT_STM_H_
+#define __UAPI_CORESIGHT_STM_H_
+
+#define STM_FLAG_TIMESTAMPED   BIT(3)
+#define STM_FLAG_GUARANTEED    BIT(7)
+
+/*
+ * The CoreSight STM supports guaranteed and invariant timing
+ * transactions. Guaranteed transactions are guaranteed to be
+ * traced; this might involve stalling the bus or system to
+ * ensure the transaction is accepted by the STM. Invariant
+ * timing transactions, in contrast, are not guaranteed to be
+ * traced, but they will take an invariant amount of time
+ * regardless of the state of the STM.
+ */
+enum {
+	STM_OPTION_GUARANTEED = 0,
+	STM_OPTION_INVARIANT,
+};
+
+#endif
--
cgit v1.2.3


From 18d28819809909c3f24bb72183a901c5e332a63d Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Tue, 3 May 2016 09:46:22 +0200
Subject: mcb: Correctly initialize the bus's device

The mcb bus' device member wasn't correctly initialized and thus wasn't
placed correctly into the driver model.
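As an illustration of the pattern the fix moves to, here is a minimal,
generic sketch, not the patch itself: the "my_bus" naming is hypothetical,
and the error path below uses put_device(), whereas the patch frees the
bus directly.

#include <linux/device.h>

struct my_bus {
	struct device dev;
	int bus_nr;
};

static int my_bus_register(struct my_bus *bus, struct device *parent)
{
	int rc;

	/* Init refcount/kobject state; put_device() is safe afterwards */
	device_initialize(&bus->dev);
	bus->dev.parent = parent;	/* place it in the device tree */

	rc = dev_set_name(&bus->dev, "mybus:%d", bus->bus_nr);
	if (rc)
		goto err;

	/* Make the device visible to the driver model */
	rc = device_add(&bus->dev);
	if (rc)
		goto err;

	return 0;

err:
	put_device(&bus->dev);
	return rc;
}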
Signed-off-by: Johannes Thumshirn Reviewed-by: Andreas Werner Tested-by: Andreas Werner Signed-off-by: Greg Kroah-Hartman --- drivers/mcb/mcb-core.c | 19 ++++++++++++++++--- include/linux/mcb.h | 5 ++--- 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/mcb/mcb-core.c b/drivers/mcb/mcb-core.c index a4be451074e5..1e336cc56751 100644 --- a/drivers/mcb/mcb-core.c +++ b/drivers/mcb/mcb-core.c @@ -187,6 +187,7 @@ struct mcb_bus *mcb_alloc_bus(struct device *carrier) { struct mcb_bus *bus; int bus_nr; + int rc; bus = kzalloc(sizeof(struct mcb_bus), GFP_KERNEL); if (!bus) @@ -194,14 +195,26 @@ struct mcb_bus *mcb_alloc_bus(struct device *carrier) bus_nr = ida_simple_get(&mcb_ida, 0, 0, GFP_KERNEL); if (bus_nr < 0) { - kfree(bus); - return ERR_PTR(bus_nr); + rc = bus_nr; + goto err_free; } - INIT_LIST_HEAD(&bus->children); bus->bus_nr = bus_nr; bus->carrier = carrier; + + device_initialize(&bus->dev); + bus->dev.parent = carrier; + bus->dev.bus = &mcb_bus_type; + + dev_set_name(&bus->dev, "mcb:%d", bus_nr); + rc = device_add(&bus->dev); + if (rc) + goto err_free; + return bus; +err_free: + kfree(bus); + return ERR_PTR(rc); } EXPORT_SYMBOL_GPL(mcb_alloc_bus); diff --git a/include/linux/mcb.h b/include/linux/mcb.h index ed06e15a36aa..3efafbca166d 100644 --- a/include/linux/mcb.h +++ b/include/linux/mcb.h @@ -21,13 +21,12 @@ struct mcb_device; /** * struct mcb_bus - MEN Chameleon Bus * - * @dev: pointer to carrier device - * @children: the child busses + * @dev: bus device + * @carrier: pointer to carrier device * @bus_nr: mcb bus number * @get_irq: callback to get IRQ number */ struct mcb_bus { - struct list_head children; struct device dev; struct device *carrier; int bus_nr; -- cgit v1.2.3 From 803f1ca60d5c0107adfbce4e2d70488598b03a80 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 3 May 2016 09:46:23 +0200 Subject: mcb: export bus information via sysfs Export information about the bus stored in the FPGA's header to userspace via sysfs, instead of hiding it in pr_debug()s from everyone. 
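For illustration, the exported attributes read like any other sysfs file;
a minimal userspace sketch follows (the "mcb:0" path component is an
example bus number):

#include <stdio.h>

int main(void)
{
	char buf[32];
	FILE *f = fopen("/sys/bus/mcb/devices/mcb:0/revision", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("FPGA revision: %s", buf); /* value is newline terminated */
	fclose(f);
	return 0;
}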
Signed-off-by: Johannes Thumshirn Reviewed-by: Andreas Werner Tested-by: Andreas Werner Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-bus-mcb | 29 ++++++++++++++++ drivers/mcb/mcb-core.c | 60 +++++++++++++++++++++++++++++++++ drivers/mcb/mcb-internal.h | 1 - drivers/mcb/mcb-parse.c | 15 +++------ include/linux/mcb.h | 9 +++++ 5 files changed, 103 insertions(+), 11 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-bus-mcb (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-mcb b/Documentation/ABI/testing/sysfs-bus-mcb new file mode 100644 index 000000000000..77947c509796 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-mcb @@ -0,0 +1,29 @@ +What: /sys/bus/mcb/devices/mcb:X +Date: March 2016 +KernelVersion: 4.7 +Contact: Johannes Thumshirn +Description: Hardware chip or device hosting the MEN chameleon bus + +What: /sys/bus/mcb/devices/mcb:X/revision +Date: March 2016 +KernelVersion: 4.7 +Contact: Johannes Thumshirn +Description: The FPGA's revision number + +What: /sys/bus/mcb/devices/mcb:X/minor +Date: March 2016 +KernelVersion: 4.7 +Contact: Johannes Thumshirn +Description: The FPGA's minor number + +What: /sys/bus/mcb/devices/mcb:X/model +Date: March 2016 +KernelVersion: 4.7 +Contact: Johannes Thumshirn +Description: The FPGA's model number + +What: /sys/bus/mcb/devices/mcb:X/name +Date: March 2016 +KernelVersion: 4.7 +Contact: Johannes Thumshirn +Description: The FPGA's name diff --git a/drivers/mcb/mcb-core.c b/drivers/mcb/mcb-core.c index 1e336cc56751..9ae4d15fc229 100644 --- a/drivers/mcb/mcb-core.c +++ b/drivers/mcb/mcb-core.c @@ -90,6 +90,60 @@ static void mcb_shutdown(struct device *dev) mdrv->shutdown(mdev); } +static ssize_t revision_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct mcb_bus *bus = to_mcb_bus(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", bus->revision); +} +static DEVICE_ATTR_RO(revision); + +static ssize_t model_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct mcb_bus *bus = to_mcb_bus(dev); + + return scnprintf(buf, PAGE_SIZE, "%c\n", bus->model); +} +static DEVICE_ATTR_RO(model); + +static ssize_t minor_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct mcb_bus *bus = to_mcb_bus(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", bus->minor); +} +static DEVICE_ATTR_RO(minor); + +static ssize_t name_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct mcb_bus *bus = to_mcb_bus(dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", bus->name); +} +static DEVICE_ATTR_RO(name); + +static struct attribute *mcb_bus_attrs[] = { + &dev_attr_revision.attr, + &dev_attr_model.attr, + &dev_attr_minor.attr, + &dev_attr_name.attr, + NULL, +}; + +static const struct attribute_group mcb_carrier_group = { + .attrs = mcb_bus_attrs, +}; + +static const struct attribute_group *mcb_carrier_groups[] = { + &mcb_carrier_group, + NULL, +}; + + static struct bus_type mcb_bus_type = { .name = "mcb", .match = mcb_match, @@ -99,6 +153,11 @@ static struct bus_type mcb_bus_type = { .shutdown = mcb_shutdown, }; +static struct device_type mcb_carrier_device_type = { + .name = "mcb-carrier", + .groups = mcb_carrier_groups, +}; + /** * __mcb_register_driver() - Register a @mcb_driver at the system * @drv: The @mcb_driver @@ -205,6 +264,7 @@ struct mcb_bus *mcb_alloc_bus(struct device *carrier) device_initialize(&bus->dev); bus->dev.parent = carrier; bus->dev.bus = &mcb_bus_type; + bus->dev.type = 
&mcb_carrier_device_type;

 	dev_set_name(&bus->dev, "mcb:%d", bus_nr);
 	rc = device_add(&bus->dev);
diff --git a/drivers/mcb/mcb-internal.h b/drivers/mcb/mcb-internal.h
index fb7493dcfb79..5254e0285725 100644
--- a/drivers/mcb/mcb-internal.h
+++ b/drivers/mcb/mcb-internal.h
@@ -5,7 +5,6 @@
 #define PCI_VENDOR_ID_MEN		0x1a88
 #define PCI_DEVICE_ID_MEN_CHAMELEON	0x4d45
-#define CHAMELEON_FILENAME_LEN	12
 #define CHAMELEONV2_MAGIC	0xabce
 #define CHAM_HEADER_SIZE	0x200
diff --git a/drivers/mcb/mcb-parse.c b/drivers/mcb/mcb-parse.c
index 004926955263..35f385b59221 100644
--- a/drivers/mcb/mcb-parse.c
+++ b/drivers/mcb/mcb-parse.c
@@ -113,16 +113,11 @@ int chameleon_parse_cells(struct mcb_bus *bus, phys_addr_t mapbase,
 	}
 	p += hsize;

-	pr_debug("header->revision = %d\n", header->revision);
-	pr_debug("header->model = 0x%x ('%c')\n", header->model,
-		header->model);
-	pr_debug("header->minor = %d\n", header->minor);
-	pr_debug("header->bus_type = 0x%x\n", header->bus_type);
-
-
-	pr_debug("header->magic = 0x%x\n", header->magic);
-	pr_debug("header->filename = \"%.*s\"\n", CHAMELEON_FILENAME_LEN,
-		header->filename);
+	bus->revision = header->revision;
+	bus->model = header->model;
+	bus->minor = header->minor;
+	snprintf(bus->name, CHAMELEON_FILENAME_LEN + 1, "%s",
+		 header->filename);

 	for_each_chameleon_cell(dtype, p) {
 		switch (dtype) {
diff --git a/include/linux/mcb.h b/include/linux/mcb.h
index 3efafbca166d..ead13d233a97 100644
--- a/include/linux/mcb.h
+++ b/include/linux/mcb.h
@@ -15,6 +15,8 @@
 #include
 #include

+#define CHAMELEON_FILENAME_LEN	12
+
 struct mcb_driver;
 struct mcb_device;

@@ -25,11 +27,18 @@ struct mcb_device;
  * @carrier: pointer to carrier device
  * @bus_nr: mcb bus number
  * @get_irq: callback to get IRQ number
+ * @revision: the FPGA's revision number
+ * @model: the FPGA's model number
+ * @name: the FPGA's name
  */
 struct mcb_bus {
 	struct device dev;
 	struct device *carrier;
 	int bus_nr;
+	u8 revision;
+	char model;
+	u8 minor;
+	char name[CHAMELEON_FILENAME_LEN + 1];
 	int (*get_irq)(struct mcb_device *dev);
 };
 #define to_mcb_bus(b) container_of((b), struct mcb_bus, dev)
--
cgit v1.2.3


From 8320f495cf441d593f7cd4f30e6b63455be71a2c Mon Sep 17 00:00:00 2001
From: Peter Rosin
Date: Wed, 4 May 2016 22:15:27 +0200
Subject: i2c: allow adapter drivers to override the adapter locking

Add i2c_lock_bus() and i2c_unlock_bus(), which call the new lock_bus and
unlock_bus ops in the adapter. These funcs/ops take an additional flags
argument that indicates for what purpose the adapter is locked.

There are two flags, I2C_LOCK_ROOT_ADAPTER and I2C_LOCK_SEGMENT, but they
are both implemented the same. For now. Locking the root adapter means
that the whole bus is locked; locking the segment means that only the
current bus segment is locked (i.e. i2c traffic on the parent side of a
mux is still allowed even if the child side of the mux is locked).

Also support a trylock_bus op (but no function to call it, as it is not
expected to be needed outside of the i2c core).

Implement i2c_lock_adapter/i2c_unlock_adapter in terms of the new locking
scheme (i.e. lock with the I2C_LOCK_ROOT_ADAPTER flag).

Locking the root adapter and locking the segment is the same thing for
all root adapters (e.g. in the normal case of a simple topology with no
i2c muxes). The two locking variants are also the same for traditional
muxes (aka parent-locked muxes). These muxes traverse the tree, locking
each level as they go until they reach the root.
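To make the new flags concrete: a client driver that wants two transfers
to be atomic can, under this scheme, take the lock itself and issue the
transfers with the unlocked __i2c_transfer(). The following is an
illustrative sketch only, not part of the patch; the function name and
the caller-provided messages are hypothetical.

#include <linux/i2c.h>

static int my_atomic_pair(struct i2c_client *client,
			  struct i2c_msg *first, struct i2c_msg *second)
{
	struct i2c_adapter *adap = client->adapter;
	int ret;

	/*
	 * I2C_LOCK_SEGMENT locks only the current bus segment, which
	 * for a simple topology is the same as I2C_LOCK_ROOT_ADAPTER,
	 * i.e. locking the whole bus.
	 */
	i2c_lock_bus(adap, I2C_LOCK_SEGMENT);

	ret = __i2c_transfer(adap, first, 1);
	if (ret >= 0)
		ret = __i2c_transfer(adap, second, 1);

	i2c_unlock_bus(adap, I2C_LOCK_SEGMENT);

	return ret;
}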
This patch is preparatory for a later patch in the series introducing
mux-locked muxes, which behave differently depending on the requested
locking. Since all current users are using i2c_lock_adapter, which is a
wrapper for I2C_LOCK_ROOT_ADAPTER, we only need to annotate the calls
that will not need to lock the root adapter for mux-locked muxes. I.e.
the instances that need to use I2C_LOCK_SEGMENT instead of
i2c_lock_adapter/I2C_LOCK_ROOT_ADAPTER. Those instances are in the
i2c_transfer and i2c_smbus_xfer functions, so that mux-locked muxes can
single out normal i2c accesses to their slave side and adjust the
locking for those accesses.

Signed-off-by: Peter Rosin
Signed-off-by: Wolfram Sang
---
 drivers/i2c/i2c-core.c | 41 +++++++++++++++++++++++++++--------------
 include/linux/i2c.h    | 44 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 4979728f7fb2..7ef5bd085476 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -954,10 +954,13 @@ static int i2c_check_addr_busy(struct i2c_adapter *adapter, int addr)
 }

 /**
- * i2c_lock_adapter - Get exclusive access to an I2C bus segment
+ * i2c_adapter_lock_bus - Get exclusive access to an I2C bus segment
  * @adapter: Target I2C bus segment
+ * @flags: I2C_LOCK_ROOT_ADAPTER locks the root i2c adapter, I2C_LOCK_SEGMENT
+ *	locks only this branch in the adapter tree
  */
-void i2c_lock_adapter(struct i2c_adapter *adapter)
+static void i2c_adapter_lock_bus(struct i2c_adapter *adapter,
+				 unsigned int flags)
 {
 	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);

@@ -966,27 +969,32 @@ void i2c_lock_adapter(struct i2c_adapter *adapter)
 	else
 		rt_mutex_lock(&adapter->bus_lock);
 }
-EXPORT_SYMBOL_GPL(i2c_lock_adapter);

 /**
- * i2c_trylock_adapter - Try to get exclusive access to an I2C bus segment
+ * i2c_adapter_trylock_bus - Try to get exclusive access to an I2C bus segment
  * @adapter: Target I2C bus segment
+ * @flags: I2C_LOCK_ROOT_ADAPTER trylocks the root i2c adapter, I2C_LOCK_SEGMENT
+ *	trylocks only this branch in the adapter tree
  */
-static int i2c_trylock_adapter(struct i2c_adapter *adapter)
+static int i2c_adapter_trylock_bus(struct i2c_adapter *adapter,
+				   unsigned int flags)
 {
 	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);

 	if (parent)
-		return i2c_trylock_adapter(parent);
+		return parent->trylock_bus(parent, flags);
 	else
 		return rt_mutex_trylock(&adapter->bus_lock);
 }

 /**
- * i2c_unlock_adapter - Release exclusive access to an I2C bus segment
+ * i2c_adapter_unlock_bus - Release exclusive access to an I2C bus segment
  * @adapter: Target I2C bus segment
+ * @flags: I2C_LOCK_ROOT_ADAPTER unlocks the root i2c adapter, I2C_LOCK_SEGMENT
+ *	unlocks only this branch in the adapter tree
  */
-void i2c_unlock_adapter(struct i2c_adapter *adapter)
+static void i2c_adapter_unlock_bus(struct i2c_adapter *adapter,
+				   unsigned int flags)
 {
 	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);

@@ -995,7 +1003,6 @@ void i2c_unlock_adapter(struct i2c_adapter *adapter)
 	else
 		rt_mutex_unlock(&adapter->bus_lock);
 }
-EXPORT_SYMBOL_GPL(i2c_unlock_adapter);

 static void i2c_dev_set_name(struct i2c_adapter *adap,
 			     struct i2c_client *client)
@@ -1541,6 +1548,12 @@ static int i2c_register_adapter(struct i2c_adapter *adap)
 		return -EINVAL;
 	}

+	if (!adap->lock_bus) {
+		adap->lock_bus = i2c_adapter_lock_bus;
+		adap->trylock_bus = i2c_adapter_trylock_bus;
+		adap->unlock_bus = i2c_adapter_unlock_bus;
+	}
+
 	rt_mutex_init(&adap->bus_lock);
 	mutex_init(&adap->userspace_clients_lock);
 	INIT_LIST_HEAD(&adap->userspace_clients);
@@ -2310,16 +2323,16 @@ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
 #endif

 		if (in_atomic() || irqs_disabled()) {
-			ret = i2c_trylock_adapter(adap);
+			ret = adap->trylock_bus(adap, I2C_LOCK_SEGMENT);
 			if (!ret)
 				/* I2C activity is ongoing. */
 				return -EAGAIN;
 		} else {
-			i2c_lock_adapter(adap);
+			i2c_lock_bus(adap, I2C_LOCK_SEGMENT);
 		}

 		ret = __i2c_transfer(adap, msgs, num);
-		i2c_unlock_adapter(adap);
+		i2c_unlock_bus(adap, I2C_LOCK_SEGMENT);

 		return ret;
 	} else {
@@ -3094,7 +3107,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 	flags &= I2C_M_TEN | I2C_CLIENT_PEC | I2C_CLIENT_SCCB;

 	if (adapter->algo->smbus_xfer) {
-		i2c_lock_adapter(adapter);
+		i2c_lock_bus(adapter, I2C_LOCK_SEGMENT);

 		/* Retry automatically on arbitration loss */
 		orig_jiffies = jiffies;
@@ -3108,7 +3121,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 				       orig_jiffies + adapter->timeout))
 				break;
 		}
-		i2c_unlock_adapter(adapter);
+		i2c_unlock_bus(adapter, I2C_LOCK_SEGMENT);

 		if (res != -EOPNOTSUPP || !adapter->algo->master_xfer)
 			goto trace;
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index c30833b7b073..50934d6e1050 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -538,6 +538,10 @@ struct i2c_adapter {

 	struct i2c_bus_recovery_info *bus_recovery_info;
 	const struct i2c_adapter_quirks *quirks;
+
+	void (*lock_bus)(struct i2c_adapter *, unsigned int flags);
+	int (*trylock_bus)(struct i2c_adapter *, unsigned int flags);
+	void (*unlock_bus)(struct i2c_adapter *, unsigned int flags);
 };
 #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)

@@ -567,8 +571,44 @@ i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter)
 int i2c_for_each_dev(void *data, int (*fn)(struct device *, void *));

 /* Adapter locking functions, exported for shared pin cases */
-void i2c_lock_adapter(struct i2c_adapter *);
-void i2c_unlock_adapter(struct i2c_adapter *);
+#define I2C_LOCK_ROOT_ADAPTER BIT(0)
+#define I2C_LOCK_SEGMENT      BIT(1)
+
+/**
+ * i2c_lock_bus - Get exclusive access to an I2C bus segment
+ * @adapter: Target I2C bus segment
+ * @flags: I2C_LOCK_ROOT_ADAPTER locks the root i2c adapter, I2C_LOCK_SEGMENT
+ *	locks only this branch in the adapter tree
+ */
+static inline void
+i2c_lock_bus(struct i2c_adapter *adapter, unsigned int flags)
+{
+	adapter->lock_bus(adapter, flags);
+}
+
+/**
+ * i2c_unlock_bus - Release exclusive access to an I2C bus segment
+ * @adapter: Target I2C bus segment
+ * @flags: I2C_LOCK_ROOT_ADAPTER unlocks the root i2c adapter, I2C_LOCK_SEGMENT
+ *	unlocks only this branch in the adapter tree
+ */
+static inline void
+i2c_unlock_bus(struct i2c_adapter *adapter, unsigned int flags)
+{
+	adapter->unlock_bus(adapter, flags);
+}
+
+static inline void
+i2c_lock_adapter(struct i2c_adapter *adapter)
+{
+	i2c_lock_bus(adapter, I2C_LOCK_ROOT_ADAPTER);
+}
+
+static inline void
+i2c_unlock_adapter(struct i2c_adapter *adapter)
+{
+	i2c_unlock_bus(adapter, I2C_LOCK_ROOT_ADAPTER);
+}

 /*flags for the client struct: */
 #define I2C_CLIENT_PEC	0x04	/* Use Packet Error Checking */
--
cgit v1.2.3


From 6ef91fcca8a8ba3df9810a4cc6cd6a9d3f21bf45 Mon Sep 17 00:00:00 2001
From: Peter Rosin
Date: Wed, 4 May 2016 22:15:29 +0200
Subject: i2c: mux: relax locking of the top i2c adapter during mux-locked muxing

With an i2c topology like the following

GPIO ---| ------ BAT1 | v / I2C
-----+----------+---- MUX | \ EEPROM ------ BAT2 there is a locking problem with the GPIO controller since it is a client on the same i2c bus that it muxes. Transfers to the mux clients (e.g. BAT1) will lock the whole i2c bus prior to attempting to switch the mux to the correct i2c segment. In the above case, the GPIO device is an I/O expander with an i2c interface, and since the GPIO subsystem knows nothing (and rightfully so) about the lockless needs of the i2c mux code, this results in a deadlock when the GPIO driver issues i2c transfers to modify the mux. So, observing that while it is needed to have the i2c bus locked during the actual MUX update in order to avoid random garbage on the slave side, it is not strictly a must to have it locked over the whole sequence of a full select-transfer-deselect mux client operation. The mux itself needs to be locked, so transfers to clients behind the mux are serialized, and the mux needs to be stable during all i2c traffic (otherwise individual mux slave segments might see garbage, or worse). Introduce this new locking concept as "mux-locked" muxes, and call the pre-existing mux locking scheme "parent-locked". Modify the i2c mux locking so that muxes that are "mux-locked" locks only the muxes on the parent adapter instead of the whole i2c bus when there is a transfer to the slave side of the mux. This lock serializes transfers to the slave side of the muxes on the parent adapter. Add code to i2c-mux-gpio and i2c-mux-pinctrl that checks if all involved gpio/pinctrl devices have a parent that is an i2c adapter in the same adapter tree that is muxed, and request a "mux-locked mux" if that is the case. Modify the select-transfer-deselect code for "mux-locked" muxes so that each of the select-transfer-deselect ops locks the mux parent adapter individually. Signed-off-by: Peter Rosin Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core.c | 1 + drivers/i2c/i2c-mux.c | 152 +++++++++++++++++++++++++++++++++--- drivers/i2c/muxes/i2c-mux-gpio.c | 18 +++++ drivers/i2c/muxes/i2c-mux-pinctrl.c | 38 +++++++++ include/linux/i2c-mux.h | 8 ++ include/linux/i2c.h | 1 + 6 files changed, 205 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index afdee66002db..9da446162529 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -1540,6 +1540,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap) } rt_mutex_init(&adap->bus_lock); + rt_mutex_init(&adap->mux_lock); mutex_init(&adap->userspace_clients_lock); INIT_LIST_HEAD(&adap->userspace_clients); diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c index 5fa8af715e24..8eee98634cda 100644 --- a/drivers/i2c/i2c-mux.c +++ b/drivers/i2c/i2c-mux.c @@ -35,6 +35,25 @@ struct i2c_mux_priv { u32 chan_id; }; +static int __i2c_mux_master_xfer(struct i2c_adapter *adap, + struct i2c_msg msgs[], int num) +{ + struct i2c_mux_priv *priv = adap->algo_data; + struct i2c_mux_core *muxc = priv->muxc; + struct i2c_adapter *parent = muxc->parent; + int ret; + + /* Switch to the right mux port and perform the transfer. 
*/ + + ret = muxc->select(muxc, priv->chan_id); + if (ret >= 0) + ret = __i2c_transfer(parent, msgs, num); + if (muxc->deselect) + muxc->deselect(muxc, priv->chan_id); + + return ret; +} + static int i2c_mux_master_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num) { @@ -47,7 +66,29 @@ static int i2c_mux_master_xfer(struct i2c_adapter *adap, ret = muxc->select(muxc, priv->chan_id); if (ret >= 0) - ret = __i2c_transfer(parent, msgs, num); + ret = i2c_transfer(parent, msgs, num); + if (muxc->deselect) + muxc->deselect(muxc, priv->chan_id); + + return ret; +} + +static int __i2c_mux_smbus_xfer(struct i2c_adapter *adap, + u16 addr, unsigned short flags, + char read_write, u8 command, + int size, union i2c_smbus_data *data) +{ + struct i2c_mux_priv *priv = adap->algo_data; + struct i2c_mux_core *muxc = priv->muxc; + struct i2c_adapter *parent = muxc->parent; + int ret; + + /* Select the right mux port and perform the transfer. */ + + ret = muxc->select(muxc, priv->chan_id); + if (ret >= 0) + ret = parent->algo->smbus_xfer(parent, addr, flags, + read_write, command, size, data); if (muxc->deselect) muxc->deselect(muxc, priv->chan_id); @@ -68,8 +109,8 @@ static int i2c_mux_smbus_xfer(struct i2c_adapter *adap, ret = muxc->select(muxc, priv->chan_id); if (ret >= 0) - ret = parent->algo->smbus_xfer(parent, addr, flags, - read_write, command, size, data); + ret = i2c_smbus_xfer(parent, addr, flags, + read_write, command, size, data); if (muxc->deselect) muxc->deselect(muxc, priv->chan_id); @@ -98,13 +139,50 @@ static unsigned int i2c_mux_parent_classes(struct i2c_adapter *parent) return class; } +static void i2c_mux_lock_bus(struct i2c_adapter *adapter, unsigned int flags) +{ + struct i2c_mux_priv *priv = adapter->algo_data; + struct i2c_adapter *parent = priv->muxc->parent; + + rt_mutex_lock(&parent->mux_lock); + if (!(flags & I2C_LOCK_ROOT_ADAPTER)) + return; + i2c_lock_bus(parent, flags); +} + +static int i2c_mux_trylock_bus(struct i2c_adapter *adapter, unsigned int flags) +{ + struct i2c_mux_priv *priv = adapter->algo_data; + struct i2c_adapter *parent = priv->muxc->parent; + + if (!rt_mutex_trylock(&parent->mux_lock)) + return 0; /* mux_lock not locked, failure */ + if (!(flags & I2C_LOCK_ROOT_ADAPTER)) + return 1; /* we only want mux_lock, success */ + if (parent->trylock_bus(parent, flags)) + return 1; /* parent locked too, success */ + rt_mutex_unlock(&parent->mux_lock); + return 0; /* parent not locked, failure */ +} + +static void i2c_mux_unlock_bus(struct i2c_adapter *adapter, unsigned int flags) +{ + struct i2c_mux_priv *priv = adapter->algo_data; + struct i2c_adapter *parent = priv->muxc->parent; + + if (flags & I2C_LOCK_ROOT_ADAPTER) + i2c_unlock_bus(parent, flags); + rt_mutex_unlock(&parent->mux_lock); +} + static void i2c_parent_lock_bus(struct i2c_adapter *adapter, unsigned int flags) { struct i2c_mux_priv *priv = adapter->algo_data; struct i2c_adapter *parent = priv->muxc->parent; - parent->lock_bus(parent, flags); + rt_mutex_lock(&parent->mux_lock); + i2c_lock_bus(parent, flags); } static int i2c_parent_trylock_bus(struct i2c_adapter *adapter, @@ -113,7 +191,12 @@ static int i2c_parent_trylock_bus(struct i2c_adapter *adapter, struct i2c_mux_priv *priv = adapter->algo_data; struct i2c_adapter *parent = priv->muxc->parent; - return parent->trylock_bus(parent, flags); + if (!rt_mutex_trylock(&parent->mux_lock)) + return 0; /* mux_lock not locked, failure */ + if (parent->trylock_bus(parent, flags)) + return 1; /* parent locked too, success */ + 
rt_mutex_unlock(&parent->mux_lock); + return 0; /* parent not locked, failure */ } static void i2c_parent_unlock_bus(struct i2c_adapter *adapter, @@ -122,9 +205,36 @@ static void i2c_parent_unlock_bus(struct i2c_adapter *adapter, struct i2c_mux_priv *priv = adapter->algo_data; struct i2c_adapter *parent = priv->muxc->parent; - parent->unlock_bus(parent, flags); + i2c_unlock_bus(parent, flags); + rt_mutex_unlock(&parent->mux_lock); } +struct i2c_adapter *i2c_root_adapter(struct device *dev) +{ + struct device *i2c; + struct i2c_adapter *i2c_root; + + /* + * Walk up the device tree to find an i2c adapter, indicating + * that this is an i2c client device. Check all ancestors to + * handle mfd devices etc. + */ + for (i2c = dev; i2c; i2c = i2c->parent) { + if (i2c->type == &i2c_adapter_type) + break; + } + if (!i2c) + return NULL; + + /* Continue up the tree to find the root i2c adapter */ + i2c_root = to_i2c_adapter(i2c); + while (i2c_parent_is_i2c_adapter(i2c_root)) + i2c_root = i2c_parent_is_i2c_adapter(i2c_root); + + return i2c_root; +} +EXPORT_SYMBOL_GPL(i2c_root_adapter); + struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent, struct device *dev, int max_adapters, int sizeof_priv, u32 flags, @@ -143,6 +253,8 @@ struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent, muxc->parent = parent; muxc->dev = dev; + if (flags & I2C_MUX_LOCKED) + muxc->mux_locked = true; muxc->select = select; muxc->deselect = deselect; muxc->max_adapters = max_adapters; @@ -176,10 +288,18 @@ int i2c_mux_add_adapter(struct i2c_mux_core *muxc, /* Need to do algo dynamically because we don't know ahead * of time what sort of physical adapter we'll be dealing with. */ - if (parent->algo->master_xfer) - priv->algo.master_xfer = i2c_mux_master_xfer; - if (parent->algo->smbus_xfer) - priv->algo.smbus_xfer = i2c_mux_smbus_xfer; + if (parent->algo->master_xfer) { + if (muxc->mux_locked) + priv->algo.master_xfer = i2c_mux_master_xfer; + else + priv->algo.master_xfer = __i2c_mux_master_xfer; + } + if (parent->algo->smbus_xfer) { + if (muxc->mux_locked) + priv->algo.smbus_xfer = i2c_mux_smbus_xfer; + else + priv->algo.smbus_xfer = __i2c_mux_smbus_xfer; + } priv->algo.functionality = i2c_mux_functionality; /* Now fill out new adapter structure */ @@ -192,9 +312,15 @@ int i2c_mux_add_adapter(struct i2c_mux_core *muxc, priv->adap.retries = parent->retries; priv->adap.timeout = parent->timeout; priv->adap.quirks = parent->quirks; - priv->adap.lock_bus = i2c_parent_lock_bus; - priv->adap.trylock_bus = i2c_parent_trylock_bus; - priv->adap.unlock_bus = i2c_parent_unlock_bus; + if (muxc->mux_locked) { + priv->adap.lock_bus = i2c_mux_lock_bus; + priv->adap.trylock_bus = i2c_mux_trylock_bus; + priv->adap.unlock_bus = i2c_mux_unlock_bus; + } else { + priv->adap.lock_bus = i2c_parent_lock_bus; + priv->adap.trylock_bus = i2c_parent_trylock_bus; + priv->adap.unlock_bus = i2c_parent_unlock_bus; + } /* Sanity check on class */ if (i2c_mux_parent_classes(parent) & class) diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c index f6270ee934f9..e5cf26eefa97 100644 --- a/drivers/i2c/muxes/i2c-mux-gpio.c +++ b/drivers/i2c/muxes/i2c-mux-gpio.c @@ -15,6 +15,7 @@ #include #include #include +#include "../../gpio/gpiolib.h" #include struct gpiomux { @@ -137,6 +138,7 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) struct i2c_mux_core *muxc; struct gpiomux *mux; struct i2c_adapter *parent; + struct i2c_adapter *root; unsigned initial_state, gpio_base; int i, ret; @@ -184,6 +186,9 @@ static int 
i2c_mux_gpio_probe(struct platform_device *pdev) platform_set_drvdata(pdev, muxc); + root = i2c_root_adapter(&parent->dev); + + muxc->mux_locked = true; mux->gpio_base = gpio_base; if (mux->data.idle != I2C_MUX_GPIO_NO_IDLE) { @@ -194,6 +199,9 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) } for (i = 0; i < mux->data.n_gpios; i++) { + struct device *gpio_dev; + struct gpio_desc *gpio_desc; + ret = gpio_request(gpio_base + mux->data.gpios[i], "i2c-mux-gpio"); if (ret) { dev_err(&pdev->dev, "Failed to request GPIO %d\n", @@ -210,8 +218,18 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) i++; /* gpio_request above succeeded, so must free */ goto err_request_gpio; } + + if (!muxc->mux_locked) + continue; + + gpio_desc = gpio_to_desc(gpio_base + mux->data.gpios[i]); + gpio_dev = &gpio_desc->gdev->dev; + muxc->mux_locked = i2c_root_adapter(gpio_dev) == root; } + if (muxc->mux_locked) + dev_info(&pdev->dev, "mux-locked i2c mux\n"); + for (i = 0; i < mux->data.n_values; i++) { u32 nr = mux->data.base_nr ? (mux->data.base_nr + i) : 0; unsigned int class = mux->data.classes ? mux->data.classes[i] : 0; diff --git a/drivers/i2c/muxes/i2c-mux-pinctrl.c b/drivers/i2c/muxes/i2c-mux-pinctrl.c index f4e62f4a50cc..35bb775e1b74 100644 --- a/drivers/i2c/muxes/i2c-mux-pinctrl.c +++ b/drivers/i2c/muxes/i2c-mux-pinctrl.c @@ -24,6 +24,7 @@ #include #include #include +#include "../../pinctrl/core.h" struct i2c_mux_pinctrl { struct i2c_mux_pinctrl_platform_data *pdata; @@ -120,10 +121,31 @@ static inline int i2c_mux_pinctrl_parse_dt(struct i2c_mux_pinctrl *mux, } #endif +static struct i2c_adapter *i2c_mux_pinctrl_root_adapter( + struct pinctrl_state *state) +{ + struct i2c_adapter *root = NULL; + struct pinctrl_setting *setting; + struct i2c_adapter *pin_root; + + list_for_each_entry(setting, &state->settings, node) { + pin_root = i2c_root_adapter(setting->pctldev->dev); + if (!pin_root) + return NULL; + if (!root) + root = pin_root; + else if (root != pin_root) + return NULL; + } + + return root; +} + static int i2c_mux_pinctrl_probe(struct platform_device *pdev) { struct i2c_mux_core *muxc; struct i2c_mux_pinctrl *mux; + struct i2c_adapter *root; int i, ret; mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL); @@ -202,6 +224,22 @@ static int i2c_mux_pinctrl_probe(struct platform_device *pdev) goto err; } + root = i2c_root_adapter(&muxc->parent->dev); + + muxc->mux_locked = true; + for (i = 0; i < mux->pdata->bus_count; i++) { + if (root != i2c_mux_pinctrl_root_adapter(mux->states[i])) { + muxc->mux_locked = false; + break; + } + } + if (muxc->mux_locked && mux->pdata->pinctrl_state_idle && + root != i2c_mux_pinctrl_root_adapter(mux->state_idle)) + muxc->mux_locked = false; + + if (muxc->mux_locked) + dev_info(&pdev->dev, "mux-locked i2c mux\n"); + for (i = 0; i < mux->pdata->bus_count; i++) { u32 bus = mux->pdata->base_bus_num ? 
(mux->pdata->base_bus_num + i) : 0;
diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h
index 2fa93fe1345e..d4c1d12f900d 100644
--- a/include/linux/i2c-mux.h
+++ b/include/linux/i2c-mux.h
@@ -27,9 +27,12 @@

 #ifdef __KERNEL__

+#include <linux/bitops.h>
+
 struct i2c_mux_core {
 	struct i2c_adapter *parent;
 	struct device *dev;
+	bool mux_locked;

 	void *priv;

@@ -47,11 +50,16 @@ struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent,
 				   int (*select)(struct i2c_mux_core *, u32),
 				   int (*deselect)(struct i2c_mux_core *, u32));

+/* flags for i2c_mux_alloc */
+#define I2C_MUX_LOCKED	BIT(0)
+
 static inline void *i2c_mux_priv(struct i2c_mux_core *muxc)
 {
 	return muxc->priv;
 }

+struct i2c_adapter *i2c_root_adapter(struct device *dev);
+
 /*
  * Called to create an i2c bus on a multiplexed bus segment.
  * The chan_id parameter is passed to the select and deselect
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 50934d6e1050..96a25ae14494 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -524,6 +524,7 @@ struct i2c_adapter {

 	/* data fields that are valid for all devices */
 	struct rt_mutex bus_lock;
+	struct rt_mutex mux_lock;

 	int timeout;			/* in jiffies */
 	int retries;
--
cgit v1.2.3


From 4d2458507d0b465c62ae80f3e81b8c008ec96b05 Mon Sep 17 00:00:00 2001
From: Fabio Estevam
Date: Wed, 4 May 2016 19:33:59 -0300
Subject: ASoC: fsl_sai: Allow setting the SAI MCLK direction

On mx6ul the General Purpose Register 1 (GPR1) contains the following
bits for configuring the direction of the SAI MCLKs:

SAI1_MCLK_DIR, SAI2_MCLK_DIR, SAI3_MCLK_DIR

Introduce the "fsl,sai-mclk-direction-output" optional property to allow
configuring the SAI_MCLK outputs.

Tested on an imx6ul-evk board.

Signed-off-by: Fabio Estevam
Acked-by: Nicolin Chen
Signed-off-by: Mark Brown
---
 Documentation/devicetree/bindings/sound/fsl-sai.txt |  5 +++++
 include/linux/mfd/syscon/imx6q-iomuxc-gpr.h         |  6 ++++++
 sound/soc/fsl/fsl_sai.c                             | 20 ++++++++++++++++++++
 3 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/sound/fsl-sai.txt b/Documentation/devicetree/bindings/sound/fsl-sai.txt
index 777b941d6cbe..740b467adf7d 100644
--- a/Documentation/devicetree/bindings/sound/fsl-sai.txt
+++ b/Documentation/devicetree/bindings/sound/fsl-sai.txt
@@ -48,6 +48,11 @@ Required properties:
 	  receive data by following their own bit clocks and frame sync clocks
 	  separately.

+Optional properties (for mx6ul):
+
+  - fsl,sai-mclk-direction-output: This is a boolean property. If present,
+    indicates that SAI will output the SAI MCLK clock.
+
 Note:
 - If both fsl,sai-asynchronous and fsl,sai-synchronous-rx are absent, the
   default synchronous mode (sync Rx with Tx) will be used, which means both
diff --git a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
index 238c8db953eb..68353822afce 100644
--- a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
+++ b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
@@ -447,5 +447,11 @@
 #define IMX6UL_GPR1_ENET2_CLK_OUTPUT		(0x1 << 18)
 #define IMX6UL_GPR1_ENET_CLK_DIR		(0x3 << 17)
 #define IMX6UL_GPR1_ENET_CLK_OUTPUT		(0x3 << 17)
+#define IMX6UL_GPR1_SAI1_MCLK_DIR	(0x1 << 19)
+#define IMX6UL_GPR1_SAI2_MCLK_DIR	(0x1 << 20)
+#define IMX6UL_GPR1_SAI3_MCLK_DIR	(0x1 << 21)
+#define IMX6UL_GPR1_SAI_MCLK_MASK	(0x7 << 19)
+#define MCLK_DIR(x) (x == 1 ? IMX6UL_GPR1_SAI1_MCLK_DIR : x == 2 ?
\ + IMX6UL_GPR1_SAI2_MCLK_DIR : IMX6UL_GPR1_SAI3_MCLK_DIR) #endif /* __LINUX_IMX6Q_IOMUXC_GPR_H */ diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index d8b673f7c577..2147994ab46f 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include "fsl_sai.h" #include "imx-pcm.h" @@ -786,10 +788,12 @@ static int fsl_sai_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct fsl_sai *sai; + struct regmap *gpr; struct resource *res; void __iomem *base; char tmp[8]; int irq, ret, i; + int index; sai = devm_kzalloc(&pdev->dev, sizeof(*sai), GFP_KERNEL); if (!sai) @@ -878,6 +882,22 @@ static int fsl_sai_probe(struct platform_device *pdev) fsl_sai_dai.symmetric_samplebits = 0; } + if (of_find_property(np, "fsl,sai-mclk-direction-output", NULL) && + of_device_is_compatible(pdev->dev.of_node, "fsl,imx6ul-sai")) { + gpr = syscon_regmap_lookup_by_compatible("fsl,imx6ul-iomuxc-gpr"); + if (IS_ERR(gpr)) { + dev_err(&pdev->dev, "cannot find iomuxc registers\n"); + return PTR_ERR(gpr); + } + + index = of_alias_get_id(np, "sai"); + if (index < 0) + return index; + + regmap_update_bits(gpr, IOMUXC_GPR1, MCLK_DIR(index), + MCLK_DIR(index)); + } + sai->dma_params_rx.addr = res->start + FSL_SAI_RDR; sai->dma_params_tx.addr = res->start + FSL_SAI_TDR; sai->dma_params_rx.maxburst = FSL_SAI_MAXBURST_RX; -- cgit v1.2.3 From e5b2d30e42e66122c9b1b17529df743bc938c041 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 3 Feb 2016 19:58:11 +0100 Subject: mtd: nand: sharpsl: switch to mtd_ooblayout_ops Implementing the mtd_ooblayout_ops interface is the new way of exposing ECC/OOB layout to MTD users. Signed-off-by: Boris Brezillon --- arch/arm/mach-pxa/spitz.c | 55 ++++++++++++++++++++++++++++++++++++--------- drivers/mtd/nand/sharpsl.c | 2 +- include/linux/mtd/sharpsl.h | 2 +- 3 files changed, 47 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index d9578bc49fdc..bd7cd8b6a286 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -763,14 +763,49 @@ static struct nand_bbt_descr spitz_nand_bbt = { .pattern = scan_ff_pattern }; -static struct nand_ecclayout akita_oobinfo = { - .oobfree = { {0x08, 0x09} }, - .eccbytes = 24, - .eccpos = { - 0x05, 0x01, 0x02, 0x03, 0x06, 0x07, 0x15, 0x11, - 0x12, 0x13, 0x16, 0x17, 0x25, 0x21, 0x22, 0x23, - 0x26, 0x27, 0x35, 0x31, 0x32, 0x33, 0x36, 0x37, - }, +static int akita_ooblayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + if (section > 12) + return -ERANGE; + + switch (section % 3) { + case 0: + oobregion->offset = 5; + oobregion->length = 1; + break; + + case 1: + oobregion->offset = 1; + oobregion->length = 3; + break; + + case 2: + oobregion->offset = 6; + oobregion->length = 2; + break; + } + + oobregion->offset += (section / 3) * 0x10; + + return 0; +} + +static int akita_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + if (section) + return -ERANGE; + + oobregion->offset = 8; + oobregion->length = 9; + + return 0; +} + +static const struct mtd_ooblayout_ops akita_ooblayout_ops = { + .ecc = akita_ooblayout_ecc, + .free = akita_ooblayout_free, }; static struct sharpsl_nand_platform_data spitz_nand_pdata = { @@ -804,11 +839,11 @@ static void __init spitz_nand_init(void) } else if (machine_is_akita()) { spitz_nand_partitions[1].size = 58 * 1024 * 1024; spitz_nand_bbt.len = 1; - 
spitz_nand_pdata.ecc_layout = &akita_oobinfo; + spitz_nand_pdata.ecc_layout = &akita_ooblayout_ops; } else if (machine_is_borzoi()) { spitz_nand_partitions[1].size = 32 * 1024 * 1024; spitz_nand_bbt.len = 1; - spitz_nand_pdata.ecc_layout = &akita_oobinfo; + spitz_nand_pdata.ecc_layout = &akita_ooblayout_ops; } platform_device_register(&spitz_nand_device); diff --git a/drivers/mtd/nand/sharpsl.c b/drivers/mtd/nand/sharpsl.c index b7d1b55a160b..064ca1757589 100644 --- a/drivers/mtd/nand/sharpsl.c +++ b/drivers/mtd/nand/sharpsl.c @@ -148,6 +148,7 @@ static int sharpsl_nand_probe(struct platform_device *pdev) /* Link the private data with the MTD structure */ mtd = nand_to_mtd(this); mtd->dev.parent = &pdev->dev; + mtd_set_ooblayout(mtd, data->ecc_layout); platform_set_drvdata(pdev, sharpsl); @@ -170,7 +171,6 @@ static int sharpsl_nand_probe(struct platform_device *pdev) this->ecc.bytes = 3; this->ecc.strength = 1; this->badblock_pattern = data->badblock_pattern; - this->ecc.layout = data->ecc_layout; this->ecc.hwctl = sharpsl_nand_enable_hwecc; this->ecc.calculate = sharpsl_nand_calculate_ecc; this->ecc.correct = nand_correct_data; diff --git a/include/linux/mtd/sharpsl.h b/include/linux/mtd/sharpsl.h index 25f4d2a845c1..65e91d0fa981 100644 --- a/include/linux/mtd/sharpsl.h +++ b/include/linux/mtd/sharpsl.h @@ -14,7 +14,7 @@ struct sharpsl_nand_platform_data { struct nand_bbt_descr *badblock_pattern; - struct nand_ecclayout *ecc_layout; + const struct mtd_ooblayout_ops *ecc_layout; struct mtd_partition *partitions; unsigned int nr_partitions; }; -- cgit v1.2.3 From 04a123a99f089773fc6ee6e21af5f831d87fe362 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 9 Feb 2016 15:01:21 +0100 Subject: mtd: nand: fsmc: get rid of the fsmc_nand_eccplace struct Now that mtd_ooblayout_ecc() returns the ECC byte position using the OOB free method, we can get rid of the fsmc_nand_eccplace struct. Signed-off-by: Boris Brezillon --- drivers/mtd/nand/fsmc_nand.c | 60 +++++++++++--------------------------------- include/linux/mtd/fsmc.h | 18 ------------- 2 files changed, 15 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/fsmc_nand.c b/drivers/mtd/nand/fsmc_nand.c index 275a98ca4f6a..13720405ec81 100644 --- a/drivers/mtd/nand/fsmc_nand.c +++ b/drivers/mtd/nand/fsmc_nand.c @@ -39,35 +39,6 @@ #include #include -/* - * ECC placement definitions in oobfree type format. - * There are 13 bytes of ecc for every 512 byte block and it has to be read - * consecutively and immediately after the 512 byte data block for hardware to - * generate the error bit offsets in 512 byte data. - * Managing the ecc bytes in the following way makes it easier for software to - * read ecc bytes consecutive to data bytes. 
This way is similar to - * oobfree structure maintained already in generic nand driver - */ -static struct fsmc_eccplace fsmc_ecc4_lp_place = { - .eccplace = { - {.offset = 2, .length = 13}, - {.offset = 18, .length = 13}, - {.offset = 34, .length = 13}, - {.offset = 50, .length = 13}, - {.offset = 66, .length = 13}, - {.offset = 82, .length = 13}, - {.offset = 98, .length = 13}, - {.offset = 114, .length = 13} - } -}; - -static struct fsmc_eccplace fsmc_ecc4_sp_place = { - .eccplace = { - {.offset = 0, .length = 4}, - {.offset = 6, .length = 9} - } -}; - static int fsmc_ecc1_ooblayout_ecc(struct mtd_info *mtd, int section, struct mtd_oob_region *oobregion) { @@ -105,6 +76,12 @@ static const struct mtd_ooblayout_ops fsmc_ecc1_ooblayout_ops = { .free = fsmc_ecc1_ooblayout_free, }; +/* + * ECC placement definitions in oobfree type format. + * There are 13 bytes of ecc for every 512 byte block and it has to be read + * consecutively and immediately after the 512 byte data block for hardware to + * generate the error bit offsets in 512 byte data. + */ static int fsmc_ecc4_ooblayout_ecc(struct mtd_info *mtd, int section, struct mtd_oob_region *oobregion) { @@ -155,7 +132,6 @@ static const struct mtd_ooblayout_ops fsmc_ecc4_ooblayout_ops = { * @partitions: Partition info for a NAND Flash. * @nr_partitions: Total number of partition of a NAND flash. * - * @ecc_place: ECC placing locations in oobfree type format. * @bank: Bank number for probed device. * @clk: Clock structure for FSMC. * @@ -175,7 +151,6 @@ struct fsmc_nand_data { struct mtd_partition *partitions; unsigned int nr_partitions; - struct fsmc_eccplace *ecc_place; unsigned int bank; struct device *dev; enum access_mode mode; @@ -582,8 +557,6 @@ static void fsmc_write_buf_dma(struct mtd_info *mtd, const uint8_t *buf, static int fsmc_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip, uint8_t *buf, int oob_required, int page) { - struct fsmc_nand_data *host = mtd_to_fsmc(mtd); - struct fsmc_eccplace *ecc_place = host->ecc_place; int i, j, s, stat, eccsize = chip->ecc.size; int eccbytes = chip->ecc.bytes; int eccsteps = chip->ecc.steps; @@ -606,9 +579,15 @@ static int fsmc_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip, chip->read_buf(mtd, p, eccsize); for (j = 0; j < eccbytes;) { - off = ecc_place->eccplace[group].offset; - len = ecc_place->eccplace[group].length; - group++; + struct mtd_oob_region oobregion; + int ret; + + ret = mtd_ooblayout_ecc(mtd, group++, &oobregion); + if (ret) + return ret; + + off = oobregion.offset; + len = oobregion.length; /* * length is intentionally kept a higher multiple of 2 @@ -956,19 +935,10 @@ static int __init fsmc_nand_probe(struct platform_device *pdev) if (AMBA_REV_BITS(host->pid) >= 8) { switch (mtd->oobsize) { case 16: - host->ecc_place = &fsmc_ecc4_sp_place; - break; case 64: - host->ecc_place = &fsmc_ecc4_lp_place; - break; case 128: - host->ecc_place = &fsmc_ecc4_lp_place; - break; case 224: - host->ecc_place = &fsmc_ecc4_lp_place; - break; case 256: - host->ecc_place = &fsmc_ecc4_lp_place; break; default: dev_warn(&pdev->dev, "No oob scheme defined for oobsize %d\n", diff --git a/include/linux/mtd/fsmc.h b/include/linux/mtd/fsmc.h index c8be32e9fc49..ad3c3488073c 100644 --- a/include/linux/mtd/fsmc.h +++ b/include/linux/mtd/fsmc.h @@ -103,24 +103,6 @@ #define FSMC_BUSY_WAIT_TIMEOUT (1 * HZ) -/* - * There are 13 bytes of ecc for every 512 byte block in FSMC version 8 - * and it has to be read consecutively and immediately after the 512 - * byte data block for hardware 
to generate the error bit offsets - * Managing the ecc bytes in the following way is easier. This way is - * similar to oobfree structure maintained already in u-boot nand driver - */ -#define MAX_ECCPLACE_ENTRIES 32 - -struct fsmc_nand_eccplace { - uint8_t offset; - uint8_t length; -}; - -struct fsmc_eccplace { - struct fsmc_nand_eccplace eccplace[MAX_ECCPLACE_ENTRIES]; -}; - struct fsmc_nand_timings { uint8_t tclr; uint8_t tar; -- cgit v1.2.3 From a411679fb5fd7ee2df64a55c23c81538ceeb6d06 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 7 Dec 2015 22:46:45 +0100 Subject: mtd: onenand: switch to mtd_ooblayout_ops Implementing the mtd_ooblayout_ops interface is the new way of exposing ECC/OOB layout to MTD users. Modify the onenand drivers to switch to this approach. Signed-off-by: Boris Brezillon --- drivers/mtd/onenand/onenand_base.c | 162 ++++++++++++++++++++++--------------- include/linux/mtd/onenand.h | 2 - 2 files changed, 97 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index d0fa505d40bd..a4b029a417f0 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -68,21 +68,33 @@ MODULE_PARM_DESC(otp, "Corresponding behaviour of OneNAND in OTP" * flexonenand_oob_128 - oob info for Flex-Onenand with 4KB page * For now, we expose only 64 out of 80 ecc bytes */ -static struct nand_ecclayout flexonenand_oob_128 = { - .eccbytes = 64, - .eccpos = { - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, - 102, 103, 104, 105 - }, - .oobfree = { - {2, 4}, {18, 4}, {34, 4}, {50, 4}, - {66, 4}, {82, 4}, {98, 4}, {114, 4} - } +static int flexonenand_ooblayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + if (section > 7) + return -ERANGE; + + oobregion->offset = (section * 16) + 6; + oobregion->length = 10; + + return 0; +} + +static int flexonenand_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + if (section > 7) + return -ERANGE; + + oobregion->offset = (section * 16) + 2; + oobregion->length = 4; + + return 0; +} + +static const struct mtd_ooblayout_ops flexonenand_ooblayout_ops = { + .ecc = flexonenand_ooblayout_ecc, + .free = flexonenand_ooblayout_free, }; /* @@ -91,56 +103,77 @@ static struct nand_ecclayout flexonenand_oob_128 = { * Based on specification: * 4Gb M-die OneNAND Flash (KFM4G16Q4M, KFN8G16Q4M). Rev. 1.3, Apr. 
2010 * - * For eccpos we expose only 64 bytes out of 72 (see struct nand_ecclayout) - * - * oobfree uses the spare area fields marked as - * "Managed by internal ECC logic for Logical Sector Number area" - */ -static struct nand_ecclayout onenand_oob_128 = { - .eccbytes = 64, - .eccpos = { - 7, 8, 9, 10, 11, 12, 13, 14, 15, - 23, 24, 25, 26, 27, 28, 29, 30, 31, - 39, 40, 41, 42, 43, 44, 45, 46, 47, - 55, 56, 57, 58, 59, 60, 61, 62, 63, - 71, 72, 73, 74, 75, 76, 77, 78, 79, - 87, 88, 89, 90, 91, 92, 93, 94, 95, - 103, 104, 105, 106, 107, 108, 109, 110, 111, - 119 - }, - .oobfree = { - {2, 3}, {18, 3}, {34, 3}, {50, 3}, - {66, 3}, {82, 3}, {98, 3}, {114, 3} - } + */ +static int onenand_ooblayout_128_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + if (section > 7) + return -ERANGE; + + oobregion->offset = (section * 16) + 7; + oobregion->length = 9; + + return 0; +} + +static int onenand_ooblayout_128_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + if (section >= 8) + return -ERANGE; + + /* + * free bytes are using the spare area fields marked as + * "Managed by internal ECC logic for Logical Sector Number area" + */ + oobregion->offset = (section * 16) + 2; + oobregion->length = 3; + + return 0; +} + +static const struct mtd_ooblayout_ops onenand_oob_128_ooblayout_ops = { + .ecc = onenand_ooblayout_128_ecc, + .free = onenand_ooblayout_128_free, }; /** - * onenand_oob_64 - oob info for large (2KB) page + * onenand_oob_32_64 - oob info for large (2KB) page */ -static struct nand_ecclayout onenand_oob_64 = { - .eccbytes = 20, - .eccpos = { - 8, 9, 10, 11, 12, - 24, 25, 26, 27, 28, - 40, 41, 42, 43, 44, - 56, 57, 58, 59, 60, - }, - .oobfree = { - {2, 3}, {14, 2}, {18, 3}, {30, 2}, - {34, 3}, {46, 2}, {50, 3}, {62, 2} +static int onenand_ooblayout_32_64_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + if (section > 3) + return -ERANGE; + + oobregion->offset = (section * 16) + 8; + oobregion->length = 5; + + return 0; +} + +static int onenand_ooblayout_32_64_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + int sections = (mtd->oobsize / 32) * 2; + + if (section >= sections) + return -ERANGE; + + if (section & 1) { + oobregion->offset = ((section - 1) * 16) + 14; + oobregion->length = 2; + } else { + oobregion->offset = (section * 16) + 2; + oobregion->length = 3; } -}; -/** - * onenand_oob_32 - oob info for middle (1KB) page - */ -static struct nand_ecclayout onenand_oob_32 = { - .eccbytes = 10, - .eccpos = { - 8, 9, 10, 11, 12, - 24, 25, 26, 27, 28, - }, - .oobfree = { {2, 3}, {14, 2}, {18, 3}, {30, 2} } + return 0; +} + +static const struct mtd_ooblayout_ops onenand_oob_32_64_ooblayout_ops = { + .ecc = onenand_ooblayout_32_64_ecc, + .free = onenand_ooblayout_32_64_free, }; static const unsigned char ffchars[] = { @@ -3957,22 +3990,22 @@ int onenand_scan(struct mtd_info *mtd, int maxchips) switch (mtd->oobsize) { case 128: if (FLEXONENAND(this)) { - this->ecclayout = &flexonenand_oob_128; + mtd_set_ooblayout(mtd, &flexonenand_ooblayout_ops); mtd->subpage_sft = 0; } else { - this->ecclayout = &onenand_oob_128; + mtd_set_ooblayout(mtd, &onenand_oob_128_ooblayout_ops); mtd->subpage_sft = 2; } if (ONENAND_IS_NOP_1(this)) mtd->subpage_sft = 0; break; case 64: - this->ecclayout = &onenand_oob_64; + mtd_set_ooblayout(mtd, &onenand_oob_32_64_ooblayout_ops); mtd->subpage_sft = 2; break; case 32: - this->ecclayout = &onenand_oob_32; + mtd_set_ooblayout(mtd, 
&onenand_oob_32_64_ooblayout_ops); mtd->subpage_sft = 1; break; @@ -3981,7 +4014,7 @@ int onenand_scan(struct mtd_info *mtd, int maxchips) __func__, mtd->oobsize); mtd->subpage_sft = 0; /* To prevent kernel oops */ - this->ecclayout = &onenand_oob_32; + mtd_set_ooblayout(mtd, &onenand_oob_32_64_ooblayout_ops); break; } @@ -3997,7 +4030,6 @@ int onenand_scan(struct mtd_info *mtd, int maxchips) mtd->oobavail = ret; - mtd_set_ecclayout(mtd, this->ecclayout); mtd->ecc_strength = 1; /* Fill in remaining MTD driver data */ diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index 4596503c9da9..0aaa98b219a4 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -80,7 +80,6 @@ struct onenand_bufferram { * @page_buf: [INTERN] page main data buffer * @oob_buf: [INTERN] page oob data buffer * @subpagesize: [INTERN] holds the subpagesize - * @ecclayout: [REPLACEABLE] the default ecc placement scheme * @bbm: [REPLACEABLE] pointer to Bad Block Management * @priv: [OPTIONAL] pointer to private chip date */ @@ -134,7 +133,6 @@ struct onenand_chip { #endif int subpagesize; - struct nand_ecclayout *ecclayout; void *bbm; -- cgit v1.2.3 From 7f2b092c9eeda055ae60af194a8edacaea5f7a10 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 3 Feb 2016 19:24:10 +0100 Subject: mtd: nand: kill the ecc->layout field Now that all NAND drivers have switched to mtd_ooblayout_ops, we can kill the ecc->layout field. Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 7 ------- drivers/mtd/nand/nand_bch.c | 8 -------- include/linux/mtd/nand.h | 2 -- 3 files changed, 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 6c84a40dbddb..0001be19b90c 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -4151,13 +4151,6 @@ int nand_scan_tail(struct mtd_info *mtd) /* Set the internal oob buffer location, just after the page data */ chip->oob_poi = chip->buffers->databuf + mtd->writesize; - /* - * Set the provided ECC layout. If ecc->layout is NULL, the MTD core - * will just leave mtd->ooblayout to NULL, if it's not NULL, it will - * set ->ooblayout to the default ecclayout wrapper. - */ - mtd_set_ecclayout(mtd, ecc->layout); - /* * If no default placement scheme is given, select an appropriate one. */ diff --git a/drivers/mtd/nand/nand_bch.c b/drivers/mtd/nand/nand_bch.c index 1dbc418baac4..28ef7b178eac 100644 --- a/drivers/mtd/nand/nand_bch.c +++ b/drivers/mtd/nand/nand_bch.c @@ -158,14 +158,6 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) eccsteps = mtd->writesize/eccsize; - /* - * Rely on the default ecclayout to ooblayout wrapper provided by MTD - * core if ecc.layout is not NULL. - * FIXME: this should be removed when all callers have moved to the - * mtd_ooblayout_ops approach. - */ - mtd_set_ecclayout(mtd, nand->ecc.layout); - /* Check that we have an oob layout description. 
*/ if (!mtd->ooblayout) { pr_warn("missing oob scheme"); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index f2ded7b1b3b8..e851839daf09 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -473,7 +473,6 @@ struct nand_hw_control { * @prepad: padding information for syndrome based ECC generators * @postpad: padding information for syndrome based ECC generators * @options: ECC specific options (see NAND_ECC_XXX flags defined above) - * @layout: ECC layout control struct pointer * @priv: pointer to private ECC control data * @hwctl: function to control hardware ECC generator. Must only * be provided if an hardware ECC is available @@ -524,7 +523,6 @@ struct nand_ecc_ctrl { int prepad; int postpad; unsigned int options; - struct nand_ecclayout *layout; void *priv; void (*hwctl)(struct mtd_info *mtd, int mode); int (*calculate)(struct mtd_info *mtd, const uint8_t *dat, -- cgit v1.2.3 From aab616e31d1c7ec3726f7f5cbdaaec98759ebe93 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 4 Feb 2016 10:16:18 +0100 Subject: mtd: kill the nand_ecclayout struct Now that all MTD drivers have moved to the mtd_ooblayout_ops model we can safely remove the struct nand_ecclayout definition, and all the remaining places where it was still used. Signed-off-by: Boris Brezillon --- drivers/mtd/mtdchar.c | 12 ++--- drivers/mtd/mtdcore.c | 117 --------------------------------------------- include/linux/mtd/mtd.h | 20 -------- include/uapi/mtd/mtd-abi.h | 2 +- 4 files changed, 7 insertions(+), 144 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 3fad2c7425b0..2a47a3f0e730 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -465,12 +465,12 @@ static int mtdchar_readoob(struct file *file, struct mtd_info *mtd, } /* - * Copies (and truncates, if necessary) data from the larger struct, - * nand_ecclayout, to the smaller, deprecated layout struct, - * nand_ecclayout_user. This is necessary only to support the deprecated - * API ioctl ECCGETLAYOUT while allowing all new functionality to use - * nand_ecclayout flexibly (i.e. the struct may change size in new - * releases without requiring major rewrites). + * Copies (and truncates, if necessary) OOB layout information to the + * deprecated layout struct, nand_ecclayout_user. This is necessary only to + * support the deprecated API ioctl ECCGETLAYOUT while allowing all new + * functionality to use mtd_ooblayout_ops flexibly (i.e. mtd_ooblayout_ops + * can describe any kind of OOB layout with almost zero overhead from a + * memory usage point of view). */ static int shrink_ecclayout(struct mtd_info *mtd, struct nand_ecclayout_user *to) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 134ed2f7b919..e3936b847c6b 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1376,123 +1376,6 @@ int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd) } EXPORT_SYMBOL_GPL(mtd_ooblayout_count_eccbytes); -/** - * mtd_ecclayout_ecc - Default ooblayout_ecc iterator implementation - * @mtd: MTD device structure - * @section: ECC section. 
Depending on the layout you may have all the ECC - * bytes stored in a single contiguous section, or one section - * per ECC chunk (and sometime several sections for a single ECC - * ECC chunk) - * @oobecc: OOB region struct filled with the appropriate ECC position - * information - * - * This function is just a wrapper around the mtd->ecclayout field and is - * here to ease the transition to the mtd_ooblayout_ops approach. - * All it does is convert the layout->eccpos information into proper oob - * region definitions. - * - * Returns zero on success, a negative error code otherwise. - */ -static int mtd_ecclayout_ecc(struct mtd_info *mtd, int section, - struct mtd_oob_region *oobecc) -{ - int eccbyte = 0, cursection = 0, length = 0, eccpos = 0; - - if (!mtd->ecclayout) - return -ENOTSUPP; - - /* - * This logic allows us to reuse the ->ecclayout information and - * expose them as ECC regions (as done for the OOB free regions). - * - * TODO: this should be dropped as soon as we get rid of the - * ->ecclayout field. - */ - for (eccbyte = 0; eccbyte < mtd->ecclayout->eccbytes; eccbyte++) { - eccpos = mtd->ecclayout->eccpos[eccbyte]; - - if (eccbyte < mtd->ecclayout->eccbytes - 1) { - int neccpos = mtd->ecclayout->eccpos[eccbyte + 1]; - - if (eccpos + 1 == neccpos) { - length++; - continue; - } - } - - if (section == cursection) - break; - - length = 0; - cursection++; - } - - if (cursection != section || eccbyte >= mtd->ecclayout->eccbytes) - return -ERANGE; - - oobecc->length = length + 1; - oobecc->offset = eccpos - length; - - return 0; -} - -/** - * mtd_ecclayout_ecc - Default ooblayout_free iterator implementation - * @mtd: MTD device structure - * @section: Free section. Depending on the layout you may have all the free - * bytes stored in a single contiguous section, or one section - * per ECC chunk (and sometime several sections for a single ECC - * ECC chunk) - * @oobfree: OOB region struct filled with the appropriate free position - * information - * - * This function is just a wrapper around the mtd->ecclayout field and is - * here to ease the transition to the mtd_ooblayout_ops approach. - * All it does is convert the layout->oobfree information into proper oob - * region definitions. - * - * Returns zero on success, a negative error code otherwise. - */ -static int mtd_ecclayout_free(struct mtd_info *mtd, int section, - struct mtd_oob_region *oobfree) -{ - struct nand_ecclayout *layout = mtd->ecclayout; - - if (!layout) - return -ENOTSUPP; - - if (section >= MTD_MAX_OOBFREE_ENTRIES_LARGE || - !layout->oobfree[section].length) - return -ERANGE; - - oobfree->offset = layout->oobfree[section].offset; - oobfree->length = layout->oobfree[section].length; - - return 0; -} - -static const struct mtd_ooblayout_ops mtd_ecclayout_wrapper_ops = { - .ecc = mtd_ecclayout_ecc, - .free = mtd_ecclayout_free, -}; - -/** - * mtd_set_ecclayout - Attach an ecclayout to an MTD device - * @mtd: MTD device structure - * @ecclayout: The ecclayout to attach to the device - * - * Returns zero on success, a negative error code otherwise. - */ -void mtd_set_ecclayout(struct mtd_info *mtd, struct nand_ecclayout *ecclayout) -{ - if (!mtd || !ecclayout) - return; - - mtd->ecclayout = ecclayout; - mtd_set_ooblayout(mtd, &mtd_ecclayout_wrapper_ops); -} -EXPORT_SYMBOL_GPL(mtd_set_ecclayout); - /* * Method to access the protection register area, present in some flash * devices. 
The user data is one time programmable but the factory data is read diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 177bf314ad70..29a170612203 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -96,21 +96,6 @@ struct mtd_oob_ops { #define MTD_MAX_OOBFREE_ENTRIES_LARGE 32 #define MTD_MAX_ECCPOS_ENTRIES_LARGE 640 -/* - * Internal ECC layout control structure. For historical reasons, there is a - * similar, smaller struct nand_ecclayout_user (in mtd-abi.h) that is retained - * for export to user-space via the ECCGETLAYOUT ioctl. - * nand_ecclayout should be expandable in the future simply by the above macros. - * - * This structure is now deprecated, you should use struct nand_ecclayout_ops - * to describe your OOB layout. - */ -struct nand_ecclayout { - __u32 eccbytes; - __u32 eccpos[MTD_MAX_ECCPOS_ENTRIES_LARGE]; - struct nand_oobfree oobfree[MTD_MAX_OOBFREE_ENTRIES_LARGE]; -}; - /** * struct mtd_oob_region - oob region definition * @offset: region offset @@ -200,9 +185,6 @@ struct mtd_info { const char *name; int index; - /* [Deprecated] ECC layout structure pointer - read only! */ - struct nand_ecclayout *ecclayout; - /* OOB layout description */ const struct mtd_ooblayout_ops *ooblayout; @@ -308,8 +290,6 @@ int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf, int mtd_ooblayout_count_freebytes(struct mtd_info *mtd); int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd); -void mtd_set_ecclayout(struct mtd_info *mtd, struct nand_ecclayout *ecclayout); - static inline void mtd_set_ooblayout(struct mtd_info *mtd, const struct mtd_ooblayout_ops *ooblayout) { diff --git a/include/uapi/mtd/mtd-abi.h b/include/uapi/mtd/mtd-abi.h index 763bb6950402..0ec1da2ef652 100644 --- a/include/uapi/mtd/mtd-abi.h +++ b/include/uapi/mtd/mtd-abi.h @@ -228,7 +228,7 @@ struct nand_oobfree { * complete set of ECC information. The ioctl truncates the larger internal * structure to retain binary compatibility with the static declaration of the * ioctl. Note that the "MTD_MAX_..._ENTRIES" macros represent the max size of - * the user struct, not the MAX size of the internal struct nand_ecclayout. + * the user struct, not the MAX size of the internal OOB layout representation. */ struct nand_ecclayout_user { __u32 eccbytes; -- cgit v1.2.3 From d48f62b9a0a035d6c16de4a4dae315f7332a8939 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 1 Apr 2016 14:54:32 +0200 Subject: mtd: nand: move of_get_nand_xxx() helpers into nand_base.c Now that all drivers go through nand_set_flash_node() to parse the generic NAND properties, we can move all of_get_nand_xxx() helpers into nand_base.c, make them static and remove of_mtd.c and of_mtd.h.
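For illustration, the consumer side of this model can be sketched as follows (hypothetical driver code, not part of this patch; example_nand_probe and its platform wiring are invented for the example, and the controller-specific chip callbacks are omitted). A controller driver no longer calls of_get_nand_ecc_mode() and friends itself; attaching the DT node is enough, and the identification step parses the generic properties internally:

	static int example_nand_probe(struct platform_device *pdev)
	{
		struct nand_chip *chip;
		struct mtd_info *mtd;
		int ret;

		chip = devm_kzalloc(&pdev->dev, sizeof(*chip), GFP_KERNEL);
		if (!chip)
			return -ENOMEM;

		mtd = nand_to_mtd(chip);
		mtd->dev.parent = &pdev->dev;

		/* Hand the DT node to the core ... */
		nand_set_flash_node(chip, pdev->dev.of_node);

		/*
		 * ... and nand_scan_ident() -> nand_dt_init() fills in the
		 * ECC mode, strength and step size plus the bus-width and
		 * on-flash-BBT options from the generic DT properties.
		 */
		ret = nand_scan_ident(mtd, 1, NULL);
		if (ret)
			return ret;

		return nand_scan_tail(mtd);
	}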
Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 94 +++++++++++++++++++++++++- drivers/of/Makefile | 1 - drivers/of/of_mtd.c | 155 ------------------------------------------- include/linux/of_mtd.h | 56 ---------------- 4 files changed, 93 insertions(+), 213 deletions(-) delete mode 100644 drivers/of/of_mtd.c delete mode 100644 include/linux/of_mtd.h (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0001be19b90c..9f157bb3faa2 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -45,7 +45,7 @@ #include #include #include -#include +#include static int nand_get_device(struct mtd_info *mtd, int new_state); @@ -3971,6 +3971,98 @@ ident_done: return type; } +static const char * const nand_ecc_modes[] = { + [NAND_ECC_NONE] = "none", + [NAND_ECC_SOFT] = "soft", + [NAND_ECC_HW] = "hw", + [NAND_ECC_HW_SYNDROME] = "hw_syndrome", + [NAND_ECC_HW_OOB_FIRST] = "hw_oob_first", + [NAND_ECC_SOFT_BCH] = "soft_bch", +}; + +static int of_get_nand_ecc_mode(struct device_node *np) +{ + const char *pm; + int err, i; + + err = of_property_read_string(np, "nand-ecc-mode", &pm); + if (err < 0) + return err; + + for (i = 0; i < ARRAY_SIZE(nand_ecc_modes); i++) + if (!strcasecmp(pm, nand_ecc_modes[i])) + return i; + + return -ENODEV; +} + +static int of_get_nand_ecc_algo(struct device_node *np) +{ + const char *pm; + int err; + + /* + * TODO: Read ECC algo OF property and map it to enum nand_ecc_algo. + * It's not implemented yet as currently NAND subsystem ignores + * algorithm explicitly set this way. Once it's handled we should + * document & support new property. + */ + + /* + * For backward compatibility we also read "nand-ecc-mode" checking + * for some obsoleted values that were specifying ECC algorithm. + */ + err = of_property_read_string(np, "nand-ecc-mode", &pm); + if (err < 0) + return err; + + if (!strcasecmp(pm, "soft")) + return NAND_ECC_HAMMING; + else if (!strcasecmp(pm, "soft_bch")) + return NAND_ECC_BCH; + + return -ENODEV; +} + +static int of_get_nand_ecc_step_size(struct device_node *np) +{ + int ret; + u32 val; + + ret = of_property_read_u32(np, "nand-ecc-step-size", &val); + return ret ? ret : val; +} + +static int of_get_nand_ecc_strength(struct device_node *np) +{ + int ret; + u32 val; + + ret = of_property_read_u32(np, "nand-ecc-strength", &val); + return ret ? 
ret : val; +} + +static int of_get_nand_bus_width(struct device_node *np) +{ + u32 val; + + if (of_property_read_u32(np, "nand-bus-width", &val)) + return 8; + + switch (val) { + case 8: + case 16: + return val; + default: + return -EIO; + } +} + +static bool of_get_nand_on_flash_bbt(struct device_node *np) +{ + return of_property_read_bool(np, "nand-on-flash-bbt"); +} + static int nand_dt_init(struct nand_chip *chip) { struct device_node *dn = nand_get_flash_node(chip); diff --git a/drivers/of/Makefile b/drivers/of/Makefile index 156c072b3117..e31bdc123c6e 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_OF_UNITTEST) += unittest.o obj-$(CONFIG_OF_MDIO) += of_mdio.o obj-$(CONFIG_OF_PCI) += of_pci.o obj-$(CONFIG_OF_PCI_IRQ) += of_pci_irq.o -obj-$(CONFIG_OF_MTD) += of_mtd.o obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o obj-$(CONFIG_OF_RESOLVE) += resolver.o obj-$(CONFIG_OF_OVERLAY) += overlay.o diff --git a/drivers/of/of_mtd.c b/drivers/of/of_mtd.c deleted file mode 100644 index 15d056e181d2..000000000000 --- a/drivers/of/of_mtd.c +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright 2012 Jean-Christophe PLAGNIOL-VILLARD - * - * OF helpers for mtd. - * - * This file is released under the GPLv2 - * - */ -#include -#include -#include -#include - -/** - * It maps 'enum nand_ecc_modes_t' found in include/linux/mtd/nand.h - * into the device tree binding of 'nand-ecc', so that MTD - * device driver can get nand ecc from device tree. - */ -static const char *nand_ecc_modes[] = { - [NAND_ECC_NONE] = "none", - [NAND_ECC_SOFT] = "soft", - [NAND_ECC_HW] = "hw", - [NAND_ECC_HW_SYNDROME] = "hw_syndrome", - [NAND_ECC_HW_OOB_FIRST] = "hw_oob_first", - [NAND_ECC_SOFT_BCH] = "soft_bch", -}; - -/** - * of_get_nand_ecc_mode - Get nand ecc mode for given device_node - * @np: Pointer to the given device_node - * - * The function gets ecc mode string from property 'nand-ecc-mode', - * and return its index in nand_ecc_modes table, or errno in error case. - */ -int of_get_nand_ecc_mode(struct device_node *np) -{ - const char *pm; - int err, i; - - err = of_property_read_string(np, "nand-ecc-mode", &pm); - if (err < 0) - return err; - - for (i = 0; i < ARRAY_SIZE(nand_ecc_modes); i++) - if (!strcasecmp(pm, nand_ecc_modes[i])) - return i; - - return -ENODEV; -} -EXPORT_SYMBOL_GPL(of_get_nand_ecc_mode); - -/** - * of_get_nand_ecc_algo - Get nand ecc algorithm for given device_node - * @np: Pointer to the given device_node - * - * The function gets ecc algorithm and returns its enum value, or errno in error - * case. - */ -int of_get_nand_ecc_algo(struct device_node *np) -{ - const char *pm; - int err; - - /* - * TODO: Read ECC algo OF property and map it to enum nand_ecc_algo. - * It's not implemented yet as currently NAND subsystem ignores - * algorithm explicitly set this way. Once it's handled we should - * document & support new property. - */ - - /* - * For backward compatibility we also read "nand-ecc-mode" checking - * for some obsoleted values that were specifying ECC algorithm. - */ - err = of_property_read_string(np, "nand-ecc-mode", &pm); - if (err < 0) - return err; - - if (!strcasecmp(pm, "soft")) - return NAND_ECC_HAMMING; - else if (!strcasecmp(pm, "soft_bch")) - return NAND_ECC_BCH; - - return -ENODEV; -} -EXPORT_SYMBOL_GPL(of_get_nand_ecc_algo); - -/** - * of_get_nand_ecc_step_size - Get ECC step size associated to - * the required ECC strength (see below). - * @np: Pointer to the given device_node - * - * return the ECC step size, or errno in error case. 
- */ -int of_get_nand_ecc_step_size(struct device_node *np) -{ - int ret; - u32 val; - - ret = of_property_read_u32(np, "nand-ecc-step-size", &val); - return ret ? ret : val; -} -EXPORT_SYMBOL_GPL(of_get_nand_ecc_step_size); - -/** - * of_get_nand_ecc_strength - Get required ECC strength over the - * correspnding step size as defined by 'nand-ecc-size' - * @np: Pointer to the given device_node - * - * return the ECC strength, or errno in error case. - */ -int of_get_nand_ecc_strength(struct device_node *np) -{ - int ret; - u32 val; - - ret = of_property_read_u32(np, "nand-ecc-strength", &val); - return ret ? ret : val; -} -EXPORT_SYMBOL_GPL(of_get_nand_ecc_strength); - -/** - * of_get_nand_bus_width - Get nand bus witdh for given device_node - * @np: Pointer to the given device_node - * - * return bus width option, or errno in error case. - */ -int of_get_nand_bus_width(struct device_node *np) -{ - u32 val; - - if (of_property_read_u32(np, "nand-bus-width", &val)) - return 8; - - switch(val) { - case 8: - case 16: - return val; - default: - return -EIO; - } -} -EXPORT_SYMBOL_GPL(of_get_nand_bus_width); - -/** - * of_get_nand_on_flash_bbt - Get nand on flash bbt for given device_node - * @np: Pointer to the given device_node - * - * return true if present false other wise - */ -bool of_get_nand_on_flash_bbt(struct device_node *np) -{ - return of_property_read_bool(np, "nand-on-flash-bbt"); -} -EXPORT_SYMBOL_GPL(of_get_nand_on_flash_bbt); diff --git a/include/linux/of_mtd.h b/include/linux/of_mtd.h deleted file mode 100644 index 0f6aca5c6f2f..000000000000 --- a/include/linux/of_mtd.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2012 Jean-Christophe PLAGNIOL-VILLARD - * - * OF helpers for mtd. - * - * This file is released under the GPLv2 - */ - -#ifndef __LINUX_OF_MTD_H -#define __LINUX_OF_MTD_H - -#ifdef CONFIG_OF_MTD - -#include -int of_get_nand_ecc_mode(struct device_node *np); -int of_get_nand_ecc_algo(struct device_node *np); -int of_get_nand_ecc_step_size(struct device_node *np); -int of_get_nand_ecc_strength(struct device_node *np); -int of_get_nand_bus_width(struct device_node *np); -bool of_get_nand_on_flash_bbt(struct device_node *np); - -#else /* CONFIG_OF_MTD */ - -static inline int of_get_nand_ecc_mode(struct device_node *np) -{ - return -ENOSYS; -} - -static inline int of_get_nand_ecc_algo(struct device_node *np) -{ - return -ENOSYS; -} - -static inline int of_get_nand_ecc_step_size(struct device_node *np) -{ - return -ENOSYS; -} - -static inline int of_get_nand_ecc_strength(struct device_node *np) -{ - return -ENOSYS; -} - -static inline int of_get_nand_bus_width(struct device_node *np) -{ - return -ENOSYS; -} - -static inline bool of_get_nand_on_flash_bbt(struct device_node *np) -{ - return false; -} - -#endif /* CONFIG_OF_MTD */ - -#endif /* __LINUX_OF_MTD_H */ -- cgit v1.2.3 From e4225ae8234cf5548c38dc887b233ad1d45b4d53 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Sun, 17 Apr 2016 22:53:07 +0200 Subject: mtd: mtd: drop NAND_ECC_SOFT_BCH enum value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This value should not be part of nand_ecc_modes_t as it specifies algorithm not a mode. We successfully managed to introduce new "algo" field which is respected now. 
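In short, soft BCH is now requested by pairing the generic soft mode with an explicit algorithm. A before/after sketch of the idiom, mirroring the nandsim hunk below:

	/* Before: the mode value encoded the algorithm */
	chip->ecc.mode = NAND_ECC_SOFT_BCH;

	/* After: mode and algorithm are orthogonal fields */
	chip->ecc.mode = NAND_ECC_SOFT;
	chip->ecc.algo = NAND_ECC_BCH;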
Signed-off-by: Rafał Miłecki Signed-off-by: Boris Brezillon --- drivers/mtd/nand/fsmc_nand.c | 3 +-- drivers/mtd/nand/jz4780_nand.c | 1 - drivers/mtd/nand/mxc_nand.c | 1 - drivers/mtd/nand/nand_base.c | 11 +++-------- drivers/mtd/nand/nandsim.c | 2 +- drivers/mtd/nand/sunxi_nand.c | 2 -- include/linux/mtd/nand.h | 1 - 7 files changed, 5 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/fsmc_nand.c b/drivers/mtd/nand/fsmc_nand.c index 0f8c63f85f86..d4f454a4b35e 100644 --- a/drivers/mtd/nand/fsmc_nand.c +++ b/drivers/mtd/nand/fsmc_nand.c @@ -959,7 +959,6 @@ static int __init fsmc_nand_probe(struct platform_device *pdev) break; case NAND_ECC_SOFT: - case NAND_ECC_SOFT_BCH: if (nand->ecc.algo == NAND_ECC_BCH) { dev_info(&pdev->dev, "Using 4-bit SW BCH ECC scheme\n"); break; @@ -974,7 +973,7 @@ static int __init fsmc_nand_probe(struct platform_device *pdev) * Don't set layout for BCH4 SW ECC. This will be * generated later in nand_bch_init() later. */ - if (nand->ecc.mode != NAND_ECC_SOFT_BCH) { + if (nand->ecc.mode == NAND_ECC_HW) { switch (mtd->oobsize) { case 16: case 64: diff --git a/drivers/mtd/nand/jz4780_nand.c b/drivers/mtd/nand/jz4780_nand.c index 10f249ac148c..daf3c4217f4d 100644 --- a/drivers/mtd/nand/jz4780_nand.c +++ b/drivers/mtd/nand/jz4780_nand.c @@ -179,7 +179,6 @@ static int jz4780_nand_init_ecc(struct jz4780_nand_chip *nand, struct device *de chip->ecc.correct = jz4780_nand_ecc_correct; /* fall through */ case NAND_ECC_SOFT: - case NAND_ECC_SOFT_BCH: dev_info(dev, "using %s (strength %d, size %d, bytes %d)\n", (nfc->bch) ? "hardware BCH" : "software ECC", chip->ecc.strength, chip->ecc.size, chip->ecc.bytes); diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c index aa0ba4b99395..5173fadc9a4e 100644 --- a/drivers/mtd/nand/mxc_nand.c +++ b/drivers/mtd/nand/mxc_nand.c @@ -1625,7 +1625,6 @@ static int mxcnd_probe(struct platform_device *pdev) break; case NAND_ECC_SOFT: - case NAND_ECC_SOFT_BCH: break; default: diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 1d74b802aa0a..c9d6230eab08 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -4183,8 +4183,7 @@ static int nand_set_ecc_soft_ops(struct mtd_info *mtd) struct nand_chip *chip = mtd_to_nand(mtd); struct nand_ecc_ctrl *ecc = &chip->ecc; - if (WARN_ON(ecc->mode != NAND_ECC_SOFT && - ecc->mode != NAND_ECC_SOFT_BCH)) + if (WARN_ON(ecc->mode != NAND_ECC_SOFT)) return -EINVAL; switch (ecc->algo) { @@ -4331,8 +4330,7 @@ int nand_scan_tail(struct mtd_info *mtd) * If no default placement scheme is given, select an appropriate one. 
*/ if (!mtd->ooblayout && - !((ecc->mode == NAND_ECC_SOFT || ecc->mode == NAND_ECC_SOFT_BCH) && - ecc->algo == NAND_ECC_BCH)) { + !(ecc->mode == NAND_ECC_SOFT && ecc->algo == NAND_ECC_BCH)) { switch (mtd->oobsize) { case 8: case 16: @@ -4426,7 +4424,6 @@ int nand_scan_tail(struct mtd_info *mtd) ecc->algo = NAND_ECC_HAMMING; case NAND_ECC_SOFT: - case NAND_ECC_SOFT_BCH: ret = nand_set_ecc_soft_ops(mtd); if (ret) { ret = -EINVAL; @@ -4514,7 +4511,6 @@ int nand_scan_tail(struct mtd_info *mtd) /* Large page NAND with SOFT_ECC should support subpage reads */ switch (ecc->mode) { case NAND_ECC_SOFT: - case NAND_ECC_SOFT_BCH: if (chip->page_shift > 9) chip->options |= NAND_SUBPAGE_READ; break; @@ -4614,8 +4610,7 @@ void nand_release(struct mtd_info *mtd) { struct nand_chip *chip = mtd_to_nand(mtd); - if ((chip->ecc.mode == NAND_ECC_SOFT || - chip->ecc.mode == NAND_ECC_SOFT_BCH) && + if (chip->ecc.mode == NAND_ECC_SOFT && chip->ecc.algo == NAND_ECC_BCH) nand_bch_free((struct nand_bch_control *)chip->ecc.priv); diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index 66eebb9e77f3..794745dff7f1 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -2339,7 +2339,7 @@ static int __init ns_init_module(void) retval = -EINVAL; goto error; } - chip->ecc.mode = NAND_ECC_SOFT_BCH; + chip->ecc.mode = NAND_ECC_SOFT; chip->ecc.algo = NAND_ECC_BCH; chip->ecc.size = 512; chip->ecc.strength = bch; diff --git a/drivers/mtd/nand/sunxi_nand.c b/drivers/mtd/nand/sunxi_nand.c index 1baf89836210..a83a690688b4 100644 --- a/drivers/mtd/nand/sunxi_nand.c +++ b/drivers/mtd/nand/sunxi_nand.c @@ -1612,8 +1612,6 @@ static int sunxi_nand_ecc_init(struct mtd_info *mtd, struct nand_ecc_ctrl *ecc, return -EINVAL; switch (ecc->mode) { - case NAND_ECC_SOFT_BCH: - break; case NAND_ECC_HW: ret = sunxi_nand_hw_ecc_ctrl_init(mtd, ecc, np); if (ret) diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index e851839daf09..fbe8e164a4ee 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -116,7 +116,6 @@ typedef enum { NAND_ECC_HW, NAND_ECC_HW_SYNDROME, NAND_ECC_HW_OOB_FIRST, - NAND_ECC_SOFT_BCH, } nand_ecc_modes_t; enum nand_ecc_algo { -- cgit v1.2.3 From 466c3fb618b8520b75be37fcb115e9610663b945 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Thu, 5 May 2016 22:35:54 -0400 Subject: jbd2: remove excess descriptions for handle_s Commit bf6993276f74 ("jbd2: Use tracepoints for history file") removed the members j_history, j_history_max and j_history_cur from struct handle_s but the descriptions stayed lingering. Removing them. 
Signed-off-by: Luis de Bethencourt Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- include/linux/jbd2.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 39511484ad10..efb232c5f668 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -789,9 +789,6 @@ jbd2_time_diff(unsigned long start, unsigned long end) * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the * number that will fit in j_blocksize * @j_last_sync_writer: most recent pid which did a synchronous write - * @j_history: Buffer storing the transactions statistics history - * @j_history_max: Maximum number of transactions in the statistics history - * @j_history_cur: Current number of transactions in the statistics history * @j_history_lock: Protect the transactions statistics history * @j_proc_entry: procfs entry for the jbd statistics directory * @j_stats: Overall statistics -- cgit v1.2.3 From 6f3ffc19157a14b182d9d0c449cd613cef421fe1 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Tue, 12 Jan 2016 18:17:19 +0100 Subject: timer: add setup_deferrable_timer macro Add the trivial missing macro to set up a deferrable timer. Signed-off-by: Lucas Stach Acked-by: Thomas Gleixner --- include/linux/timer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 61aa61dc410c..20ac746f3eb3 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -145,6 +145,8 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_timer(timer, fn, data) \ __setup_timer((timer), (fn), (data), 0) +#define setup_deferrable_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE) #define setup_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), 0) #define setup_deferrable_timer_on_stack(timer, fn, data) \ -- cgit v1.2.3 From b5a7aef1ef436ec005fef0efe31a676ec5f4ab31 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 4 May 2016 22:05:01 -0700 Subject: fscrypto/f2fs: allow fs-specific key prefix for fs encryption This patch allows fscrypto to handle a second key prefix given by the filesystem. The main reason is to provide backward compatibility, since previously f2fs used "f2fs:" as a crypto prefix instead of "fscrypt:". Later, ext4 should also provide key_prefix() to give "ext4:". One concern, described by Ted, is the overhead of checking two prefixes. On x86, for example, validate_user_key() consumes 8 ms after boot-up, and it turns out derive_key_aes() accounts for most of that time, loading the specific crypto module. After this cold miss, latencies are almost zero, so the double check is treated as a negligible overhead. Note that request_key() detects a wrong prefix even before derive_key_aes() is called.
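To make the intended ext4 follow-up concrete, its hook could look like the sketch below (hypothetical — ext4 gains no key_prefix() in this patch, and the key_prefix/key_prefix_size members of ext4_sb_info are assumptions mirroring the f2fs_sb_info fields added below):

	#define EXT4_KEY_DESC_PREFIX		"ext4:"
	#define EXT4_KEY_DESC_PREFIX_SIZE	5

	static int ext4_key_prefix(struct inode *inode, u8 **key)
	{
		/* assumed fields, analogous to f2fs_sb_info below */
		*key = EXT4_SB(inode->i_sb)->key_prefix;
		return EXT4_SB(inode->i_sb)->key_prefix_size;
	}

With such a hook in place, get_crypt_info() first tries the generic "fscrypt:" key descriptor and only then falls back to the filesystem-specific prefix, so existing "ext4:"-style keys would keep working.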
Cc: Ted Tso Cc: stable@vger.kernel.org # v4.6 Signed-off-by: Jaegeuk Kim --- fs/crypto/keyinfo.c | 120 ++++++++++++++++++++++++++++++----------------- fs/f2fs/f2fs.h | 8 ++++ fs/f2fs/super.c | 13 +++++ include/linux/fscrypto.h | 1 + 4 files changed, 98 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 06f5aa478bf2..1ac263eddc4e 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -78,6 +78,67 @@ out: return res; } +static int validate_user_key(struct fscrypt_info *crypt_info, + struct fscrypt_context *ctx, u8 *raw_key, + u8 *prefix, int prefix_size) +{ + u8 *full_key_descriptor; + struct key *keyring_key; + struct fscrypt_key *master_key; + const struct user_key_payload *ukp; + int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; + int res; + + full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); + if (!full_key_descriptor) + return -ENOMEM; + + memcpy(full_key_descriptor, prefix, prefix_size); + sprintf(full_key_descriptor + prefix_size, + "%*phN", FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + full_key_descriptor[full_key_len - 1] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + kfree(full_key_descriptor); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "%s: key type must be logon\n", __func__); + res = -ENOKEY; + goto out; + } + down_read(&keyring_key->sem); + ukp = user_key_payload(keyring_key); + if (ukp->datalen != sizeof(struct fscrypt_key)) { + res = -EINVAL; + up_read(&keyring_key->sem); + goto out; + } + master_key = (struct fscrypt_key *)ukp->data; + BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); + + if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "%s: key size incorrect: %d\n", + __func__, master_key->size); + res = -ENOKEY; + up_read(&keyring_key->sem); + goto out; + } + res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); + if (res) + goto out; + + crypt_info->ci_keyring_key = keyring_key; + return 0; +out: + key_put(keyring_key); + return res; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -91,12 +152,7 @@ static void put_crypt_info(struct fscrypt_info *ci) int get_crypt_info(struct inode *inode) { struct fscrypt_info *crypt_info; - u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE + - (FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct fscrypt_key *master_key; struct fscrypt_context ctx; - const struct user_key_payload *ukp; struct crypto_skcipher *ctfm; const char *cipher_str; u8 raw_key[FS_MAX_KEY_SIZE]; @@ -167,48 +223,24 @@ retry: memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; } - memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX, - FS_KEY_DESC_PREFIX_SIZE); - sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE + - (2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - crypt_info->ci_keyring_key = keyring_key; - if (keyring_key->type != &key_type_logon) { - printk_once(KERN_WARNING - "%s: key type must be logon\n", __func__); - res = -ENOKEY; - goto out; - } - down_read(&keyring_key->sem); - ukp = user_key_payload(keyring_key); - if 
(ukp->datalen != sizeof(struct fscrypt_key)) { - res = -EINVAL; - up_read(&keyring_key->sem); - goto out; - } - master_key = (struct fscrypt_key *)ukp->data; - BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { - printk_once(KERN_WARNING - "%s: key size incorrect: %d\n", - __func__, master_key->size); - res = -ENOKEY; - up_read(&keyring_key->sem); + res = validate_user_key(crypt_info, &ctx, raw_key, + FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + if (res && inode->i_sb->s_cop->key_prefix) { + u8 *prefix = NULL; + int prefix_size, res2; + + prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); + res2 = validate_user_key(crypt_info, &ctx, raw_key, + prefix, prefix_size); + if (res2) { + if (res2 == -ENOKEY) + res = -ENOKEY; + goto out; + } + } else if (res) { goto out; } - res = derive_key_aes(ctx.nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ccf8bf4debfc..dbd277eb9da7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -711,6 +711,10 @@ enum { MAX_TIME, }; +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define F2FS_KEY_DESC_PREFIX "f2fs:" +#define F2FS_KEY_DESC_PREFIX_SIZE 5 +#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -718,6 +722,10 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ int s_flag; /* flags for sbi */ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; + u8 key_prefix_size; +#endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8a28f799a24f..28c8992da6f6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -964,6 +964,12 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } +static int f2fs_key_prefix(struct inode *inode, u8 **key) +{ + *key = F2FS_I_SB(inode)->key_prefix; + return F2FS_I_SB(inode)->key_prefix_size; +} + static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { @@ -980,6 +986,7 @@ static unsigned f2fs_max_namelen(struct inode *inode) static struct fscrypt_operations f2fs_cryptops = { .get_context = f2fs_get_context, + .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, @@ -1305,6 +1312,12 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, + F2FS_KEY_DESC_PREFIX_SIZE); + sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; +#endif } /* diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index 6027f6bbb061..cfa6cde25f8e 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -175,6 +175,7 @@ struct fscrypt_name { */ struct fscrypt_operations { int (*get_context)(struct inode *, void *, size_t); + int (*key_prefix)(struct inode *, u8 **); int (*prepare_context)(struct inode *); int (*set_context)(struct inode *, const void *, size_t, void *); int (*dummy_context)(struct inode *); -- cgit v1.2.3 From 824815c4cd13e384ef5a4be725ec4b06e4ad2c2a Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 31 Mar 2016 16:23:51 -0400 
Subject: reservation: add reservation_object_get_excl_rcu() In the atomic modesetting path, each driver simply wants to grab a ref to the exclusive fence from a reservation object to store in the incoming drm_plane_state, without doing the whole RCU dance. Since each driver will need to do this, let's make a helper. v2: rename to _rcu instead of _unlocked to be more consistent Signed-off-by: Rob Clark Acked-by: Sumit Semwal --- include/linux/reservation.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reservation.h b/include/linux/reservation.h index 5a0b64cf68b4..49d057655d62 100644 --- a/include/linux/reservation.h +++ b/include/linux/reservation.h @@ -120,6 +120,24 @@ reservation_object_get_excl(struct reservation_object *obj) reservation_object_held(obj)); } +static inline struct fence * +reservation_object_get_excl_rcu(struct reservation_object *obj) +{ + struct fence *fence; + unsigned seq; +retry: + seq = read_seqcount_begin(&obj->seq); + rcu_read_lock(); + fence = rcu_dereference(obj->fence_excl); + if (read_seqcount_retry(&obj->seq, seq)) { + rcu_read_unlock(); + goto retry; + } + fence = fence_get(fence); + rcu_read_unlock(); + return fence; +} + int reservation_object_reserve_shared(struct reservation_object *obj); void reservation_object_add_shared_fence(struct reservation_object *obj, struct fence *fence); -- cgit v1.2.3 From 0e10d549f6eebd0a26bf075309b6fb947f4c1cb2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 15 Apr 2016 17:44:18 +0300 Subject: mfd: wm8400-core: Delete wm8400_reg_read() There was a static checker warning in wm8400_reg_read() because we were returning u16 and that can't hold the negative error codes. The function isn't used, so let's just delete it.
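The bug class is worth spelling out. An illustrative sketch (not code from the driver): a function that returns its data as u16 has nowhere to put a negative errno, so the error from regmap_read() is truncated into something that looks like valid register data, whereas the usual int-return/out-parameter form keeps error and data separate:

	/* Broken: a negative errno truncates to a large positive u16 */
	u16 read_reg_broken(struct regmap *map, unsigned int reg)
	{
		unsigned int val;
		int ret = regmap_read(map, reg, &val);

		if (ret < 0)
			return ret;	/* mangled by the u16 return type */
		return val;
	}

	/* Correct: errno in the int return, data via an out-parameter */
	int read_reg_ok(struct regmap *map, unsigned int reg, u16 *val)
	{
		unsigned int tmp;
		int ret = regmap_read(map, reg, &tmp);

		if (ret)
			return ret;
		*val = tmp;
		return 0;
	}

Since wm8400_reg_read() has no callers, deleting it outright is simpler than fixing its signature.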
Signed-off-by: Dan Carpenter Acked-by: Charles Keepax Signed-off-by: Lee Jones --- drivers/mfd/wm8400-core.c | 21 --------------------- include/linux/mfd/wm8400-private.h | 1 - 2 files changed, 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/wm8400-core.c b/drivers/mfd/wm8400-core.c index 9fd823049f90..81667762c3ad 100644 --- a/drivers/mfd/wm8400-core.c +++ b/drivers/mfd/wm8400-core.c @@ -35,27 +35,6 @@ static bool wm8400_volatile(struct device *dev, unsigned int reg) } } -/** - * wm8400_reg_read - Single register read - * - * @wm8400: Pointer to wm8400 control structure - * @reg: Register to read - * - * @return Read value - */ -u16 wm8400_reg_read(struct wm8400 *wm8400, u8 reg) -{ - unsigned int val; - int ret; - - ret = regmap_read(wm8400->regmap, reg, &val); - if (ret < 0) - return ret; - - return val; -} -EXPORT_SYMBOL_GPL(wm8400_reg_read); - int wm8400_block_read(struct wm8400 *wm8400, u8 reg, int count, u16 *data) { return regmap_bulk_read(wm8400->regmap, reg, data, count); diff --git a/include/linux/mfd/wm8400-private.h b/include/linux/mfd/wm8400-private.h index 2de565b94d0c..4ee908f5b834 100644 --- a/include/linux/mfd/wm8400-private.h +++ b/include/linux/mfd/wm8400-private.h @@ -923,7 +923,6 @@ struct wm8400 { #define WM8400_LINE_CMP_VTHD_SHIFT 0 /* LINE_CMP_VTHD - [3:0] */ #define WM8400_LINE_CMP_VTHD_WIDTH 4 /* LINE_CMP_VTHD - [3:0] */ -u16 wm8400_reg_read(struct wm8400 *wm8400, u8 reg); int wm8400_block_read(struct wm8400 *wm8400, u8 reg, int count, u16 *data); static inline int wm8400_set_bits(struct wm8400 *wm8400, u8 reg, -- cgit v1.2.3 From fe238e601d2519f259103ab65caea3b077ed7b39 Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Fri, 1 Apr 2016 13:45:09 -0400 Subject: NFS: Save struct inode * inside nfs_commit_info to clarify usage of i_lock Commit ea2cf22 created nfs_commit_info and saved &inode->i_lock inside this NFS specific structure. This obscures the usage of i_lock. Instead, save struct inode * so later it's clear the spinlock taken is i_lock. Should be no functional change. 
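The conversion, in miniature (the pattern repeated throughout the hunks below):

	/* Before: which lock is taken hides behind a cached pointer */
	spin_lock(cinfo->lock);
	/* ... commit list manipulation ... */
	spin_unlock(cinfo->lock);

	/* After: it is plainly the inode's i_lock */
	spin_lock(&cinfo->inode->i_lock);
	/* ... commit list manipulation ... */
	spin_unlock(&cinfo->inode->i_lock);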
Signed-off-by: Dave Wysochanski Signed-off-by: Anna Schumaker --- fs/nfs/direct.c | 10 +++++----- fs/nfs/filelayout/filelayout.c | 4 ++-- fs/nfs/flexfilelayout/flexfilelayout.c | 4 ++-- fs/nfs/pnfs_nfs.c | 32 ++++++++++++++++---------------- fs/nfs/write.c | 16 ++++++++-------- include/linux/nfs_xdr.h | 2 +- 6 files changed, 34 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f79d98ae4c10..7f03163b5364 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -278,7 +278,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages) void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq) { - cinfo->lock = &dreq->inode->i_lock; + cinfo->inode = dreq->inode; cinfo->mds = &dreq->mds_cinfo; cinfo->ds = &dreq->ds_cinfo; cinfo->dreq = dreq; @@ -635,13 +635,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); #ifdef CONFIG_NFS_V4_1 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); #endif nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) @@ -676,13 +676,13 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) if (!nfs_pageio_add_request(&desc, req)) { nfs_list_remove_request(req); nfs_list_add_request(req, &failed); - spin_lock(cinfo.lock); + spin_lock(&cinfo.inode->i_lock); dreq->flags = 0; if (desc.pg_error < 0) dreq->error = desc.pg_error; else dreq->error = -EIO; - spin_unlock(cinfo.lock); + spin_unlock(&cinfo.inode->i_lock); } nfs_release_request(req); } diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 3384dc8e6683..3e50057eeadf 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -795,7 +795,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; } - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->ds->nbuckets >= size) goto out; for (i = 0; i < cinfo->ds->nbuckets; i++) { @@ -811,7 +811,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, swap(cinfo->ds->buckets, buckets); cinfo->ds->nbuckets = size; out: - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); kfree(buckets); return 0; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0cb1abd535e3..3b398f7b4637 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -745,7 +745,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, else { int i; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->ds->nbuckets != 0) kfree(buckets); else { @@ -759,7 +759,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, NFS_INVALID_STABLE_HOW; } } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); return 0; } } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 4aaed890048f..d2a7c9f7aa94 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -61,7 +61,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_release); /* The generic layer is about to remove the req from the commit list. * If this will make the bucket empty, it will need to put the lseg reference. 
- * Note this must be called holding the inode (/cinfo) lock + * Note this must be called holding i_lock */ void pnfs_generic_clear_request_commit(struct nfs_page *req, @@ -98,7 +98,7 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(cinfo->lock)) + if (cond_resched_lock(&cinfo->inode->i_lock)) list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); @@ -119,7 +119,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct list_head *dst = &bucket->committing; int ret; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); if (ret) { cinfo->ds->nwritten -= ret; @@ -142,7 +142,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, { int i, rv = 0, cnt; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], cinfo, max); @@ -161,16 +161,16 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, struct pnfs_layout_segment *freeme; int i; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (pnfs_generic_transfer_commit_list(&b->written, dst, cinfo, 0)) { freeme = b->wlseg; b->wlseg = NULL; - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); pnfs_put_lseg(freeme); - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); goto restart; } } @@ -186,7 +186,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) LIST_HEAD(pages); int i; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); for (i = idx; i < fl_cinfo->nbuckets; i++) { bucket = &fl_cinfo->buckets[i]; if (list_empty(&bucket->committing)) @@ -194,12 +194,12 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) freeme = bucket->clseg; bucket->clseg = NULL; list_splice_init(&bucket->committing, &pages); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); nfs_retry_commit(&pages, freeme, cinfo, i); pnfs_put_lseg(freeme); - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } static unsigned int @@ -238,11 +238,11 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages, struct pnfs_commit_bucket *bucket; bucket = &cinfo->ds->buckets[data->ds_commit_index]; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); list_splice_init(&bucket->committing, pages); data->lseg = bucket->clseg; bucket->clseg = NULL; - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } @@ -874,12 +874,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, struct list_head *list; struct pnfs_commit_bucket *buckets; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { if (!pnfs_is_valid_lseg(lseg)) { - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); cinfo->completion_ops->resched_write(cinfo, req); return; } @@ -896,7 +896,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); - 
spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5f4fd53e5764..9283a96b9fb8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -804,7 +804,7 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, * number of outstanding requests requiring a commit as well as * the MM page stats. * - * The caller must hold the cinfo->lock, and the nfs_page lock. + * The caller must hold cinfo->inode->i_lock, and the nfs_page lock. */ void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, @@ -832,9 +832,9 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -864,7 +864,7 @@ EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode) { - cinfo->lock = &inode->i_lock; + cinfo->inode = inode; cinfo->mds = &NFS_I(inode)->commit_info; cinfo->ds = pnfs_get_ds_info(inode); cinfo->dreq = NULL; @@ -967,7 +967,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo) return cinfo->mds->ncommit; } -/* cinfo->lock held by caller */ +/* cinfo->inode->i_lock held by caller */ int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) @@ -979,7 +979,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(cinfo->lock)) + if (cond_resched_lock(&cinfo->inode->i_lock)) list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); nfs_list_add_request(req, dst); @@ -1005,7 +1005,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, { int ret = 0; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->mds->ncommit > 0) { const int max = INT_MAX; @@ -1013,7 +1013,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, cinfo, max); ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); return ret; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d320906cf13e..cb9982d8f38f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1431,7 +1431,7 @@ struct nfs_commit_completion_ops { }; struct nfs_commit_info { - spinlock_t *lock; /* inode->i_lock */ + struct inode *inode; /* Needed for inode->i_lock */ struct nfs_mds_commit_info *mds; struct pnfs_ds_commit_info *ds; struct nfs_direct_req *dreq; /* O_DIRECT request */ -- cgit v1.2.3 From 3c6e0bc8a14cfc8e1d4ab87f46f77b070c815bf1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 21 Apr 2016 20:51:54 -0400 Subject: sunrpc: plumb gfp_t parm into crcreate operation We need to be able to call the generic_cred creator from different contexts. Add a gfp_t parm to the crcreate operation and to rpcauth_lookup_credcache. For now, we just push the gfp_t parms up one level to the *_lookup_cred functions. 
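The shape of the change, sketched as fragments (the real hunks follow below): the op gains a gfp_t so the allocation context is chosen by the caller instead of being hard-coded in each flavor:

	/* crcreate now carries an allocation-context hint */
	struct rpc_cred *(*crcreate)(struct rpc_auth *, struct auth_cred *,
				     int, gfp_t);

	/* flavors reachable from writeback pass GFP_NOFS to avoid fs re-entry */
	cred = rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);

	/* ordinary process context keeps GFP_KERNEL */
	cred = rpcauth_lookup_credcache(auth, acred, flags, GFP_KERNEL);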
Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 4 ++-- net/sunrpc/auth.c | 4 ++-- net/sunrpc/auth_generic.c | 6 +++--- net/sunrpc/auth_gss/auth_gss.c | 6 +++--- net/sunrpc/auth_unix.c | 6 +++--- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 6a241a277249..3b616aa7e4d2 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -127,7 +127,7 @@ struct rpc_authops { void (*destroy)(struct rpc_auth *); struct rpc_cred * (*lookup_cred)(struct rpc_auth *, struct auth_cred *, int); - struct rpc_cred * (*crcreate)(struct rpc_auth*, struct auth_cred *, int); + struct rpc_cred * (*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t); int (*list_pseudoflavors)(rpc_authflavor_t *, int); rpc_authflavor_t (*info2flavor)(struct rpcsec_gss_info *); int (*flavor2info)(rpc_authflavor_t, @@ -178,7 +178,7 @@ rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t, int rpcauth_get_gssinfo(rpc_authflavor_t, struct rpcsec_gss_info *); int rpcauth_list_flavors(rpc_authflavor_t *, int); -struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int); +struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int, gfp_t); void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int); struct rpc_cred * rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 02f53674dc39..e0bb30fd2ed3 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -543,7 +543,7 @@ rpcauth_cache_enforce_limit(void) */ struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, - int flags) + int flags, gfp_t gfp) { LIST_HEAD(free); struct rpc_cred_cache *cache = auth->au_credcache; @@ -580,7 +580,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (flags & RPCAUTH_LOOKUP_RCU) return ERR_PTR(-ECHILD); - new = auth->au_ops->crcreate(auth, acred, flags); + new = auth->au_ops->crcreate(auth, acred, flags, gfp); if (IS_ERR(new)) { cred = new; goto out; diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 41248b1820c7..6ed3e3df43e9 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -77,15 +77,15 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task, static struct rpc_cred * generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(&generic_auth, acred, flags); + return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL); } static struct rpc_cred * -generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct generic_cred *gcred; - gcred = kmalloc(sizeof(*gcred), GFP_KERNEL); + gcred = kmalloc(sizeof(*gcred), gfp); if (gcred == NULL) return ERR_PTR(-ENOMEM); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 15612ffa8d57..e64ae93d5b4f 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1299,11 +1299,11 @@ gss_destroy_cred(struct rpc_cred *cred) static struct rpc_cred * gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return 
rpcauth_lookup_credcache(auth, acred, flags); + return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS); } static struct rpc_cred * -gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); struct gss_cred *cred = NULL; @@ -1313,7 +1313,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) __func__, from_kuid(&init_user_ns, acred->uid), auth->au_flavor); - if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS))) + if (!(cred = kzalloc(sizeof(*cred), gfp))) goto out_err; rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 0d3dd364c22f..9f65452b7cbc 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -52,11 +52,11 @@ unx_destroy(struct rpc_auth *auth) static struct rpc_cred * unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(auth, acred, flags); + return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS); } static struct rpc_cred * -unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct unx_cred *cred; unsigned int groups = 0; @@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) from_kuid(&init_user_ns, acred->uid), from_kgid(&init_user_ns, acred->gid)); - if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS))) + if (!(cred = kmalloc(sizeof(*cred), gfp))) return ERR_PTR(-ENOMEM); rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops); -- cgit v1.2.3 From c065d229e308ede426a3608cf480c61983b36fb8 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 21 Apr 2016 20:51:55 -0400 Subject: sunrpc: add rpc_lookup_generic_cred Add function rpc_lookup_generic_cred, which allows lookups of a generic credential that's not current_cred(). 
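A hedged sketch of a caller (hypothetical — example_get_cred and its uid/gid source are invented; only rpc_lookup_generic_cred() itself comes from this patch):

	static struct rpc_cred *example_get_cred(kuid_t uid, kgid_t gid)
	{
		struct auth_cred acred = {
			.uid	= uid,
			.gid	= gid,
		};

		/* GFP_NOFS keeps the lookup safe in writeback-adjacent paths */
		return rpc_lookup_generic_cred(&acred, 0, GFP_NOFS);
	}

On failure the return value is an ERR_PTR, matching rpcauth_lookup_credcache().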
[jlayton: add gfp_t parm] Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 1 + net/sunrpc/auth_generic.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 3b616aa7e4d2..16bd8f8fef8c 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -167,6 +167,7 @@ void rpc_destroy_authunix(void); struct rpc_cred * rpc_lookup_cred(void); struct rpc_cred * rpc_lookup_cred_nonblock(void); +struct rpc_cred * rpc_lookup_generic_cred(struct auth_cred *, int, gfp_t); struct rpc_cred * rpc_lookup_machine_cred(const char *service_name); int rpcauth_register(const struct rpc_authops *); int rpcauth_unregister(const struct rpc_authops *); diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 6ed3e3df43e9..54dd3fdead54 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -38,6 +38,13 @@ struct rpc_cred *rpc_lookup_cred(void) } EXPORT_SYMBOL_GPL(rpc_lookup_cred); +struct rpc_cred * +rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp) +{ + return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp); +} +EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred); + struct rpc_cred *rpc_lookup_cred_nonblock(void) { return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU); -- cgit v1.2.3 From 62dbef2ae41393eba2f6853ca174130f2d09c7d3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 21 Apr 2016 20:51:56 -0400 Subject: sunrpc: add a get_rpccred_rcu inline Sometimes we might have a RCU managed credential pointer and don't want to use locking to handle it. Add a function that will take a reference to the cred iff the refcount is not already zero. Callers can dereference the pointer under the rcu_read_lock and use that function to take a reference only if the cred is not on its way to destruction. Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 16bd8f8fef8c..6f36b2bf3e05 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -206,5 +206,23 @@ struct rpc_cred * get_rpccred(struct rpc_cred *cred) return cred; } +/** + * get_rpccred_rcu - get a reference to a cred using rcu-protected pointer + * @cred: cred of which to take a reference + * + * In some cases, we may have a pointer to a credential to which we + * want to take a reference, but don't already have one. Because these + * objects are freed using RCU, we can access the cr_count while its + * on its way to destruction and only take a reference if it's not already + * zero. + */ +static inline struct rpc_cred * +get_rpccred_rcu(struct rpc_cred *cred) +{ + if (atomic_inc_not_zero(&cred->cr_count)) + return cred; + return NULL; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_AUTH_H */ -- cgit v1.2.3 From 3c3e8943ac6f36ca5d18ca61b30634fb560b4ebb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Apr 2016 18:42:04 +0100 Subject: iommu: remove unused priv field from struct iommu_ops The priv field from iommu_ops is a hangover from the of_dma_configure series and isn't actually used. Remove it before it has chance to spread. 
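Returning to the sunrpc helper just above: the intended call pattern for get_rpccred_rcu(), sketched (illustrative only; obj->cred stands in for any RCU-managed cred pointer):

	struct rpc_cred *cred;

	rcu_read_lock();
	cred = rcu_dereference(obj->cred);
	if (cred)
		cred = get_rpccred_rcu(cred);	/* NULL if already dying */
	rcu_read_unlock();

	if (cred) {
		/* ... cred is safe to use outside the read section ... */
		put_rpccred(cred);
	}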
Signed-off-by: Will Deacon Acked-by: Laurent Pinchart Acked-by: Will Deacon Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ef7a6ecd8584..8a2570443b80 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -156,7 +156,6 @@ struct iommu_dm_region { * @domain_get_windows: Return the number of windows for a domain * @of_xlate: add OF master IDs to iommu grouping * @pgsize_bitmap: bitmap of supported page sizes - * @priv: per-instance data private to the iommu driver */ struct iommu_ops { bool (*capable)(enum iommu_cap); @@ -198,7 +197,6 @@ struct iommu_ops { int (*of_xlate)(struct device *dev, struct of_phandle_args *args); unsigned long pgsize_bitmap; - void *priv; }; #define IOMMU_GROUP_NOTIFY_ADD_DEVICE 1 /* Device added */ -- cgit v1.2.3 From 53c92d793395fdab9edbd2f79b084bb6b2e6ae79 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 7 Apr 2016 18:42:05 +0100 Subject: iommu: of: enforce const-ness of struct iommu_ops As a set of driver-provided callbacks and static data, there is no compelling reason for struct iommu_ops to be mutable in core code, so enforce const-ness throughout. Acked-by: Thierry Reding Signed-off-by: Robin Murphy Acked-by: Will Deacon Signed-off-by: Joerg Roedel --- arch/arm/include/asm/dma-mapping.h | 2 +- arch/arm/mm/dma-mapping.c | 6 +++--- arch/arm64/include/asm/dma-mapping.h | 2 +- arch/arm64/mm/dma-mapping.c | 4 ++-- drivers/iommu/of_iommu.c | 14 +++++++------- drivers/of/device.c | 2 +- include/linux/dma-mapping.h | 2 +- include/linux/of_iommu.h | 8 ++++---- 8 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index 6ad1ceda62a5..02283eb2f5b2 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -118,7 +118,7 @@ static inline unsigned long dma_max_pfn(struct device *dev) #define arch_setup_dma_ops arch_setup_dma_ops extern void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - struct iommu_ops *iommu, bool coherent); + const struct iommu_ops *iommu, bool coherent); #define arch_teardown_dma_ops arch_teardown_dma_ops extern void arch_teardown_dma_ops(struct device *dev); diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index c941e93048ad..5c2ca062c3fa 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -2215,7 +2215,7 @@ static struct dma_map_ops *arm_get_iommu_dma_map_ops(bool coherent) } static bool arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size, - struct iommu_ops *iommu) + const struct iommu_ops *iommu) { struct dma_iommu_mapping *mapping; @@ -2253,7 +2253,7 @@ static void arm_teardown_iommu_dma_ops(struct device *dev) #else static bool arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size, - struct iommu_ops *iommu) + const struct iommu_ops *iommu) { return false; } @@ -2270,7 +2270,7 @@ static struct dma_map_ops *arm_get_dma_map_ops(bool coherent) } void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - struct iommu_ops *iommu, bool coherent) + const struct iommu_ops *iommu, bool coherent) { struct dma_map_ops *dma_ops; diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index ba437f090a74..7dbea6c070ec 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -48,7 +48,7 @@ static 
inline struct dma_map_ops *get_dma_ops(struct device *dev) } void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - struct iommu_ops *iommu, bool coherent); + const struct iommu_ops *iommu, bool coherent); #define arch_setup_dma_ops arch_setup_dma_ops #ifdef CONFIG_IOMMU_DMA diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index a6e757cbab77..5d36907f9b12 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -979,13 +979,13 @@ void arch_teardown_dma_ops(struct device *dev) #else static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - struct iommu_ops *iommu) + const struct iommu_ops *iommu) { } #endif /* CONFIG_IOMMU_DMA */ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - struct iommu_ops *iommu, bool coherent) + const struct iommu_ops *iommu, bool coherent) { if (!dev->archdata.dma_ops) dev->archdata.dma_ops = &swiotlb_dma_ops; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 5fea665af99d..af499aea0a1a 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -98,12 +98,12 @@ EXPORT_SYMBOL_GPL(of_get_dma_window); struct of_iommu_node { struct list_head list; struct device_node *np; - struct iommu_ops *ops; + const struct iommu_ops *ops; }; static LIST_HEAD(of_iommu_list); static DEFINE_SPINLOCK(of_iommu_lock); -void of_iommu_set_ops(struct device_node *np, struct iommu_ops *ops) +void of_iommu_set_ops(struct device_node *np, const struct iommu_ops *ops) { struct of_iommu_node *iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); @@ -119,10 +119,10 @@ void of_iommu_set_ops(struct device_node *np, struct iommu_ops *ops) spin_unlock(&of_iommu_lock); } -struct iommu_ops *of_iommu_get_ops(struct device_node *np) +const struct iommu_ops *of_iommu_get_ops(struct device_node *np) { struct of_iommu_node *node; - struct iommu_ops *ops = NULL; + const struct iommu_ops *ops = NULL; spin_lock(&of_iommu_lock); list_for_each_entry(node, &of_iommu_list, list) @@ -134,12 +134,12 @@ struct iommu_ops *of_iommu_get_ops(struct device_node *np) return ops; } -struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np) +const struct iommu_ops *of_iommu_configure(struct device *dev, + struct device_node *master_np) { struct of_phandle_args iommu_spec; struct device_node *np; - struct iommu_ops *ops = NULL; + const struct iommu_ops *ops = NULL; int idx = 0; /* diff --git a/drivers/of/device.c b/drivers/of/device.c index e5f47cec75f3..fd5cfad7c403 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -88,7 +88,7 @@ void of_dma_configure(struct device *dev, struct device_node *np) int ret; bool coherent; unsigned long offset; - struct iommu_ops *iommu; + const struct iommu_ops *iommu; /* * Set default coherent_dma_mask to 32 bit. 
Drivers are expected to diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 9ea9aba28049..71c1b215ef66 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -514,7 +514,7 @@ extern u64 dma_get_required_mask(struct device *dev); #ifndef arch_setup_dma_ops static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, - u64 size, struct iommu_ops *iommu, + u64 size, const struct iommu_ops *iommu, bool coherent) { } #endif diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h index ffbe4707d4aa..bd02b44902d0 100644 --- a/include/linux/of_iommu.h +++ b/include/linux/of_iommu.h @@ -12,7 +12,7 @@ extern int of_get_dma_window(struct device_node *dn, const char *prefix, size_t *size); extern void of_iommu_init(void); -extern struct iommu_ops *of_iommu_configure(struct device *dev, +extern const struct iommu_ops *of_iommu_configure(struct device *dev, struct device_node *master_np); #else @@ -25,7 +25,7 @@ static inline int of_get_dma_window(struct device_node *dn, const char *prefix, } static inline void of_iommu_init(void) { } -static inline struct iommu_ops *of_iommu_configure(struct device *dev, +static inline const struct iommu_ops *of_iommu_configure(struct device *dev, struct device_node *master_np) { return NULL; @@ -33,8 +33,8 @@ static inline struct iommu_ops *of_iommu_configure(struct device *dev, #endif /* CONFIG_OF_IOMMU */ -void of_iommu_set_ops(struct device_node *np, struct iommu_ops *ops); -struct iommu_ops *of_iommu_get_ops(struct device_node *np); +void of_iommu_set_ops(struct device_node *np, const struct iommu_ops *ops); +const struct iommu_ops *of_iommu_get_ops(struct device_node *np); extern struct of_device_id __iommu_of_table; -- cgit v1.2.3 From d16e0faab911cc0e100a1e8e93635b432566608e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 7 Apr 2016 18:42:06 +0100 Subject: iommu: Allow selecting page sizes per domain Many IOMMUs support multiple page table formats, meaning that any given domain may only support a subset of the hardware page sizes presented in iommu_ops->pgsize_bitmap. There are also certain use-cases where the creator of a domain may want to control which page sizes are used, for example to force the use of hugepage mappings to reduce pagetable walk depth. To this end, add a per-domain pgsize_bitmap to represent the subset of page sizes actually in use, to make it possible for domains with different requirements to coexist. 
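For example (an illustrative sketch, not taken from this series): a domain creator that wants to force hugepage mappings could mask the inherited bitmap before attaching any devices, assuming the hardware actually supports page sizes of 2MiB and above:

	struct iommu_domain *dom = iommu_domain_alloc(&platform_bus_type);

	if (dom)
		/* drop all supported page sizes below 2MiB */
		dom->pgsize_bitmap &= ~(unsigned long)(SZ_2M - 1);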
Signed-off-by: Will Deacon [rm: hijacked and rebased original patch with new commit message] Signed-off-by: Robin Murphy Acked-by: Will Deacon Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 2 +- drivers/iommu/iommu.c | 22 ++++++++++++---------- drivers/iommu/mtk_iommu.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 2 +- include/linux/iommu.h | 3 ++- 5 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 886cb3a78326..99432999b52f 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -94,7 +94,7 @@ int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, u64 size return -ENODEV; /* Use the smallest supported page size for IOVA granularity */ - order = __ffs(domain->ops->pgsize_bitmap); + order = __ffs(domain->pgsize_bitmap); base_pfn = max_t(unsigned long, 1, base >> order); end_pfn = (base + size - 1) >> order; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b9df1411c894..ab4d014e3687 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -337,9 +337,9 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group, if (!domain || domain->type != IOMMU_DOMAIN_DMA) return 0; - BUG_ON(!domain->ops->pgsize_bitmap); + BUG_ON(!domain->pgsize_bitmap); - pg_size = 1UL << __ffs(domain->ops->pgsize_bitmap); + pg_size = 1UL << __ffs(domain->pgsize_bitmap); INIT_LIST_HEAD(&mappings); iommu_get_dm_regions(dev, &mappings); @@ -1073,6 +1073,8 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, domain->ops = bus->iommu_ops; domain->type = type; + /* Assume all sizes by default; the driver may override this later */ + domain->pgsize_bitmap = bus->iommu_ops->pgsize_bitmap; return domain; } @@ -1297,7 +1299,7 @@ static size_t iommu_pgsize(struct iommu_domain *domain, pgsize = (1UL << (pgsize_idx + 1)) - 1; /* throw away page sizes not supported by the hardware */ - pgsize &= domain->ops->pgsize_bitmap; + pgsize &= domain->pgsize_bitmap; /* make sure we're still sane */ BUG_ON(!pgsize); @@ -1319,14 +1321,14 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, int ret = 0; if (unlikely(domain->ops->map == NULL || - domain->ops->pgsize_bitmap == 0UL)) + domain->pgsize_bitmap == 0UL)) return -ENODEV; if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return -EINVAL; /* find out the minimum page size supported */ - min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap); + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); /* * both the virtual address and the physical one, as well as @@ -1373,14 +1375,14 @@ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) unsigned long orig_iova = iova; if (unlikely(domain->ops->unmap == NULL || - domain->ops->pgsize_bitmap == 0UL)) + domain->pgsize_bitmap == 0UL)) return -ENODEV; if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return -EINVAL; /* find out the minimum page size supported */ - min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap); + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); /* * The virtual address, as well as the size of the mapping, must be @@ -1426,10 +1428,10 @@ size_t default_iommu_map_sg(struct iommu_domain *domain, unsigned long iova, unsigned int i, min_pagesz; int ret; - if (unlikely(domain->ops->pgsize_bitmap == 0UL)) + if (unlikely(domain->pgsize_bitmap == 0UL)) return 0; - min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap); + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); for_each_sg(sg, s, nents, 
i) { phys_addr_t phys = page_to_phys(sg_page(s)) + s->offset; @@ -1510,7 +1512,7 @@ int iommu_domain_get_attr(struct iommu_domain *domain, break; case DOMAIN_ATTR_PAGING: paging = data; - *paging = (domain->ops->pgsize_bitmap != 0UL); + *paging = (domain->pgsize_bitmap != 0UL); break; case DOMAIN_ATTR_WINDOWS: count = data; diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 929a66a81b2b..e6b25276cfec 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -264,7 +264,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_data *data) } /* Update our support page sizes bitmap */ - mtk_iommu_ops.pgsize_bitmap = dom->cfg.pgsize_bitmap; + dom->domain.pgsize_bitmap = dom->cfg.pgsize_bitmap; writel(data->m4u_dom->cfg.arm_v7s_cfg.ttbr[0], data->base + REG_MMU_PT_BASE_ADDR); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 75b24e93cedb..15a65823aad9 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -407,7 +407,7 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu) mutex_lock(&iommu->lock); list_for_each_entry(domain, &iommu->domain_list, next) - bitmap &= domain->domain->ops->pgsize_bitmap; + bitmap &= domain->domain->pgsize_bitmap; mutex_unlock(&iommu->lock); /* diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8a2570443b80..7811294bc0f7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -78,6 +78,7 @@ struct iommu_domain_geometry { struct iommu_domain { unsigned type; const struct iommu_ops *ops; + unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ iommu_fault_handler_t handler; void *handler_token; struct iommu_domain_geometry geometry; @@ -155,7 +156,7 @@ struct iommu_dm_region { * @domain_set_windows: Set the number of windows for a domain * @domain_get_windows: Return the number of windows for a domain * @of_xlate: add OF master IDs to iommu grouping - * @pgsize_bitmap: bitmap of supported page sizes + * @pgsize_bitmap: bitmap of all possible supported page sizes */ struct iommu_ops { bool (*capable)(enum iommu_cap); -- cgit v1.2.3 From 3b6b7e19e31a816ee02a8d4372cbea9ad7db3784 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 13 Apr 2016 17:29:10 +0100 Subject: iommu/dma: Finish optimising higher-order allocations Now that we know exactly which page sizes our caller wants to use in the given domain, we can restrict higher-order allocation attempts to just those sizes, if any, and avoid wasting any time or effort on other sizes which offer no benefit. In the same vein, this also lets us accommodate a minimum order greater than 0 for special cases. 
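One idiom in the iommu_dma_alloc() hunk below deserves a note: alloc_sizes & -alloc_sizes isolates the lowest set bit of the bitmap in two's complement arithmetic, which here is the minimum supported page size. A standalone sketch:

	/* Illustrative only: x & -x keeps just the lowest set bit. */
	static unsigned long min_page_size(unsigned long pgsize_bitmap)
	{
		/* e.g. 0x40201000 (4K|2M|1G) -> 0x1000 (4K) */
		return pgsize_bitmap & -pgsize_bitmap;
	}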
Signed-off-by: Robin Murphy Acked-by: Will Deacon Tested-by: Yong Wu Signed-off-by: Joerg Roedel --- arch/arm64/mm/dma-mapping.c | 4 +-- drivers/iommu/dma-iommu.c | 60 +++++++++++++++++++++++++++++---------------- include/linux/dma-iommu.h | 4 +-- 3 files changed, 43 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 5d36907f9b12..41d19a0fc9c0 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -562,8 +562,8 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size, struct page **pages; pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent); - pages = iommu_dma_alloc(dev, iosize, gfp, ioprot, handle, - flush_page); + pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot, + handle, flush_page); if (!pages) return NULL; diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 99432999b52f..ea5a9ebf0f78 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -190,11 +190,15 @@ static void __iommu_dma_free_pages(struct page **pages, int count) kvfree(pages); } -static struct page **__iommu_dma_alloc_pages(unsigned int count, gfp_t gfp) +static struct page **__iommu_dma_alloc_pages(unsigned int count, + unsigned long order_mask, gfp_t gfp) { struct page **pages; unsigned int i = 0, array_size = count * sizeof(*pages); - unsigned int order = MAX_ORDER; + + order_mask &= (2U << MAX_ORDER) - 1; + if (!order_mask) + return NULL; if (array_size <= PAGE_SIZE) pages = kzalloc(array_size, GFP_KERNEL); @@ -208,36 +212,38 @@ static struct page **__iommu_dma_alloc_pages(unsigned int count, gfp_t gfp) while (count) { struct page *page = NULL; - int j; + unsigned int order_size; /* * Higher-order allocations are a convenience rather * than a necessity, hence using __GFP_NORETRY until - * falling back to single-page allocations. + * falling back to minimum-order allocations. */ - for (order = min_t(unsigned int, order, __fls(count)); - order > 0; order--) { - page = alloc_pages(gfp | __GFP_NORETRY, order); + for (order_mask &= (2U << __fls(count)) - 1; + order_mask; order_mask &= ~order_size) { + unsigned int order = __fls(order_mask); + + order_size = 1U << order; + page = alloc_pages((order_mask - order_size) ? + gfp | __GFP_NORETRY : gfp, order); if (!page) continue; - if (PageCompound(page)) { - if (!split_huge_page(page)) - break; - __free_pages(page, order); - } else { + if (!order) + break; + if (!PageCompound(page)) { split_page(page, order); break; + } else if (!split_huge_page(page)) { + break; } + __free_pages(page, order); } - if (!page) - page = alloc_page(gfp); if (!page) { __iommu_dma_free_pages(pages, i); return NULL; } - j = 1 << order; - count -= j; - while (j--) + count -= order_size; + while (order_size--) pages[i++] = page++; } return pages; @@ -267,6 +273,7 @@ void iommu_dma_free(struct device *dev, struct page **pages, size_t size, * attached to an iommu_dma_domain * @size: Size of buffer in bytes * @gfp: Allocation flags + * @attrs: DMA attributes for this allocation * @prot: IOMMU mapping flags * @handle: Out argument for allocated DMA handle * @flush_page: Arch callback which must ensure PAGE_SIZE bytes from the @@ -278,8 +285,8 @@ void iommu_dma_free(struct device *dev, struct page **pages, size_t size, * Return: Array of struct page pointers describing the buffer, * or NULL on failure. 
*/ -struct page **iommu_dma_alloc(struct device *dev, size_t size, - gfp_t gfp, int prot, dma_addr_t *handle, +struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp, + struct dma_attrs *attrs, int prot, dma_addr_t *handle, void (*flush_page)(struct device *, const void *, phys_addr_t)) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); @@ -288,11 +295,22 @@ struct page **iommu_dma_alloc(struct device *dev, size_t size, struct page **pages; struct sg_table sgt; dma_addr_t dma_addr; - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap; *handle = DMA_ERROR_CODE; - pages = __iommu_dma_alloc_pages(count, gfp); + min_size = alloc_sizes & -alloc_sizes; + if (min_size < PAGE_SIZE) { + min_size = PAGE_SIZE; + alloc_sizes |= PAGE_SIZE; + } else { + size = ALIGN(size, min_size); + } + if (dma_get_attr(DMA_ATTR_ALLOC_SINGLE_PAGES, attrs)) + alloc_sizes = min_size; + + count = PAGE_ALIGN(size) >> PAGE_SHIFT; + pages = __iommu_dma_alloc_pages(count, alloc_sizes >> PAGE_SHIFT, gfp); if (!pages) return NULL; diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index fc481037478a..8443bbb5c071 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -38,8 +38,8 @@ int dma_direction_to_prot(enum dma_data_direction dir, bool coherent); * These implement the bulk of the relevant DMA mapping callbacks, but require * the arch code to take care of attributes and cache maintenance */ -struct page **iommu_dma_alloc(struct device *dev, size_t size, - gfp_t gfp, int prot, dma_addr_t *handle, +struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp, + struct dma_attrs *attrs, int prot, dma_addr_t *handle, void (*flush_page)(struct device *, const void *, phys_addr_t)); void iommu_dma_free(struct device *dev, struct page **pages, size_t size, dma_addr_t *handle); -- cgit v1.2.3 From 327156c593600e0f08575621c2a56f311d482e7a Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Thu, 28 Apr 2016 15:28:56 +0530 Subject: mfd: max77620: Add core driver for MAX77620/MAX20024 MAX77620/MAX20024 are Power Management ICs from Maxim. They support an RTC, multiple GPIOs, multiple DCDC regulators and LDOs, a watchdog, a clock, etc. Add an MFD driver to provide common support for accessing the device; additional drivers are developed in the respective subsystems in order to use the functionality of the device. Signed-off-by: Laxman Dewangan Signed-off-by: Mallikarjun Kasoju Reviewed-by: Krzysztof Kozlowski Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 15 ++ drivers/mfd/Makefile | 1 + drivers/mfd/max77620.c | 590 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/max77620.h | 346 +++++++++++++++++++++++++ 4 files changed, 952 insertions(+) create mode 100644 drivers/mfd/max77620.c create mode 100644 include/linux/mfd/max77620.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index cf7cbba61971..16d0beffc1cd 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -527,6 +527,21 @@ config MFD_MAX14577 additional drivers must be enabled in order to use the functionality of the device.
+config MFD_MAX77620 + bool "Maxim Semiconductor MAX77620 and MAX20024 PMIC Support" + depends on I2C=y + depends on OF + select MFD_CORE + select REGMAP_I2C + select REGMAP_IRQ + select IRQ_DOMAIN + help + Say yes here to add support for Maxim Semiconductor MAX77620 and + MAX20024, which are Power Management ICs with general-purpose pins, + RTC, regulators, clock generator, watchdog etc. This driver + provides common support for accessing the device; additional drivers + must be enabled in order to use the functionality of the device. + config MFD_MAX77686 tristate "Maxim Semiconductor MAX77686/802 PMIC Support" depends on I2C diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 5eaa6465d0a6..921a08dad9b3 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -128,6 +128,7 @@ obj-$(CONFIG_MFD_DA9063) += da9063.o obj-$(CONFIG_MFD_DA9150) += da9150-core.o obj-$(CONFIG_MFD_MAX14577) += max14577.o +obj-$(CONFIG_MFD_MAX77620) += max77620.o obj-$(CONFIG_MFD_MAX77686) += max77686.o obj-$(CONFIG_MFD_MAX77693) += max77693.o obj-$(CONFIG_MFD_MAX77843) += max77843.o diff --git a/drivers/mfd/max77620.c b/drivers/mfd/max77620.c new file mode 100644 index 000000000000..199d261990be --- /dev/null +++ b/drivers/mfd/max77620.c @@ -0,0 +1,590 @@ +/* + * Maxim MAX77620 MFD Driver + * + * Copyright (C) 2016 NVIDIA CORPORATION. All rights reserved. + * + * Author: + * Laxman Dewangan + * Chaitanya Bandi + * Mallikarjun Kasoju + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/****************** Terminology used in driver ******************** + * Here is some terminology from the datasheet for quick reference: + * Flexible Power Sequence (FPS): + * The Flexible Power Sequencer (FPS) allows each regulator to power up under + * hardware or software control. Additionally, each regulator can power on + * independently or among a group of other regulators with adjustable + * power-up and power-down delays (sequencing). GPIO1, GPIO2, and GPIO3 can + * be programmed to be part of a sequence allowing external regulators to be + * sequenced along with internal regulators. The 32KHz clock can be programmed + * to be part of a sequence. + * There are 3 FPS configuration registers, and all resources are configured to + * any of these FPS or to no FPS.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct resource gpio_resources[] = { + DEFINE_RES_IRQ(MAX77620_IRQ_TOP_GPIO), +}; + +static struct resource power_resources[] = { + DEFINE_RES_IRQ(MAX77620_IRQ_LBT_MBATLOW), +}; + +static struct resource rtc_resources[] = { + DEFINE_RES_IRQ(MAX77620_IRQ_TOP_RTC), +}; + +static struct resource thermal_resources[] = { + DEFINE_RES_IRQ(MAX77620_IRQ_LBT_TJALRM1), + DEFINE_RES_IRQ(MAX77620_IRQ_LBT_TJALRM2), +}; + +static const struct regmap_irq max77620_top_irqs[] = { + REGMAP_IRQ_REG(MAX77620_IRQ_TOP_GLBL, 0, MAX77620_IRQ_TOP_GLBL_MASK), + REGMAP_IRQ_REG(MAX77620_IRQ_TOP_SD, 0, MAX77620_IRQ_TOP_SD_MASK), + REGMAP_IRQ_REG(MAX77620_IRQ_TOP_LDO, 0, MAX77620_IRQ_TOP_LDO_MASK), + REGMAP_IRQ_REG(MAX77620_IRQ_TOP_GPIO, 0, MAX77620_IRQ_TOP_GPIO_MASK), + REGMAP_IRQ_REG(MAX77620_IRQ_TOP_RTC, 0, MAX77620_IRQ_TOP_RTC_MASK), + REGMAP_IRQ_REG(MAX77620_IRQ_TOP_32K, 0, MAX77620_IRQ_TOP_32K_MASK), + REGMAP_IRQ_REG(MAX77620_IRQ_TOP_ONOFF, 0, MAX77620_IRQ_TOP_ONOFF_MASK), + REGMAP_IRQ_REG(MAX77620_IRQ_LBT_MBATLOW, 1, MAX77620_IRQ_LBM_MASK), + REGMAP_IRQ_REG(MAX77620_IRQ_LBT_TJALRM1, 1, MAX77620_IRQ_TJALRM1_MASK), + REGMAP_IRQ_REG(MAX77620_IRQ_LBT_TJALRM2, 1, MAX77620_IRQ_TJALRM2_MASK), +}; + +static const struct mfd_cell max77620_children[] = { + { .name = "max77620-pinctrl", }, + { .name = "max77620-clock", }, + { .name = "max77620-pmic", }, + { .name = "max77620-watchdog", }, + { + .name = "max77620-gpio", + .resources = gpio_resources, + .num_resources = ARRAY_SIZE(gpio_resources), + }, { + .name = "max77620-rtc", + .resources = rtc_resources, + .num_resources = ARRAY_SIZE(rtc_resources), + }, { + .name = "max77620-power", + .resources = power_resources, + .num_resources = ARRAY_SIZE(power_resources), + }, { + .name = "max77620-thermal", + .resources = thermal_resources, + .num_resources = ARRAY_SIZE(thermal_resources), + }, +}; + +static const struct mfd_cell max20024_children[] = { + { .name = "max20024-pinctrl", }, + { .name = "max77620-clock", }, + { .name = "max20024-pmic", }, + { .name = "max77620-watchdog", }, + { + .name = "max77620-gpio", + .resources = gpio_resources, + .num_resources = ARRAY_SIZE(gpio_resources), + }, { + .name = "max77620-rtc", + .resources = rtc_resources, + .num_resources = ARRAY_SIZE(rtc_resources), + }, { + .name = "max20024-power", + .resources = power_resources, + .num_resources = ARRAY_SIZE(power_resources), + }, +}; + +static struct regmap_irq_chip max77620_top_irq_chip = { + .name = "max77620-top", + .irqs = max77620_top_irqs, + .num_irqs = ARRAY_SIZE(max77620_top_irqs), + .num_regs = 2, + .status_base = MAX77620_REG_IRQTOP, + .mask_base = MAX77620_REG_IRQTOPM, +}; + +static const struct regmap_range max77620_readable_ranges[] = { + regmap_reg_range(MAX77620_REG_CNFGGLBL1, MAX77620_REG_DVSSD4), +}; + +static const struct regmap_access_table max77620_readable_table = { + .yes_ranges = max77620_readable_ranges, + .n_yes_ranges = ARRAY_SIZE(max77620_readable_ranges), +}; + +static const struct regmap_range max20024_readable_ranges[] = { + regmap_reg_range(MAX77620_REG_CNFGGLBL1, MAX77620_REG_DVSSD4), + regmap_reg_range(MAX20024_REG_MAX_ADD, MAX20024_REG_MAX_ADD), +}; + +static const struct regmap_access_table max20024_readable_table = { + .yes_ranges = max20024_readable_ranges, + .n_yes_ranges = ARRAY_SIZE(max20024_readable_ranges), +}; + +static const struct regmap_range max77620_writable_ranges[] = { + regmap_reg_range(MAX77620_REG_CNFGGLBL1, MAX77620_REG_DVSSD4), +}; 
+ +static const struct regmap_access_table max77620_writable_table = { + .yes_ranges = max77620_writable_ranges, + .n_yes_ranges = ARRAY_SIZE(max77620_writable_ranges), +}; + +static const struct regmap_range max77620_cacheable_ranges[] = { + regmap_reg_range(MAX77620_REG_SD0_CFG, MAX77620_REG_LDO_CFG3), + regmap_reg_range(MAX77620_REG_FPS_CFG0, MAX77620_REG_FPS_SD3), +}; + +static const struct regmap_access_table max77620_volatile_table = { + .no_ranges = max77620_cacheable_ranges, + .n_no_ranges = ARRAY_SIZE(max77620_cacheable_ranges), +}; + +static const struct regmap_config max77620_regmap_config = { + .name = "power-slave", + .reg_bits = 8, + .val_bits = 8, + .max_register = MAX77620_REG_DVSSD4 + 1, + .cache_type = REGCACHE_RBTREE, + .rd_table = &max77620_readable_table, + .wr_table = &max77620_writable_table, + .volatile_table = &max77620_volatile_table, +}; + +static const struct regmap_config max20024_regmap_config = { + .name = "power-slave", + .reg_bits = 8, + .val_bits = 8, + .max_register = MAX20024_REG_MAX_ADD + 1, + .cache_type = REGCACHE_RBTREE, + .rd_table = &max20024_readable_table, + .wr_table = &max77620_writable_table, + .volatile_table = &max77620_volatile_table, +}; + +/* max77620_get_fps_period_reg_value: Get the FPS bit field value for a + * requested period. + * MAX77620 supports FPS periods of 40, 80, 160, 320, 640, 1280, 2560 + * and 5120 microseconds. MAX20024 supports FPS periods of 20, 40, 80, + * 160, 320, 640, 1280 and 2560 microseconds. + * The FPS register has a 3-bit field to set the FPS period as + * bits max77620 max20024 + * 000 40 20 + * 001 80 40 + * ::: +*/ +static int max77620_get_fps_period_reg_value(struct max77620_chip *chip, + int tperiod) +{ + int fps_min_period; + int i; + + switch (chip->chip_id) { + case MAX20024: + fps_min_period = MAX20024_FPS_PERIOD_MIN_US; + break; + case MAX77620: + fps_min_period = MAX77620_FPS_PERIOD_MIN_US; + break; + default: + return -EINVAL; + } + + for (i = 0; i < 7; i++) { + if (fps_min_period >= tperiod) + return i; + fps_min_period *= 2; + } + + return i; +} + +/* max77620_config_fps: Configure the FPS configuration registers + * based on platform-specific information.
+ */ +static int max77620_config_fps(struct max77620_chip *chip, + struct device_node *fps_np) +{ + struct device *dev = chip->dev; + unsigned int mask = 0, config = 0; + u32 fps_max_period; + u32 param_val; + int tperiod, fps_id; + int ret; + char fps_name[10]; + + switch (chip->chip_id) { + case MAX20024: + fps_max_period = MAX20024_FPS_PERIOD_MAX_US; + break; + case MAX77620: + fps_max_period = MAX77620_FPS_PERIOD_MAX_US; + break; + default: + return -EINVAL; + } + + for (fps_id = 0; fps_id < MAX77620_FPS_COUNT; fps_id++) { + sprintf(fps_name, "fps%d", fps_id); + if (!strcmp(fps_np->name, fps_name)) + break; + } + + if (fps_id == MAX77620_FPS_COUNT) { + dev_err(dev, "FPS node name %s is not valid\n", fps_np->name); + return -EINVAL; + } + + ret = of_property_read_u32(fps_np, "maxim,shutdown-fps-time-period-us", + &param_val); + if (!ret) { + mask |= MAX77620_FPS_TIME_PERIOD_MASK; + chip->shutdown_fps_period[fps_id] = min(param_val, + fps_max_period); + tperiod = max77620_get_fps_period_reg_value(chip, + chip->shutdown_fps_period[fps_id]); + config |= tperiod << MAX77620_FPS_TIME_PERIOD_SHIFT; + } + + ret = of_property_read_u32(fps_np, "maxim,suspend-fps-time-period-us", + &param_val); + if (!ret) + chip->suspend_fps_period[fps_id] = min(param_val, + fps_max_period); + + ret = of_property_read_u32(fps_np, "maxim,fps-event-source", + &param_val); + if (!ret) { + if (param_val > 2) { + dev_err(dev, "FPS%d event-source invalid\n", fps_id); + return -EINVAL; + } + mask |= MAX77620_FPS_EN_SRC_MASK; + config |= param_val << MAX77620_FPS_EN_SRC_SHIFT; + if (param_val == 2) { + mask |= MAX77620_FPS_ENFPS_SW_MASK; + config |= MAX77620_FPS_ENFPS_SW; + } + } + + if (!chip->sleep_enable && !chip->enable_global_lpm) { + ret = of_property_read_u32(fps_np, + "maxim,device-state-on-disabled-event", + &param_val); + if (!ret) { + if (param_val == 0) + chip->sleep_enable = true; + else if (param_val == 1) + chip->enable_global_lpm = true; + } + } + + ret = regmap_update_bits(chip->rmap, MAX77620_REG_FPS_CFG0 + fps_id, + mask, config); + if (ret < 0) { + dev_err(dev, "Failed to update FPS CFG: %d\n", ret); + return ret; + } + + return 0; +} + +static int max77620_initialise_fps(struct max77620_chip *chip) +{ + struct device *dev = chip->dev; + struct device_node *fps_np, *fps_child; + u8 config; + int fps_id; + int ret; + + for (fps_id = 0; fps_id < MAX77620_FPS_COUNT; fps_id++) { + chip->shutdown_fps_period[fps_id] = -1; + chip->suspend_fps_period[fps_id] = -1; + } + + fps_np = of_get_child_by_name(dev->of_node, "fps"); + if (!fps_np) + goto skip_fps; + + for_each_child_of_node(fps_np, fps_child) { + ret = max77620_config_fps(chip, fps_child); + if (ret < 0) + return ret; + } + + config = chip->enable_global_lpm ?
MAX77620_ONOFFCNFG2_SLP_LPM_MSK : 0; + ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2, + MAX77620_ONOFFCNFG2_SLP_LPM_MSK, config); + if (ret < 0) { + dev_err(dev, "Failed to update SLP_LPM: %d\n", ret); + return ret; + } + +skip_fps: + /* Enable wake on EN0 pin */ + ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2, + MAX77620_ONOFFCNFG2_WK_EN0, + MAX77620_ONOFFCNFG2_WK_EN0); + if (ret < 0) { + dev_err(dev, "Failed to update WK_EN0: %d\n", ret); + return ret; + } + + /* For MAX20024, SLPEN will be POR reset if CLRSE is b11 */ + if ((chip->chip_id == MAX20024) && chip->sleep_enable) { + config = MAX77620_ONOFFCNFG1_SLPEN | MAX20024_ONOFFCNFG1_CLRSE; + ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG1, + config, config); + if (ret < 0) { + dev_err(dev, "Failed to update SLPEN: %d\n", ret); + return ret; + } + } + + return 0; +} + +static int max77620_read_es_version(struct max77620_chip *chip) +{ + unsigned int val; + u8 cid_val[6]; + int i; + int ret; + + for (i = MAX77620_REG_CID0; i <= MAX77620_REG_CID5; i++) { + ret = regmap_read(chip->rmap, i, &val); + if (ret < 0) { + dev_err(chip->dev, "Failed to read CID: %d\n", ret); + return ret; + } + dev_dbg(chip->dev, "CID%d: 0x%02x\n", + i - MAX77620_REG_CID0, val); + cid_val[i - MAX77620_REG_CID0] = val; + } + + /* CID4 is OTP Version and CID5 is ES version */ + dev_info(chip->dev, "PMIC Version OTP:0x%02X and ES:0x%X\n", + cid_val[4], MAX77620_CID5_DIDM(cid_val[5])); + + return ret; +} + +static int max77620_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + const struct regmap_config *rmap_config; + struct max77620_chip *chip; + const struct mfd_cell *mfd_cells; + int n_mfd_cells; + int ret; + + chip = devm_kzalloc(&client->dev, sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + i2c_set_clientdata(client, chip); + chip->dev = &client->dev; + chip->irq_base = -1; + chip->chip_irq = client->irq; + chip->chip_id = (enum max77620_chip_id)id->driver_data; + + switch (chip->chip_id) { + case MAX77620: + mfd_cells = max77620_children; + n_mfd_cells = ARRAY_SIZE(max77620_children); + rmap_config = &max77620_regmap_config; + break; + case MAX20024: + mfd_cells = max20024_children; + n_mfd_cells = ARRAY_SIZE(max20024_children); + rmap_config = &max20024_regmap_config; + break; + default: + dev_err(chip->dev, "ChipID is invalid %d\n", chip->chip_id); + return -EINVAL; + } + + chip->rmap = devm_regmap_init_i2c(client, rmap_config); + if (IS_ERR(chip->rmap)) { + ret = PTR_ERR(chip->rmap); + dev_err(chip->dev, "Failed to initialise regmap: %d\n", ret); + return ret; + } + + ret = max77620_read_es_version(chip); + if (ret < 0) + return ret; + + ret = devm_regmap_add_irq_chip(chip->dev, chip->rmap, client->irq, + IRQF_ONESHOT | IRQF_SHARED, + chip->irq_base, &max77620_top_irq_chip, + &chip->top_irq_data); + if (ret < 0) { + dev_err(chip->dev, "Failed to add regmap irq: %d\n", ret); + return ret; + } + + ret = max77620_initialise_fps(chip); + if (ret < 0) + return ret; + + ret = devm_mfd_add_devices(chip->dev, PLATFORM_DEVID_NONE, + mfd_cells, n_mfd_cells, NULL, 0, + regmap_irq_get_domain(chip->top_irq_data)); + if (ret < 0) { + dev_err(chip->dev, "Failed to add MFD children: %d\n", ret); + return ret; + } + + return 0; +} + +#ifdef CONFIG_PM_SLEEP +static int max77620_set_fps_period(struct max77620_chip *chip, + int fps_id, int time_period) +{ + int period = max77620_get_fps_period_reg_value(chip, time_period); + int ret; + + ret = regmap_update_bits(chip->rmap, MAX77620_REG_FPS_CFG0 +
fps_id, + MAX77620_FPS_TIME_PERIOD_MASK, + period << MAX77620_FPS_TIME_PERIOD_SHIFT); + if (ret < 0) { + dev_err(chip->dev, "Failed to update FPS period: %d\n", ret); + return ret; + } + + return 0; +} + +static int max77620_i2c_suspend(struct device *dev) +{ + struct max77620_chip *chip = dev_get_drvdata(dev); + struct i2c_client *client = to_i2c_client(dev); + unsigned int config; + int fps; + int ret; + + for (fps = 0; fps < MAX77620_FPS_COUNT; fps++) { + if (chip->suspend_fps_period[fps] < 0) + continue; + + ret = max77620_set_fps_period(chip, fps, + chip->suspend_fps_period[fps]); + if (ret < 0) + return ret; + } + + /* + * For MAX20024: No need to configure SLPEN on suspend as + * it will be configured on Init. + */ + if (chip->chip_id == MAX20024) + goto out; + + config = (chip->sleep_enable) ? MAX77620_ONOFFCNFG1_SLPEN : 0; + ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG1, + MAX77620_ONOFFCNFG1_SLPEN, + config); + if (ret < 0) { + dev_err(dev, "Failed to configure sleep in suspend: %d\n", ret); + return ret; + } + + /* Disable WK_EN0 */ + ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2, + MAX77620_ONOFFCNFG2_WK_EN0, 0); + if (ret < 0) { + dev_err(dev, "Failed to configure WK_EN in suspend: %d\n", ret); + return ret; + } + +out: + disable_irq(client->irq); + + return 0; +} + +static int max77620_i2c_resume(struct device *dev) +{ + struct max77620_chip *chip = dev_get_drvdata(dev); + struct i2c_client *client = to_i2c_client(dev); + int ret; + int fps; + + for (fps = 0; fps < MAX77620_FPS_COUNT; fps++) { + if (chip->shutdown_fps_period[fps] < 0) + continue; + + ret = max77620_set_fps_period(chip, fps, + chip->shutdown_fps_period[fps]); + if (ret < 0) + return ret; + } + + /* + * For MAX20024: No need to configure WKEN0 on resume as + * it is configured on Init. + */ + if (chip->chip_id == MAX20024) + goto out; + + /* Enable WK_EN0 */ + ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2, + MAX77620_ONOFFCNFG2_WK_EN0, + MAX77620_ONOFFCNFG2_WK_EN0); + if (ret < 0) { + dev_err(dev, "Failed to configure WK_EN0 on resume: %d\n", ret); + return ret; + } + +out: + enable_irq(client->irq); + + return 0; +} +#endif + +static const struct i2c_device_id max77620_id[] = { + {"max77620", MAX77620}, + {"max20024", MAX20024}, + {}, +}; +MODULE_DEVICE_TABLE(i2c, max77620_id); + +static const struct dev_pm_ops max77620_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(max77620_i2c_suspend, max77620_i2c_resume) +}; + +static struct i2c_driver max77620_driver = { + .driver = { + .name = "max77620", + .pm = &max77620_pm_ops, + }, + .probe = max77620_probe, + .id_table = max77620_id, +}; + +module_i2c_driver(max77620_driver); + +MODULE_DESCRIPTION("MAX77620/MAX20024 Multi Function Device Core Driver"); +MODULE_AUTHOR("Laxman Dewangan "); +MODULE_AUTHOR("Chaitanya Bandi "); +MODULE_AUTHOR("Mallikarjun Kasoju "); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mfd/max77620.h b/include/linux/mfd/max77620.h new file mode 100644 index 000000000000..3ca0af07fc78 --- /dev/null +++ b/include/linux/mfd/max77620.h @@ -0,0 +1,346 @@ +/* + * Register addresses and their bit definitions for MAX77620 and MAX20024 + * + * Copyright (C) 2016 NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation.
+ */ + +#ifndef _MFD_MAX77620_H_ +#define _MFD_MAX77620_H_ + +#include + +/* GLOBAL, PMIC, GPIO, FPS, ONOFFC, CID Registers */ +#define MAX77620_REG_CNFGGLBL1 0x00 +#define MAX77620_REG_CNFGGLBL2 0x01 +#define MAX77620_REG_CNFGGLBL3 0x02 +#define MAX77620_REG_CNFG1_32K 0x03 +#define MAX77620_REG_CNFGBBC 0x04 +#define MAX77620_REG_IRQTOP 0x05 +#define MAX77620_REG_INTLBT 0x06 +#define MAX77620_REG_IRQSD 0x07 +#define MAX77620_REG_IRQ_LVL2_L0_7 0x08 +#define MAX77620_REG_IRQ_LVL2_L8 0x09 +#define MAX77620_REG_IRQ_LVL2_GPIO 0x0A +#define MAX77620_REG_ONOFFIRQ 0x0B +#define MAX77620_REG_NVERC 0x0C +#define MAX77620_REG_IRQTOPM 0x0D +#define MAX77620_REG_INTENLBT 0x0E +#define MAX77620_REG_IRQMASKSD 0x0F +#define MAX77620_REG_IRQ_MSK_L0_7 0x10 +#define MAX77620_REG_IRQ_MSK_L8 0x11 +#define MAX77620_REG_ONOFFIRQM 0x12 +#define MAX77620_REG_STATLBT 0x13 +#define MAX77620_REG_STATSD 0x14 +#define MAX77620_REG_ONOFFSTAT 0x15 + +/* SD and LDO Registers */ +#define MAX77620_REG_SD0 0x16 +#define MAX77620_REG_SD1 0x17 +#define MAX77620_REG_SD2 0x18 +#define MAX77620_REG_SD3 0x19 +#define MAX77620_REG_SD4 0x1A +#define MAX77620_REG_DVSSD0 0x1B +#define MAX77620_REG_DVSSD1 0x1C +#define MAX77620_REG_SD0_CFG 0x1D +#define MAX77620_REG_SD1_CFG 0x1E +#define MAX77620_REG_SD2_CFG 0x1F +#define MAX77620_REG_SD3_CFG 0x20 +#define MAX77620_REG_SD4_CFG 0x21 +#define MAX77620_REG_SD_CFG2 0x22 +#define MAX77620_REG_LDO0_CFG 0x23 +#define MAX77620_REG_LDO0_CFG2 0x24 +#define MAX77620_REG_LDO1_CFG 0x25 +#define MAX77620_REG_LDO1_CFG2 0x26 +#define MAX77620_REG_LDO2_CFG 0x27 +#define MAX77620_REG_LDO2_CFG2 0x28 +#define MAX77620_REG_LDO3_CFG 0x29 +#define MAX77620_REG_LDO3_CFG2 0x2A +#define MAX77620_REG_LDO4_CFG 0x2B +#define MAX77620_REG_LDO4_CFG2 0x2C +#define MAX77620_REG_LDO5_CFG 0x2D +#define MAX77620_REG_LDO5_CFG2 0x2E +#define MAX77620_REG_LDO6_CFG 0x2F +#define MAX77620_REG_LDO6_CFG2 0x30 +#define MAX77620_REG_LDO7_CFG 0x31 +#define MAX77620_REG_LDO7_CFG2 0x32 +#define MAX77620_REG_LDO8_CFG 0x33 +#define MAX77620_REG_LDO8_CFG2 0x34 +#define MAX77620_REG_LDO_CFG3 0x35 + +#define MAX77620_LDO_SLEW_RATE_MASK 0x1 + +/* LDO Configuration 3 */ +#define MAX77620_TRACK4_MASK BIT(5) +#define MAX77620_TRACK4_SHIFT 5 + +/* Voltage */ +#define MAX77620_SDX_VOLT_MASK 0xFF +#define MAX77620_SD0_VOLT_MASK 0x3F +#define MAX77620_SD1_VOLT_MASK 0x7F +#define MAX77620_LDO_VOLT_MASK 0x3F + +#define MAX77620_REG_GPIO0 0x36 +#define MAX77620_REG_GPIO1 0x37 +#define MAX77620_REG_GPIO2 0x38 +#define MAX77620_REG_GPIO3 0x39 +#define MAX77620_REG_GPIO4 0x3A +#define MAX77620_REG_GPIO5 0x3B +#define MAX77620_REG_GPIO6 0x3C +#define MAX77620_REG_GPIO7 0x3D +#define MAX77620_REG_PUE_GPIO 0x3E +#define MAX77620_REG_PDE_GPIO 0x3F +#define MAX77620_REG_AME_GPIO 0x40 +#define MAX77620_REG_ONOFFCNFG1 0x41 +#define MAX77620_REG_ONOFFCNFG2 0x42 + +/* FPS Registers */ +#define MAX77620_REG_FPS_CFG0 0x43 +#define MAX77620_REG_FPS_CFG1 0x44 +#define MAX77620_REG_FPS_CFG2 0x45 +#define MAX77620_REG_FPS_LDO0 0x46 +#define MAX77620_REG_FPS_LDO1 0x47 +#define MAX77620_REG_FPS_LDO2 0x48 +#define MAX77620_REG_FPS_LDO3 0x49 +#define MAX77620_REG_FPS_LDO4 0x4A +#define MAX77620_REG_FPS_LDO5 0x4B +#define MAX77620_REG_FPS_LDO6 0x4C +#define MAX77620_REG_FPS_LDO7 0x4D +#define MAX77620_REG_FPS_LDO8 0x4E +#define MAX77620_REG_FPS_SD0 0x4F +#define MAX77620_REG_FPS_SD1 0x50 +#define MAX77620_REG_FPS_SD2 0x51 +#define MAX77620_REG_FPS_SD3 0x52 +#define MAX77620_REG_FPS_SD4 0x53 +#define MAX77620_REG_FPS_NONE 0 + +#define MAX77620_FPS_SRC_MASK 0xC0 +#define 
MAX77620_FPS_SRC_SHIFT 6 +#define MAX77620_FPS_PU_PERIOD_MASK 0x38 +#define MAX77620_FPS_PU_PERIOD_SHIFT 3 +#define MAX77620_FPS_PD_PERIOD_MASK 0x07 +#define MAX77620_FPS_PD_PERIOD_SHIFT 0 +#define MAX77620_FPS_TIME_PERIOD_MASK 0x38 +#define MAX77620_FPS_TIME_PERIOD_SHIFT 3 +#define MAX77620_FPS_EN_SRC_MASK 0x06 +#define MAX77620_FPS_EN_SRC_SHIFT 1 +#define MAX77620_FPS_ENFPS_SW_MASK 0x01 +#define MAX77620_FPS_ENFPS_SW 0x01 + +/* Minimum and maximum FPS period time (in microseconds) are + * different for MAX77620 and MAX20024. + */ +#define MAX77620_FPS_PERIOD_MIN_US 40 +#define MAX20024_FPS_PERIOD_MIN_US 20 + +#define MAX77620_FPS_PERIOD_MAX_US 2560 +#define MAX20024_FPS_PERIOD_MAX_US 5120 + +#define MAX77620_REG_FPS_GPIO1 0x54 +#define MAX77620_REG_FPS_GPIO2 0x55 +#define MAX77620_REG_FPS_GPIO3 0x56 +#define MAX77620_REG_FPS_RSO 0x57 +#define MAX77620_REG_CID0 0x58 +#define MAX77620_REG_CID1 0x59 +#define MAX77620_REG_CID2 0x5A +#define MAX77620_REG_CID3 0x5B +#define MAX77620_REG_CID4 0x5C +#define MAX77620_REG_CID5 0x5D + +#define MAX77620_REG_DVSSD4 0x5E +#define MAX20024_REG_MAX_ADD 0x70 + +#define MAX77620_CID_DIDM_MASK 0xF0 +#define MAX77620_CID_DIDM_SHIFT 4 + +/* CNCG2SD */ +#define MAX77620_SD_CNF2_ROVS_EN_SD1 BIT(1) +#define MAX77620_SD_CNF2_ROVS_EN_SD0 BIT(2) + +/* Device Identification Metal */ +#define MAX77620_CID5_DIDM(n) (((n) >> 4) & 0xF) +/* Device Identification OTP */ +#define MAX77620_CID5_DIDO(n) ((n) & 0xF) + +/* SD CNFG1 */ +#define MAX77620_SD_SR_MASK 0xC0 +#define MAX77620_SD_SR_SHIFT 6 +#define MAX77620_SD_POWER_MODE_MASK 0x30 +#define MAX77620_SD_POWER_MODE_SHIFT 4 +#define MAX77620_SD_CFG1_ADE_MASK BIT(3) +#define MAX77620_SD_CFG1_ADE_DISABLE 0 +#define MAX77620_SD_CFG1_ADE_ENABLE BIT(3) +#define MAX77620_SD_FPWM_MASK 0x04 +#define MAX77620_SD_FPWM_SHIFT 2 +#define MAX77620_SD_FSRADE_MASK 0x01 +#define MAX77620_SD_FSRADE_SHIFT 0 +#define MAX77620_SD_CFG1_FPWM_SD_MASK BIT(2) +#define MAX77620_SD_CFG1_FPWM_SD_SKIP 0 +#define MAX77620_SD_CFG1_FPWM_SD_FPWM BIT(2) +#define MAX77620_SD_CFG1_FSRADE_SD_MASK BIT(0) +#define MAX77620_SD_CFG1_FSRADE_SD_DISABLE 0 +#define MAX77620_SD_CFG1_FSRADE_SD_ENABLE BIT(0) + +/* LDO_CNFG2 */ +#define MAX77620_LDO_POWER_MODE_MASK 0xC0 +#define MAX77620_LDO_POWER_MODE_SHIFT 6 +#define MAX77620_LDO_CFG2_ADE_MASK BIT(1) +#define MAX77620_LDO_CFG2_ADE_DISABLE 0 +#define MAX77620_LDO_CFG2_ADE_ENABLE BIT(1) +#define MAX77620_LDO_CFG2_SS_MASK BIT(0) +#define MAX77620_LDO_CFG2_SS_FAST BIT(0) +#define MAX77620_LDO_CFG2_SS_SLOW 0 + +#define MAX77620_IRQ_TOP_GLBL_MASK BIT(7) +#define MAX77620_IRQ_TOP_SD_MASK BIT(6) +#define MAX77620_IRQ_TOP_LDO_MASK BIT(5) +#define MAX77620_IRQ_TOP_GPIO_MASK BIT(4) +#define MAX77620_IRQ_TOP_RTC_MASK BIT(3) +#define MAX77620_IRQ_TOP_32K_MASK BIT(2) +#define MAX77620_IRQ_TOP_ONOFF_MASK BIT(1) + +#define MAX77620_IRQ_LBM_MASK BIT(3) +#define MAX77620_IRQ_TJALRM1_MASK BIT(2) +#define MAX77620_IRQ_TJALRM2_MASK BIT(1) + +#define MAX77620_PWR_I2C_ADDR 0x3c +#define MAX77620_RTC_I2C_ADDR 0x68 + +#define MAX77620_CNFG_GPIO_DRV_MASK BIT(0) +#define MAX77620_CNFG_GPIO_DRV_PUSHPULL BIT(0) +#define MAX77620_CNFG_GPIO_DRV_OPENDRAIN 0 +#define MAX77620_CNFG_GPIO_DIR_MASK BIT(1) +#define MAX77620_CNFG_GPIO_DIR_INPUT BIT(1) +#define MAX77620_CNFG_GPIO_DIR_OUTPUT 0 +#define MAX77620_CNFG_GPIO_INPUT_VAL_MASK BIT(2) +#define MAX77620_CNFG_GPIO_OUTPUT_VAL_MASK BIT(3) +#define MAX77620_CNFG_GPIO_OUTPUT_VAL_HIGH BIT(3) +#define MAX77620_CNFG_GPIO_OUTPUT_VAL_LOW 0 +#define MAX77620_CNFG_GPIO_INT_MASK (0x3 << 4) +#define
MAX77620_CNFG_GPIO_INT_FALLING BIT(4) +#define MAX77620_CNFG_GPIO_INT_RISING BIT(5) +#define MAX77620_CNFG_GPIO_DBNC_MASK (0x3 << 6) +#define MAX77620_CNFG_GPIO_DBNC_None (0x0 << 6) +#define MAX77620_CNFG_GPIO_DBNC_8ms (0x1 << 6) +#define MAX77620_CNFG_GPIO_DBNC_16ms (0x2 << 6) +#define MAX77620_CNFG_GPIO_DBNC_32ms (0x3 << 6) + +#define MAX77620_IRQ_LVL2_GPIO_EDGE0 BIT(0) +#define MAX77620_IRQ_LVL2_GPIO_EDGE1 BIT(1) +#define MAX77620_IRQ_LVL2_GPIO_EDGE2 BIT(2) +#define MAX77620_IRQ_LVL2_GPIO_EDGE3 BIT(3) +#define MAX77620_IRQ_LVL2_GPIO_EDGE4 BIT(4) +#define MAX77620_IRQ_LVL2_GPIO_EDGE5 BIT(5) +#define MAX77620_IRQ_LVL2_GPIO_EDGE6 BIT(6) +#define MAX77620_IRQ_LVL2_GPIO_EDGE7 BIT(7) + +#define MAX77620_CNFG1_32K_OUT0_EN BIT(2) + +#define MAX77620_ONOFFCNFG1_SFT_RST BIT(7) +#define MAX77620_ONOFFCNFG1_MRT_MASK 0x38 +#define MAX77620_ONOFFCNFG1_MRT_SHIFT 0x3 +#define MAX77620_ONOFFCNFG1_SLPEN BIT(2) +#define MAX77620_ONOFFCNFG1_PWR_OFF BIT(1) +#define MAX20024_ONOFFCNFG1_CLRSE 0x18 + +#define MAX77620_ONOFFCNFG2_SFT_RST_WK BIT(7) +#define MAX77620_ONOFFCNFG2_WD_RST_WK BIT(6) +#define MAX77620_ONOFFCNFG2_SLP_LPM_MSK BIT(5) +#define MAX77620_ONOFFCNFG2_WK_ALARM1 BIT(2) +#define MAX77620_ONOFFCNFG2_WK_EN0 BIT(0) + +#define MAX77620_GLBLM_MASK BIT(0) + +#define MAX77620_WDTC_MASK 0x3 +#define MAX77620_WDTOFFC BIT(4) +#define MAX77620_WDTSLPC BIT(3) +#define MAX77620_WDTEN BIT(2) + +#define MAX77620_TWD_MASK 0x3 +#define MAX77620_TWD_2s 0x0 +#define MAX77620_TWD_16s 0x1 +#define MAX77620_TWD_64s 0x2 +#define MAX77620_TWD_128s 0x3 + +#define MAX77620_CNFGGLBL1_LBDAC_EN BIT(7) +#define MAX77620_CNFGGLBL1_MPPLD BIT(6) +#define MAX77620_CNFGGLBL1_LBHYST (BIT(5) | BIT(4)) +#define MAX77620_CNFGGLBL1_LBDAC 0x0E +#define MAX77620_CNFGGLBL1_LBRSTEN BIT(0) + +/* CNFG BBC registers */ +#define MAX77620_CNFGBBC_ENABLE BIT(0) +#define MAX77620_CNFGBBC_CURRENT_MASK 0x06 +#define MAX77620_CNFGBBC_CURRENT_SHIFT 1 +#define MAX77620_CNFGBBC_VOLTAGE_MASK 0x18 +#define MAX77620_CNFGBBC_VOLTAGE_SHIFT 3 +#define MAX77620_CNFGBBC_LOW_CURRENT_DISABLE BIT(5) +#define MAX77620_CNFGBBC_RESISTOR_MASK 0xC0 +#define MAX77620_CNFGBBC_RESISTOR_SHIFT 6 + +#define MAX77620_FPS_COUNT 3 + +/* Interrupts */ +enum { + MAX77620_IRQ_TOP_GLBL, /* Low-Battery */ + MAX77620_IRQ_TOP_SD, /* SD power fail */ + MAX77620_IRQ_TOP_LDO, /* LDO power fail */ + MAX77620_IRQ_TOP_GPIO, /* TOP GPIO internal int to MAX77620 */ + MAX77620_IRQ_TOP_RTC, /* RTC */ + MAX77620_IRQ_TOP_32K, /* 32kHz oscillator */ + MAX77620_IRQ_TOP_ONOFF, /* ON/OFF oscillator */ + MAX77620_IRQ_LBT_MBATLOW, /* Thermal alarm status, > 120C */ + MAX77620_IRQ_LBT_TJALRM1, /* Thermal alarm status, > 120C */ + MAX77620_IRQ_LBT_TJALRM2, /* Thermal alarm status, > 140C */ +}; + +/* GPIOs */ +enum { + MAX77620_GPIO0, + MAX77620_GPIO1, + MAX77620_GPIO2, + MAX77620_GPIO3, + MAX77620_GPIO4, + MAX77620_GPIO5, + MAX77620_GPIO6, + MAX77620_GPIO7, + MAX77620_GPIO_NR, +}; + +/* FPS Source */ +enum max77620_fps_src { + MAX77620_FPS_SRC_0, + MAX77620_FPS_SRC_1, + MAX77620_FPS_SRC_2, + MAX77620_FPS_SRC_NONE, + MAX77620_FPS_SRC_DEF, +}; + +enum max77620_chip_id { + MAX77620, + MAX20024, +}; + +struct max77620_chip { + struct device *dev; + struct regmap *rmap; + + int chip_irq; + int irq_base; + + /* chip id */ + enum max77620_chip_id chip_id; + + bool sleep_enable; + bool enable_global_lpm; + int shutdown_fps_period[MAX77620_FPS_COUNT]; + int suspend_fps_period[MAX77620_FPS_COUNT]; + + struct regmap_irq_chip_data *top_irq_data; + struct regmap_irq_chip_data *gpio_irq_data; +}; + +#endif /* 
_MFD_MAX77620_H_ */ -- cgit v1.2.3 From 85ad1d13ee9b3db00615ea24b031c15e5ba14fd1 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 3 May 2016 22:22:13 -0400 Subject: md: set MD_CHANGE_PENDING in an atomic region Some code waits for a metadata update by: 1. flagging that it is needed (MD_CHANGE_DEVS or MD_CHANGE_CLEAN) 2. setting MD_CHANGE_PENDING and waking the management thread 3. waiting for MD_CHANGE_PENDING to be cleared If the first two are done without locking, the code in md_update_sb() which checks if it needs to repeat might test if an update is needed before step 1, then clear MD_CHANGE_PENDING after step 2, resulting in the wait returning early. So make sure all places that set MD_CHANGE_PENDING are atomic, and bit_clear_unless (suggested by Neil) is introduced for the purpose. Cc: Martin Kepplinger Cc: Andrew Morton Cc: Denys Vlasenko Cc: Sasha Levin Cc: Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 27 ++++++++++++++------------- drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 8 ++++---- drivers/md/raid5-cache.c | 4 ++-- drivers/md/raid5.c | 4 ++-- include/linux/bitops.h | 16 ++++++++++++++++ 6 files changed, 40 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index 23c6d732a374..a79462dcd5e1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2295,12 +2295,16 @@ repeat: if (mddev_is_clustered(mddev)) { if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; + if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) + nospares = 1; ret = md_cluster_ops->metadata_update_start(mddev); /* Has someone else updated the sb? */ if (!does_sb_need_changing(mddev)) { if (ret == 0) md_cluster_ops->metadata_update_cancel(mddev); - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), + BIT(MD_CHANGE_DEVS) | + BIT(MD_CHANGE_CLEAN)); return; } } @@ -2434,15 +2438,11 @@ repeat: if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->metadata_update_finish(mddev); - spin_lock(&mddev->lock); if (mddev->in_sync != sync_req || - test_bit(MD_CHANGE_DEVS, &mddev->flags)) { + !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN))) /* have to write it out again */ - spin_unlock(&mddev->lock); goto repeat; - } - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - spin_unlock(&mddev->lock); wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); @@ -8147,18 +8147,18 @@ void md_do_sync(struct md_thread *thread) } } skip: - set_bit(MD_CHANGE_DEVS, &mddev->flags); - if (mddev_is_clustered(mddev) && ret == 0) { /* set CHANGE_PENDING here since maybe another * update is needed, so other nodes are informed */ - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_PENDING, &mddev->flags)); md_cluster_ops->resync_finish(mddev); - } + } else + set_bit(MD_CHANGE_DEVS, &mddev->flags); spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -8550,6 +8550,7 @@ EXPORT_SYMBOL(md_finish_reshape); int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { + struct mddev *mddev = rdev->mddev; int rv; if (is_new) s += rdev->new_data_offset; @@ -8559,8 +8560,8 @@ int rdev_set_badblocks(struct md_rdev *rdev,
sector_t s, int sectors, if (rv == 0) { /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); - set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); - set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING)); md_wakeup_thread(rdev->mddev->thread); return 1; } else diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a7f2b9c9f8a0..c7c8cde0ab21 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1474,8 +1474,8 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" "md/raid1:%s: Operation continuing on %d devices.\n", diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 84e24e648165..c7de2a53e625 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1102,8 +1102,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio) bio->bi_iter.bi_sector < conf->reshape_progress))) { /* Need to update reshape_position in metadata */ mddev->reshape_position = conf->reshape_progress; - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_PENDING, &mddev->flags)); @@ -1591,8 +1591,8 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); spin_unlock_irqrestore(&conf->device_lock, flags); printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 9531f5f05b93..ac51bc5ecb16 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -712,8 +712,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, * in_teardown check workaround this issue. 
*/ if (!log->in_teardown) { - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_PENDING, &mddev->flags) || diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4d31b235a888..8959e6dd31dd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2514,8 +2514,8 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" "md/raid:%s: Operation continuing on %d devices.\n", diff --git a/include/linux/bitops.h b/include/linux/bitops.h index defeaac0745f..299e76b59fe9 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -227,6 +227,22 @@ static inline unsigned long __ffs64(u64 word) }) #endif +#ifndef bit_clear_unless +#define bit_clear_unless(ptr, _clear, _test) \ +({ \ + const typeof(*ptr) clear = (_clear), test = (_test); \ + typeof(*ptr) old, new; \ + \ + do { \ + old = ACCESS_ONCE(*ptr); \ + new = old & ~clear; \ + } while (!(old & test) && \ + cmpxchg(ptr, old, new) != old); \ + \ + !(old & test); \ +}) +#endif + #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region -- cgit v1.2.3 From 661ce1f0c4a69f92ad781d8d2c205c90dd9c5833 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Apr 2016 12:45:45 +0200 Subject: libata/libsas: Define ATA_CMD_NCQ_NON_DATA Define the NCQ NON DATA command and update libsas to handle it correctly. 
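The hunks below repeat the same opcode check in four SAS drivers; a hedged sketch of the predicate they all encode (a hypothetical helper, not something this patch adds):

	static bool example_is_ncq_command(u8 cmd)
	{
		switch (cmd) {
		case ATA_CMD_FPDMA_READ:
		case ATA_CMD_FPDMA_WRITE:
		case ATA_CMD_FPDMA_SEND:
		case ATA_CMD_FPDMA_RECV:
		case ATA_CMD_NCQ_NON_DATA:
			return true;
		default:
			return false;
		}
	}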
Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/scsi/hisi_sas/hisi_sas_v2_hw.c | 1 + drivers/scsi/isci/request.c | 3 ++- drivers/scsi/libsas/sas_ata.c | 3 ++- drivers/scsi/mvsas/mv_sas.c | 3 ++- drivers/scsi/pm8001/pm8001_sas.c | 3 ++- include/linux/ata.h | 1 + include/trace/events/libata.h | 1 + 7 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index 860c9f847371..bd20c5488768 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -1632,6 +1632,7 @@ static u8 get_ata_protocol(u8 cmd, int direction) case ATA_CMD_FPDMA_READ: case ATA_CMD_FPDMA_RECV: case ATA_CMD_FPDMA_SEND: + case ATA_CMD_NCQ_NON_DATA: return SATA_PROTOCOL_FPDMA; case ATA_CMD_ID_ATA: diff --git a/drivers/scsi/isci/request.c b/drivers/scsi/isci/request.c index 29456e097a30..b709d2b20880 100644 --- a/drivers/scsi/isci/request.c +++ b/drivers/scsi/isci/request.c @@ -3171,7 +3171,8 @@ static enum sci_status isci_request_stp_request_construct(struct isci_request *i if (qc && (qc->tf.command == ATA_CMD_FPDMA_WRITE || qc->tf.command == ATA_CMD_FPDMA_READ || qc->tf.command == ATA_CMD_FPDMA_RECV || - qc->tf.command == ATA_CMD_FPDMA_SEND)) { + qc->tf.command == ATA_CMD_FPDMA_SEND || + qc->tf.command == ATA_CMD_NCQ_NON_DATA)) { fis->sector_count = qc->tag << 3; ireq->tc->type.stp.ncq_tag = qc->tag; } diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index fe1cd2691748..935c43095109 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -207,7 +207,8 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc) if (qc->tf.command == ATA_CMD_FPDMA_WRITE || qc->tf.command == ATA_CMD_FPDMA_READ || qc->tf.command == ATA_CMD_FPDMA_RECV || - qc->tf.command == ATA_CMD_FPDMA_SEND) { + qc->tf.command == ATA_CMD_FPDMA_SEND || + qc->tf.command == ATA_CMD_NCQ_NON_DATA) { /* Need to zero out the tag libata assigned us */ qc->tf.nsect = 0; } diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c index db3714964c0a..5b9fcff6cd94 100644 --- a/drivers/scsi/mvsas/mv_sas.c +++ b/drivers/scsi/mvsas/mv_sas.c @@ -431,7 +431,8 @@ static u32 mvs_get_ncq_tag(struct sas_task *task, u32 *tag) if (qc->tf.command == ATA_CMD_FPDMA_WRITE || qc->tf.command == ATA_CMD_FPDMA_READ || qc->tf.command == ATA_CMD_FPDMA_RECV || - qc->tf.command == ATA_CMD_FPDMA_SEND) { + qc->tf.command == ATA_CMD_FPDMA_SEND || + qc->tf.command == ATA_CMD_NCQ_NON_DATA) { *tag = qc->tag; return 1; } diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index 62abd9896625..dc33dfa8f994 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -282,7 +282,8 @@ u32 pm8001_get_ncq_tag(struct sas_task *task, u32 *tag) if (qc->tf.command == ATA_CMD_FPDMA_WRITE || qc->tf.command == ATA_CMD_FPDMA_READ || qc->tf.command == ATA_CMD_FPDMA_RECV || - qc->tf.command == ATA_CMD_FPDMA_SEND) { + qc->tf.command == ATA_CMD_FPDMA_SEND || + qc->tf.command == ATA_CMD_NCQ_NON_DATA) { *tag = qc->tag; return 1; } diff --git a/include/linux/ata.h b/include/linux/ata.h index 00aebc4c83ad..b84210a28a00 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -243,6 +243,7 @@ enum { ATA_CMD_WRITE_QUEUED_FUA_EXT = 0x3E, ATA_CMD_FPDMA_READ = 0x60, ATA_CMD_FPDMA_WRITE = 0x61, + ATA_CMD_NCQ_NON_DATA = 0x63, ATA_CMD_FPDMA_SEND = 0x64, ATA_CMD_FPDMA_RECV = 0x65, ATA_CMD_PIO_READ = 0x20, diff --git 
a/include/trace/events/libata.h b/include/trace/events/libata.h index 8b0fbd93082c..016860320f6f 100644 --- a/include/trace/events/libata.h +++ b/include/trace/events/libata.h @@ -39,6 +39,7 @@ ata_opcode_name(ATA_CMD_WRITE_QUEUED_FUA_EXT), \ ata_opcode_name(ATA_CMD_FPDMA_READ), \ ata_opcode_name(ATA_CMD_FPDMA_WRITE), \ + ata_opcode_name(ATA_CMD_NCQ_NON_DATA), \ ata_opcode_name(ATA_CMD_FPDMA_SEND), \ ata_opcode_name(ATA_CMD_FPDMA_RECV), \ ata_opcode_name(ATA_CMD_PIO_READ), \ -- cgit v1.2.3 From 5c65d8bb3503beb12864895426a69269c19e6e87 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Apr 2016 12:45:47 +0200 Subject: libata: Add command definitions for NCQ Encapsulation for READ LOG DMA EXT ACS-4 defines an NCQ encapsulation for READ LOG DMA EXT. Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- include/linux/ata.h | 5 +++++ include/linux/libata.h | 7 +++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index b84210a28a00..94ccde5ee83c 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -306,6 +306,9 @@ enum { /* marked obsolete in the ATA/ATAPI-7 spec */ ATA_CMD_RESTORE = 0x10, + /* Subcmds for ATA_CMD_FPDMA_RECV */ + ATA_SUBCMD_FPDMA_RECV_RD_LOG_DMA_EXT = 0x01, + /* Subcmds for ATA_CMD_FPDMA_SEND */ ATA_SUBCMD_FPDMA_SEND_DSM = 0x00, ATA_SUBCMD_FPDMA_SEND_WR_LOG_DMA_EXT = 0x02, @@ -329,7 +332,9 @@ enum { ATA_LOG_NCQ_SEND_RECV_DSM_OFFSET = 0x04, ATA_LOG_NCQ_SEND_RECV_DSM_TRIM = (1 << 0), ATA_LOG_NCQ_SEND_RECV_RD_LOG_OFFSET = 0x08, + ATA_LOG_NCQ_SEND_RECV_RD_LOG_SUPPORTED = (1 << 0), ATA_LOG_NCQ_SEND_RECV_WR_LOG_OFFSET = 0x0C, + ATA_LOG_NCQ_SEND_RECV_WR_LOG_SUPPORTED = (1 << 0), ATA_LOG_NCQ_SEND_RECV_SIZE = 0x10, /* READ/WRITE LONG (obsolete) */ diff --git a/include/linux/libata.h b/include/linux/libata.h index a418bca0df0d..09ddb5a6f555 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1642,6 +1642,13 @@ static inline bool ata_fpdma_dsm_supported(struct ata_device *dev) ATA_LOG_NCQ_SEND_RECV_DSM_TRIM); } +static inline bool ata_fpdma_read_log_supported(struct ata_device *dev) +{ + return (dev->flags & ATA_DFLAG_NCQ_SEND_RECV) && + (dev->ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_RD_LOG_OFFSET] & + ATA_LOG_NCQ_SEND_RECV_RD_LOG_SUPPORTED); +} + static inline void ata_qc_set_polling(struct ata_queued_cmd *qc) { qc->tf.ctl |= ATA_NIEN; -- cgit v1.2.3 From fe5af0cc3029d52e31d282f5d53787d308e9695a Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Apr 2016 12:45:48 +0200 Subject: libata: Check log page directory before accessing pages When reading the NCQ Send/Recv log it might actually not be supported, thereby causing irritating 'READ LOG DMA EXT failed' messages. Instead we should read the log directory first to figure out whether the log is actually supported before trying to access it.
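To make the pattern explicit, it can be condensed into a small predicate (a sketch only, reusing ata_read_log_page() and the per-port sector_buf exactly as the diff below does; later kernels added a very similar ata_log_supported() helper):

static bool ata_log_supported(struct ata_device *dev, u8 log)
{
	struct ata_port *ap = dev->link->ap;

	/*
	 * Log address 0 is the General Purpose Log Directory. Entry N is
	 * a 16-bit little-endian page count for log N at byte offset N * 2;
	 * a count of zero means that log is not implemented.
	 */
	if (ata_read_log_page(dev, ATA_LOG_DIRECTORY, 0, ap->sector_buf, 1))
		return false;
	return get_unaligned_le16(&ap->sector_buf[log * 2]) != 0;
}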
Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 17 +++++++++++++++++ include/linux/ata.h | 1 + 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index fa74b57ee52e..b2bd7c499653 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -2083,7 +2084,23 @@ static void ata_dev_config_ncq_send_recv(struct ata_device *dev) { struct ata_port *ap = dev->link->ap; unsigned int err_mask; + int log_index = ATA_LOG_NCQ_SEND_RECV * 2; + u16 log_pages; + err_mask = ata_read_log_page(dev, ATA_LOG_DIRECTORY, + 0, ap->sector_buf, 1); + if (err_mask) { + ata_dev_dbg(dev, + "failed to get Log Directory Emask 0x%x\n", + err_mask); + return; + } + log_pages = get_unaligned_le16(&ap->sector_buf[log_index]); + if (!log_pages) { + ata_dev_warn(dev, + "NCQ Send/Recv Log not supported\n"); + return; + } err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_SEND_RECV, 0, ap->sector_buf, 1); if (err_mask) { diff --git a/include/linux/ata.h b/include/linux/ata.h index 94ccde5ee83c..b5be5e85d2d3 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -314,6 +314,7 @@ enum { ATA_SUBCMD_FPDMA_SEND_WR_LOG_DMA_EXT = 0x02, /* READ_LOG_EXT pages */ + ATA_LOG_DIRECTORY = 0x0, ATA_LOG_SATA_NCQ = 0x10, ATA_LOG_NCQ_SEND_RECV = 0x13, ATA_LOG_SATA_ID_DEV_DATA = 0x30, -- cgit v1.2.3 From a57038496422d7d21b7e41ed70d63bf0c6ff6068 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Apr 2016 12:45:49 +0200 Subject: libata-trace: decode subcommands Some commands like FPDMA RECEIVE or NCQ NON DATA can encapsulate other commands to NCQ transport. So decode the subcmds, too. Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-trace.c | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/ata.h | 17 +++++++++++++++++ include/trace/events/libata.h | 7 ++++++- 3 files changed, 66 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-trace.c b/drivers/ata/libata-trace.c index fd30b8c10cf5..99ec1e8cb95d 100644 --- a/drivers/ata/libata-trace.c +++ b/drivers/ata/libata-trace.c @@ -149,3 +149,46 @@ libata_trace_parse_qc_flags(struct trace_seq *p, unsigned int qc_flags) return ret; } + +const char * +libata_trace_parse_subcmd(struct trace_seq *p, unsigned char cmd, + unsigned char feature, unsigned char hob_nsect) +{ + const char *ret = trace_seq_buffer_ptr(p); + + switch (cmd) { + case ATA_CMD_FPDMA_RECV: + switch (hob_nsect & 0x5f) { + case ATA_SUBCMD_FPDMA_RECV_RD_LOG_DMA_EXT: + trace_seq_printf(p, " READ_LOG_DMA_EXT"); + break; + } + break; + case ATA_CMD_FPDMA_SEND: + switch (hob_nsect & 0x5f) { + case ATA_SUBCMD_FPDMA_SEND_WR_LOG_DMA_EXT: + trace_seq_printf(p, " WRITE_LOG_DMA_EXT"); + break; + case ATA_SUBCMD_FPDMA_SEND_DSM: + trace_seq_printf(p, " DATASET_MANAGEMENT"); + break; + } + break; + case ATA_CMD_NCQ_NON_DATA: + switch (feature) { + case ATA_SUBCMD_NCQ_NON_DATA_ABORT_QUEUE: + trace_seq_printf(p, " ABORT_QUEUE"); + break; + case ATA_SUBCMD_NCQ_NON_DATA_SET_FEATURES: + trace_seq_printf(p, " SET_FEATURES"); + break; + case ATA_SUBCMD_NCQ_NON_DATA_ZERO_EXT: + trace_seq_printf(p, " ZERO_EXT"); + break; + } + break; + } + trace_seq_putc(p, 0); + + return ret; +} diff --git a/include/linux/ata.h b/include/linux/ata.h index b5be5e85d2d3..032bb223cd8c 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -313,6 +313,11 @@ enum { 
ATA_SUBCMD_FPDMA_SEND_DSM = 0x00, ATA_SUBCMD_FPDMA_SEND_WR_LOG_DMA_EXT = 0x02, + /* Subcmds for ATA_CMD_NCQ_NON_DATA */ + ATA_SUBCMD_NCQ_NON_DATA_ABORT_QUEUE = 0x00, + ATA_SUBCMD_NCQ_NON_DATA_SET_FEATURES = 0x05, + ATA_SUBCMD_NCQ_NON_DATA_ZERO_EXT = 0x06, + /* READ_LOG_EXT pages */ ATA_LOG_DIRECTORY = 0x0, ATA_LOG_SATA_NCQ = 0x10, @@ -338,6 +343,18 @@ enum { ATA_LOG_NCQ_SEND_RECV_WR_LOG_SUPPORTED = (1 << 0), ATA_LOG_NCQ_SEND_RECV_SIZE = 0x10, + /* NCQ Non-Data log */ + ATA_LOG_NCQ_NON_DATA_SUBCMDS_OFFSET = 0x00, + ATA_LOG_NCQ_NON_DATA_ABORT_OFFSET = 0x00, + ATA_LOG_NCQ_NON_DATA_ABORT_NCQ = (1 << 0), + ATA_LOG_NCQ_NON_DATA_ABORT_ALL = (1 << 1), + ATA_LOG_NCQ_NON_DATA_ABORT_STREAMING = (1 << 2), + ATA_LOG_NCQ_NON_DATA_ABORT_NON_STREAMING = (1 << 3), + ATA_LOG_NCQ_NON_DATA_ABORT_SELECTED = (1 << 4), + ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OFFSET = 0x1C, + ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OUT = (1 << 0), + ATA_LOG_NCQ_NON_DATA_SIZE = 0x40, + /* READ/WRITE LONG (obsolete) */ ATA_CMD_READ_LONG = 0x22, ATA_CMD_READ_LONG_ONCE = 0x23, diff --git a/include/trace/events/libata.h b/include/trace/events/libata.h index 016860320f6f..8e77572350f0 100644 --- a/include/trace/events/libata.h +++ b/include/trace/events/libata.h @@ -140,6 +140,10 @@ const char *libata_trace_parse_eh_err_mask(struct trace_seq *, unsigned int); const char *libata_trace_parse_qc_flags(struct trace_seq *, unsigned int); #define __parse_qc_flags(f) libata_trace_parse_qc_flags(p, f) +const char *libata_trace_parse_subcmd(struct trace_seq *, unsigned char, + unsigned char, unsigned char); +#define __parse_subcmd(c,f,h) libata_trace_parse_subcmd(p, c, f, h) + TRACE_EVENT(ata_qc_issue, TP_PROTO(struct ata_queued_cmd *qc), @@ -186,11 +190,12 @@ TRACE_EVENT(ata_qc_issue, __entry->hob_nsect = qc->tf.hob_nsect; ), - TP_printk("ata_port=%u ata_dev=%u tag=%d proto=%s cmd=%s " \ + TP_printk("ata_port=%u ata_dev=%u tag=%d proto=%s cmd=%s%s " \ " tf=(%02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x)", __entry->ata_port, __entry->ata_dev, __entry->tag, show_protocol_name(__entry->proto), show_opcode_name(__entry->cmd), + __parse_subcmd(__entry->cmd, __entry->feature, __entry->hob_nsect), __entry->cmd, __entry->feature, __entry->nsect, __entry->lbal, __entry->lbam, __entry->lbah, __entry->hob_feature, __entry->hob_nsect, -- cgit v1.2.3 From 3a92945b24c7ff46757a3d5d5112bfc62d2e45b2 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Apr 2016 12:45:51 +0200 Subject: libata: fixup ZAC device disabling libata device disabling is ... curious. So add the correct definitions so that we can disable ZAC devices properly.
Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- include/linux/libata.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 09ddb5a6f555..92297cd111f6 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -192,7 +192,8 @@ enum { ATA_DEV_SEMB = 7, /* SEMB */ ATA_DEV_SEMB_UNSUP = 8, /* SEMB (unsupported) */ ATA_DEV_ZAC = 9, /* ZAC device */ - ATA_DEV_NONE = 10, /* no device */ + ATA_DEV_ZAC_UNSUP = 10, /* ZAC device (unsupported) */ + ATA_DEV_NONE = 11, /* no device */ /* struct ata_link flags */ ATA_LFLAG_NO_HRST = (1 << 1), /* avoid hardreset */ @@ -1524,7 +1525,8 @@ static inline unsigned int ata_class_enabled(unsigned int class) static inline unsigned int ata_class_disabled(unsigned int class) { return class == ATA_DEV_ATA_UNSUP || class == ATA_DEV_ATAPI_UNSUP || - class == ATA_DEV_PMP_UNSUP || class == ATA_DEV_SEMB_UNSUP; + class == ATA_DEV_PMP_UNSUP || class == ATA_DEV_SEMB_UNSUP || + class == ATA_DEV_ZAC_UNSUP; } static inline unsigned int ata_class_absent(unsigned int class) -- cgit v1.2.3 From 28a3fc2295a744a0d2ddf86b2ccdf03fbab123f9 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Apr 2016 12:45:52 +0200 Subject: libata: implement ZBC IN translation ZAC drives implement a 'ZAC Management In' command template, which maps onto the ZBC IN command. Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-eh.c | 1 + drivers/ata/libata-scsi.c | 157 ++++++++++++++++++++++++++++++++++++++++++ drivers/ata/libata-trace.c | 10 +++ include/linux/ata.h | 10 ++- include/linux/libata.h | 7 ++ include/trace/events/libata.h | 1 + 6 files changed, 185 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index e81661981a09..ee6c572d2a4a 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -2482,6 +2482,7 @@ const char *ata_get_cmd_descript(u8 command) { ATA_CMD_CFA_WRITE_MULT_NE, "CFA WRITE MULTIPLE WITHOUT ERASE" }, { ATA_CMD_REQ_SENSE_DATA, "REQUEST SENSE DATA EXT" }, { ATA_CMD_SANITIZE_DEVICE, "SANITIZE DEVICE" }, + { ATA_CMD_ZAC_MGMT_IN, "ZAC MANAGEMENT IN" }, { ATA_CMD_READ_LONG, "READ LONG (with retries)" }, { ATA_CMD_READ_LONG_ONCE, "READ LONG (without retries)" }, { ATA_CMD_WRITE_LONG, "WRITE LONG (with retries)" }, diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 6d78b4b422a4..06d5a62f507d 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -3317,6 +3317,160 @@ invalid_opcode: return 1; } +/** + * ata_scsi_report_zones_complete - convert ATA output + * @qc: command structure returning the data + * + * Convert T-13 little-endian field representation into + * T-10 big-endian field representation. + * What a mess. 
+ */ +static void ata_scsi_report_zones_complete(struct ata_queued_cmd *qc) +{ + struct scsi_cmnd *scmd = qc->scsicmd; + struct sg_mapping_iter miter; + unsigned long flags; + unsigned int bytes = 0; + + sg_miter_start(&miter, scsi_sglist(scmd), scsi_sg_count(scmd), + SG_MITER_TO_SG | SG_MITER_ATOMIC); + + local_irq_save(flags); + while (sg_miter_next(&miter)) { + unsigned int offset = 0; + + if (bytes == 0) { + char *hdr; + u32 list_length; + u64 max_lba, opt_lba; + u16 same; + + /* Swizzle header */ + hdr = miter.addr; + list_length = get_unaligned_le32(&hdr[0]); + same = get_unaligned_le16(&hdr[4]); + max_lba = get_unaligned_le64(&hdr[8]); + opt_lba = get_unaligned_le64(&hdr[16]); + put_unaligned_be32(list_length, &hdr[0]); + hdr[4] = same & 0xf; + put_unaligned_be64(max_lba, &hdr[8]); + put_unaligned_be64(opt_lba, &hdr[16]); + offset += 64; + bytes += 64; + } + while (offset < miter.length) { + char *rec; + u8 cond, type, non_seq, reset; + u64 size, start, wp; + + /* Swizzle zone descriptor */ + rec = miter.addr + offset; + type = rec[0] & 0xf; + cond = (rec[1] >> 4) & 0xf; + non_seq = (rec[1] & 2); + reset = (rec[1] & 1); + size = get_unaligned_le64(&rec[8]); + start = get_unaligned_le64(&rec[16]); + wp = get_unaligned_le64(&rec[24]); + rec[0] = type; + rec[1] = (cond << 4) | non_seq | reset; + put_unaligned_be64(size, &rec[8]); + put_unaligned_be64(start, &rec[16]); + put_unaligned_be64(wp, &rec[24]); + WARN_ON(offset + 64 > miter.length); + offset += 64; + bytes += 64; + } + } + sg_miter_stop(&miter); + local_irq_restore(flags); + + ata_scsi_qc_complete(qc); +} + +static unsigned int ata_scsi_zbc_in_xlat(struct ata_queued_cmd *qc) +{ + struct ata_taskfile *tf = &qc->tf; + struct scsi_cmnd *scmd = qc->scsicmd; + const u8 *cdb = scmd->cmnd; + u16 sect, fp = (u16)-1; + u8 sa, options, bp = 0xff; + u64 block; + u32 n_block; + + if (unlikely(scmd->cmd_len < 16)) { + ata_dev_warn(qc->dev, "invalid cdb length %d\n", + scmd->cmd_len); + fp = 15; + goto invalid_fld; + } + scsi_16_lba_len(cdb, &block, &n_block); + if (n_block != scsi_bufflen(scmd)) { + ata_dev_warn(qc->dev, "non-matching transfer count (%d/%d)\n", + n_block, scsi_bufflen(scmd)); + goto invalid_param_len; + } + sa = cdb[1] & 0x1f; + if (sa != ZI_REPORT_ZONES) { + ata_dev_warn(qc->dev, "invalid service action %d\n", sa); + fp = 1; + goto invalid_fld; + } + /* + * ZAC allows only for transfers in 512 byte blocks, + * and uses a 16 bit value for the transfer count. 
+ */ + if ((n_block / 512) > 0xffff || n_block < 512 || (n_block % 512)) { + ata_dev_warn(qc->dev, "invalid transfer count %d\n", n_block); + goto invalid_param_len; + } + sect = n_block / 512; + options = cdb[14]; + + if (ata_ncq_enabled(qc->dev) && + ata_fpdma_zac_mgmt_in_supported(qc->dev)) { + tf->protocol = ATA_PROT_NCQ; + tf->command = ATA_CMD_FPDMA_RECV; + tf->hob_nsect = ATA_SUBCMD_FPDMA_RECV_ZAC_MGMT_IN & 0x1f; + tf->nsect = qc->tag << 3; + tf->feature = sect & 0xff; + tf->hob_feature = (sect >> 8) & 0xff; + tf->auxiliary = ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES; + } else { + tf->command = ATA_CMD_ZAC_MGMT_IN; + tf->feature = ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES; + tf->protocol = ATA_PROT_DMA; + tf->hob_feature = options; + tf->hob_nsect = (sect >> 8) & 0xff; + tf->nsect = sect & 0xff; + } + tf->device = ATA_LBA; + tf->lbah = (block >> 16) & 0xff; + tf->lbam = (block >> 8) & 0xff; + tf->lbal = block & 0xff; + tf->hob_lbah = (block >> 40) & 0xff; + tf->hob_lbam = (block >> 32) & 0xff; + tf->hob_lbal = (block >> 24) & 0xff; + + tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48; + qc->flags |= ATA_QCFLAG_RESULT_TF; + + ata_qc_set_pc_nbytes(qc); + + qc->complete_fn = ata_scsi_report_zones_complete; + + return 0; + +invalid_fld: + ata_scsi_set_invalid_field(qc->dev, scmd, fp, bp); + return 1; + +invalid_param_len: + /* "Parameter list length error" */ + ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0); + return 1; +} + /** * ata_mselect_caching - Simulate MODE SELECT for caching info page * @qc: Storage for translated ATA taskfile @@ -3632,6 +3786,9 @@ static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd) return ata_scsi_mode_select_xlat; break; + case ZBC_IN: + return ata_scsi_zbc_in_xlat; + case START_STOP: return ata_scsi_start_stop_xlat; } diff --git a/drivers/ata/libata-trace.c b/drivers/ata/libata-trace.c index 99ec1e8cb95d..9caeabd69ccb 100644 --- a/drivers/ata/libata-trace.c +++ b/drivers/ata/libata-trace.c @@ -162,6 +162,9 @@ libata_trace_parse_subcmd(struct trace_seq *p, unsigned char cmd, case ATA_SUBCMD_FPDMA_RECV_RD_LOG_DMA_EXT: trace_seq_printf(p, " READ_LOG_DMA_EXT"); break; + case ATA_SUBCMD_FPDMA_RECV_ZAC_MGMT_IN: + trace_seq_printf(p, " ZAC_MGMT_IN"); + break; } break; case ATA_CMD_FPDMA_SEND: @@ -187,6 +190,13 @@ libata_trace_parse_subcmd(struct trace_seq *p, unsigned char cmd, break; } break; + case ATA_CMD_ZAC_MGMT_IN: + switch (feature) { + case ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES: + trace_seq_printf(p, " REPORT_ZONES"); + break; + } + break; } trace_seq_putc(p, 0); diff --git a/include/linux/ata.h b/include/linux/ata.h index 032bb223cd8c..255aa0f1c9bc 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -302,12 +302,14 @@ enum { ATA_CMD_CFA_WRITE_MULT_NE = 0xCD, ATA_CMD_REQ_SENSE_DATA = 0x0B, ATA_CMD_SANITIZE_DEVICE = 0xB4, + ATA_CMD_ZAC_MGMT_IN = 0x4A, /* marked obsolete in the ATA/ATAPI-7 spec */ ATA_CMD_RESTORE = 0x10, /* Subcmds for ATA_CMD_FPDMA_RECV */ ATA_SUBCMD_FPDMA_RECV_RD_LOG_DMA_EXT = 0x01, + ATA_SUBCMD_FPDMA_RECV_ZAC_MGMT_IN = 0x02, /* Subcmds for ATA_CMD_FPDMA_SEND */ ATA_SUBCMD_FPDMA_SEND_DSM = 0x00, @@ -318,6 +320,9 @@ enum { ATA_SUBCMD_NCQ_NON_DATA_SET_FEATURES = 0x05, ATA_SUBCMD_NCQ_NON_DATA_ZERO_EXT = 0x06, + /* Subcmds for ATA_CMD_ZAC_MGMT_IN */ + ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES = 0x00, + /* READ_LOG_EXT pages */ ATA_LOG_DIRECTORY = 0x0, ATA_LOG_SATA_NCQ = 0x10, @@ -341,7 +346,10 @@ enum { ATA_LOG_NCQ_SEND_RECV_RD_LOG_SUPPORTED = (1 << 0), ATA_LOG_NCQ_SEND_RECV_WR_LOG_OFFSET = 
0x0C, ATA_LOG_NCQ_SEND_RECV_WR_LOG_SUPPORTED = (1 << 0), - ATA_LOG_NCQ_SEND_RECV_SIZE = 0x10, + ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_OFFSET = 0x10, + ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_OUT_SUPPORTED = (1 << 0), + ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_IN_SUPPORTED = (1 << 1), + ATA_LOG_NCQ_SEND_RECV_SIZE = 0x14, /* NCQ Non-Data log */ ATA_LOG_NCQ_NON_DATA_SUBCMDS_OFFSET = 0x00, diff --git a/include/linux/libata.h b/include/linux/libata.h index 92297cd111f6..c0806b60c4fa 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1651,6 +1651,13 @@ static inline bool ata_fpdma_read_log_supported(struct ata_device *dev) ATA_LOG_NCQ_SEND_RECV_RD_LOG_SUPPORTED); } +static inline bool ata_fpdma_zac_mgmt_in_supported(struct ata_device *dev) +{ + return (dev->flags & ATA_DFLAG_NCQ_SEND_RECV) && + (dev->ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_OFFSET] & + ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_IN_SUPPORTED); +} + static inline void ata_qc_set_polling(struct ata_queued_cmd *qc) { qc->tf.ctl |= ATA_NIEN; diff --git a/include/trace/events/libata.h b/include/trace/events/libata.h index 8e77572350f0..77370a650c15 100644 --- a/include/trace/events/libata.h +++ b/include/trace/events/libata.h @@ -98,6 +98,7 @@ ata_opcode_name(ATA_CMD_CFA_WRITE_MULT_NE), \ ata_opcode_name(ATA_CMD_REQ_SENSE_DATA), \ ata_opcode_name(ATA_CMD_SANITIZE_DEVICE), \ + ata_opcode_name(ATA_CMD_ZAC_MGMT_IN), \ ata_opcode_name(ATA_CMD_RESTORE), \ ata_opcode_name(ATA_CMD_READ_LONG), \ ata_opcode_name(ATA_CMD_READ_LONG_ONCE), \ -- cgit v1.2.3 From 27708a9579ee069c6e0ebb6e61ac1114ed1d546c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Apr 2016 12:45:53 +0200 Subject: libata: Implement ZBC OUT translation ZAC drives implement a 'ZAC Management Out' command template, which maps onto the ZBC OUT command. 
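For orientation, a sketch of the ZBC OUT CDB fields the translation consumes (layout per zbc-r05; the offsets match the cdb[] accesses in ata_scsi_zbc_out_xlat() below, and the opcode value is quoted from SBC-4/ZBC rather than from this patch):

/*
 * ZBC OUT (opcode 0x94):
 *   cdb[1]    bits 4:0: service action, one of ZO_CLOSE_ZONE,
 *             ZO_FINISH_ZONE, ZO_OPEN_ZONE or ZO_RESET_WRITE_POINTER
 *   cdb[2-9]  zone ID: big-endian LBA of the first block of the zone
 *   cdb[14]   bit 0: ALL, apply the action to every zone
 */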
Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-eh.c | 1 + drivers/ata/libata-scsi.c | 67 +++++++++++++++++++++++++++++++++++++++++++ drivers/ata/libata-trace.c | 16 +++++++++++ include/linux/ata.h | 7 +++++ include/trace/events/libata.h | 1 + 5 files changed, 92 insertions(+) (limited to 'include/linux') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index ee6c572d2a4a..61dc7a99e89a 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -2483,6 +2483,7 @@ const char *ata_get_cmd_descript(u8 command) { ATA_CMD_REQ_SENSE_DATA, "REQUEST SENSE DATA EXT" }, { ATA_CMD_SANITIZE_DEVICE, "SANITIZE DEVICE" }, { ATA_CMD_ZAC_MGMT_IN, "ZAC MANAGEMENT IN" }, + { ATA_CMD_ZAC_MGMT_OUT, "ZAC MANAGEMENT OUT" }, { ATA_CMD_READ_LONG, "READ LONG (with retries)" }, { ATA_CMD_READ_LONG_ONCE, "READ LONG (without retries)" }, { ATA_CMD_WRITE_LONG, "WRITE LONG (with retries)" }, diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 06d5a62f507d..6afd0840ebbe 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -3471,6 +3471,70 @@ invalid_param_len: return 1; } +static unsigned int ata_scsi_zbc_out_xlat(struct ata_queued_cmd *qc) +{ + struct ata_taskfile *tf = &qc->tf; + struct scsi_cmnd *scmd = qc->scsicmd; + struct ata_device *dev = qc->dev; + const u8 *cdb = scmd->cmnd; + u8 reset_all, sa; + u64 block; + u32 n_block; + u16 fp = (u16)-1; + + if (unlikely(scmd->cmd_len < 16)) { + fp = 15; + goto invalid_fld; + } + + sa = cdb[1] & 0x1f; + if ((sa != ZO_CLOSE_ZONE) && (sa != ZO_FINISH_ZONE) && + (sa != ZO_OPEN_ZONE) && (sa != ZO_RESET_WRITE_POINTER)) { + fp = 1; + goto invalid_fld; + } + + scsi_16_lba_len(cdb, &block, &n_block); + if (n_block) { + /* + * ZAC MANAGEMENT OUT doesn't define any length + */ + goto invalid_param_len; + } + if (block > dev->n_sectors) + goto out_of_range; + + reset_all = cdb[14] & 0x1; + + tf->protocol = ATA_PROT_NODATA; + tf->command = ATA_CMD_ZAC_MGMT_OUT; + tf->feature = sa; + tf->hob_feature = reset_all & 0x1; + + tf->lbah = (block >> 16) & 0xff; + tf->lbam = (block >> 8) & 0xff; + tf->lbal = block & 0xff; + tf->hob_lbah = (block >> 40) & 0xff; + tf->hob_lbam = (block >> 32) & 0xff; + tf->hob_lbal = (block >> 24) & 0xff; + tf->device = ATA_LBA; + tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48; + + return 0; + + invalid_fld: + ata_scsi_set_invalid_field(qc->dev, scmd, fp, 0xff); + return 1; + out_of_range: + /* "Logical Block Address out of range" */ + ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x21, 0x00); + return 1; +invalid_param_len: + /* "Parameter list length error" */ + ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0); + return 1; +} + /** * ata_mselect_caching - Simulate MODE SELECT for caching info page * @qc: Storage for translated ATA taskfile @@ -3789,6 +3853,9 @@ static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd) case ZBC_IN: return ata_scsi_zbc_in_xlat; + case ZBC_OUT: + return ata_scsi_zbc_out_xlat; + case START_STOP: return ata_scsi_start_stop_xlat; } diff --git a/drivers/ata/libata-trace.c b/drivers/ata/libata-trace.c index 9caeabd69ccb..1111ba7db5b3 100644 --- a/drivers/ata/libata-trace.c +++ b/drivers/ata/libata-trace.c @@ -197,6 +197,22 @@ libata_trace_parse_subcmd(struct trace_seq *p, unsigned char cmd, break; } break; + case ATA_CMD_ZAC_MGMT_OUT: + switch (feature) { + case ATA_SUBCMD_ZAC_MGMT_OUT_CLOSE_ZONE: + trace_seq_printf(p, " CLOSE_ZONE"); + break; + case 
ATA_SUBCMD_ZAC_MGMT_OUT_FINISH_ZONE: + trace_seq_printf(p, " FINISH_ZONE"); + break; + case ATA_SUBCMD_ZAC_MGMT_OUT_OPEN_ZONE: + trace_seq_printf(p, " OPEN_ZONE"); + break; + case ATA_SUBCMD_ZAC_MGMT_OUT_RESET_WRITE_POINTER: + trace_seq_printf(p, " RESET_WRITE_POINTER"); + break; + } + break; } trace_seq_putc(p, 0); diff --git a/include/linux/ata.h b/include/linux/ata.h index 255aa0f1c9bc..9d7c47075ebc 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -303,6 +303,7 @@ enum { ATA_CMD_REQ_SENSE_DATA = 0x0B, ATA_CMD_SANITIZE_DEVICE = 0xB4, ATA_CMD_ZAC_MGMT_IN = 0x4A, + ATA_CMD_ZAC_MGMT_OUT = 0x9F, /* marked obsolete in the ATA/ATAPI-7 spec */ ATA_CMD_RESTORE = 0x10, @@ -323,6 +324,12 @@ enum { /* Subcmds for ATA_CMD_ZAC_MGMT_IN */ ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES = 0x00, + /* Subcmds for ATA_CMD_ZAC_MGMT_OUT */ + ATA_SUBCMD_ZAC_MGMT_OUT_CLOSE_ZONE = 0x01, + ATA_SUBCMD_ZAC_MGMT_OUT_FINISH_ZONE = 0x02, + ATA_SUBCMD_ZAC_MGMT_OUT_OPEN_ZONE = 0x03, + ATA_SUBCMD_ZAC_MGMT_OUT_RESET_WRITE_POINTER = 0x04, + /* READ_LOG_EXT pages */ ATA_LOG_DIRECTORY = 0x0, ATA_LOG_SATA_NCQ = 0x10, diff --git a/include/trace/events/libata.h b/include/trace/events/libata.h index 77370a650c15..75fff8696bae 100644 --- a/include/trace/events/libata.h +++ b/include/trace/events/libata.h @@ -99,6 +99,7 @@ ata_opcode_name(ATA_CMD_REQ_SENSE_DATA), \ ata_opcode_name(ATA_CMD_SANITIZE_DEVICE), \ ata_opcode_name(ATA_CMD_ZAC_MGMT_IN), \ + ata_opcode_name(ATA_CMD_ZAC_MGMT_OUT), \ ata_opcode_name(ATA_CMD_RESTORE), \ ata_opcode_name(ATA_CMD_READ_LONG), \ ata_opcode_name(ATA_CMD_READ_LONG_ONCE), \ -- cgit v1.2.3 From 284b3b77ea883234dadb2cbf97b145c3c30fe4bd Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Apr 2016 12:45:54 +0200 Subject: libata: NCQ encapsulation for ZAC MANAGEMENT OUT Add NCQ encapsulation for ZAC MANAGEMENT OUT and evaluate NCQ Non-Data log pages to figure out if NCQ encapsulation is supported. 
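A worked example of the encapsulated form (all values taken from the hunks below; illustrative only): RESET WRITE POINTER with the ALL bit set becomes NCQ NON DATA with hob_nsect = ATA_SUBCMD_NCQ_NON_DATA_ZAC_MGMT_OUT (0x07), the NCQ tag in nsect bits 7:3, and auxiliary = sa | (reset_all << 8) = 0x04 | 0x100 = 0x104, i.e. the ZAC service action in AUXILIARY bits 7:0 and the ALL flag in bit 8.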
Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 43 ++++++++++++++++++++++++++++++++++++++++--- drivers/ata/libata-scsi.c | 18 +++++++++++++----- drivers/ata/libata-trace.c | 3 +++ include/linux/ata.h | 7 +++++++ include/linux/libata.h | 7 +++++++ 5 files changed, 70 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index b2bd7c499653..1528c7cc0089 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2121,6 +2121,40 @@ static void ata_dev_config_ncq_send_recv(struct ata_device *dev) } } +static void ata_dev_config_ncq_non_data(struct ata_device *dev) +{ + struct ata_port *ap = dev->link->ap; + unsigned int err_mask; + int log_index = ATA_LOG_NCQ_NON_DATA * 2; + u16 log_pages; + + err_mask = ata_read_log_page(dev, ATA_LOG_DIRECTORY, + 0, ap->sector_buf, 1); + if (err_mask) { + ata_dev_dbg(dev, + "failed to get Log Directory Emask 0x%x\n", + err_mask); + return; + } + log_pages = get_unaligned_le16(&ap->sector_buf[log_index]); + if (!log_pages) { + ata_dev_warn(dev, + "NCQ Send/Recv Log not supported\n"); + return; + } + err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_NON_DATA, + 0, ap->sector_buf, 1); + if (err_mask) { + ata_dev_dbg(dev, + "failed to get NCQ Non-Data Log Emask 0x%x\n", + err_mask); + } else { + u8 *cmds = dev->ncq_non_data_cmds; + + memcpy(cmds, ap->sector_buf, ATA_LOG_NCQ_NON_DATA_SIZE); + } +} + static int ata_dev_config_ncq(struct ata_device *dev, char *desc, size_t desc_sz) { @@ -2165,9 +2199,12 @@ static int ata_dev_config_ncq(struct ata_device *dev, snprintf(desc, desc_sz, "NCQ (depth %d/%d)%s", hdepth, ddepth, aa_desc); - if ((ap->flags & ATA_FLAG_FPDMA_AUX) && - ata_id_has_ncq_send_and_recv(dev->id)) - ata_dev_config_ncq_send_recv(dev); + if ((ap->flags & ATA_FLAG_FPDMA_AUX)) { + if (ata_id_has_ncq_send_and_recv(dev->id)) + ata_dev_config_ncq_send_recv(dev); + if (ata_id_has_ncq_non_data(dev->id)) + ata_dev_config_ncq_non_data(dev); + } return 0; } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 6afd0840ebbe..43403aa0801f 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -3506,11 +3506,19 @@ static unsigned int ata_scsi_zbc_out_xlat(struct ata_queued_cmd *qc) reset_all = cdb[14] & 0x1; - tf->protocol = ATA_PROT_NODATA; - tf->command = ATA_CMD_ZAC_MGMT_OUT; - tf->feature = sa; - tf->hob_feature = reset_all & 0x1; - + if (ata_ncq_enabled(qc->dev) && + ata_fpdma_zac_mgmt_out_supported(qc->dev)) { + tf->protocol = ATA_PROT_NCQ; + tf->command = ATA_CMD_NCQ_NON_DATA; + tf->hob_nsect = ATA_SUBCMD_NCQ_NON_DATA_ZAC_MGMT_OUT; + tf->nsect = qc->tag << 3; + tf->auxiliary = sa | (reset_all & 0x1) << 8; + } else { + tf->protocol = ATA_PROT_NODATA; + tf->command = ATA_CMD_ZAC_MGMT_OUT; + tf->feature = sa; + tf->hob_feature = reset_all & 0x1; + } tf->lbah = (block >> 16) & 0xff; tf->lbam = (block >> 8) & 0xff; tf->lbal = block & 0xff; diff --git a/drivers/ata/libata-trace.c b/drivers/ata/libata-trace.c index 1111ba7db5b3..f8c550df0615 100644 --- a/drivers/ata/libata-trace.c +++ b/drivers/ata/libata-trace.c @@ -188,6 +188,9 @@ libata_trace_parse_subcmd(struct trace_seq *p, unsigned char cmd, case ATA_SUBCMD_NCQ_NON_DATA_ZERO_EXT: trace_seq_printf(p, " ZERO_EXT"); break; + case ATA_SUBCMD_NCQ_NON_DATA_ZAC_MGMT_OUT: + trace_seq_printf(p, " ZAC_MGMT_OUT"); + break; } break; case ATA_CMD_ZAC_MGMT_IN: diff --git a/include/linux/ata.h b/include/linux/ata.h index 9d7c47075ebc..e62703201e84 100644 --- 
a/include/linux/ata.h +++ b/include/linux/ata.h @@ -320,6 +320,7 @@ enum { ATA_SUBCMD_NCQ_NON_DATA_ABORT_QUEUE = 0x00, ATA_SUBCMD_NCQ_NON_DATA_SET_FEATURES = 0x05, ATA_SUBCMD_NCQ_NON_DATA_ZERO_EXT = 0x06, + ATA_SUBCMD_NCQ_NON_DATA_ZAC_MGMT_OUT = 0x07, /* Subcmds for ATA_CMD_ZAC_MGMT_IN */ ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES = 0x00, @@ -333,6 +334,7 @@ enum { /* READ_LOG_EXT pages */ ATA_LOG_DIRECTORY = 0x0, ATA_LOG_SATA_NCQ = 0x10, + ATA_LOG_NCQ_NON_DATA = 0x12, ATA_LOG_NCQ_SEND_RECV = 0x13, ATA_LOG_SATA_ID_DEV_DATA = 0x30, ATA_LOG_SATA_SETTINGS = 0x08, @@ -877,6 +879,11 @@ static inline bool ata_id_has_ncq_send_and_recv(const u16 *id) return id[ATA_ID_SATA_CAPABILITY_2] & BIT(6); } +static inline bool ata_id_has_ncq_non_data(const u16 *id) +{ + return id[ATA_ID_SATA_CAPABILITY_2] & BIT(5); +} + static inline bool ata_id_has_trim(const u16 *id) { if (ata_id_major_version(id) >= 7 && diff --git a/include/linux/libata.h b/include/linux/libata.h index c0806b60c4fa..0019d4b51b11 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -729,6 +729,7 @@ struct ata_device { /* NCQ send and receive log subcommand support */ u8 ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_SIZE]; + u8 ncq_non_data_cmds[ATA_LOG_NCQ_NON_DATA_SIZE]; /* error history */ int spdn_cnt; /* ering is CLEAR_END, read comment above CLEAR_END */ @@ -1658,6 +1659,12 @@ static inline bool ata_fpdma_zac_mgmt_in_supported(struct ata_device *dev) ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_IN_SUPPORTED); } +static inline bool ata_fpdma_zac_mgmt_out_supported(struct ata_device *dev) +{ + return (dev->ncq_non_data_cmds[ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OFFSET] & + ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OUT); +} + static inline void ata_qc_set_polling(struct ata_queued_cmd *qc) { qc->tf.ctl |= ATA_NIEN; -- cgit v1.2.3 From 856c4663930988118d9f355aad66811dd6df06de Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Apr 2016 12:45:55 +0200 Subject: libata: support device-managed ZAC devices Device-managed ZAC devices just set the zoned capabilities field in IDENTIFY DEVICE data word 69 (cf. ACS-4). This corresponds to the 'zoned' field in the block device characteristics VPD page. As this is only defined in SPC-5/SBC-4, we also need to update the supported SCSI version descriptor.
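To trace the reporting path end to end (bit positions read off the hunks below; zbc-r05 naming): ata_id_zoned_cap() returns IDENTIFY DEVICE word 69 bits 1:0, where 01b means host-aware and 10b means device-managed. ata_scsiop_inq_b1() then shifts that value into bits 5:4 of byte 8 (the ZONED field) of the block device characteristics VPD page (B1h), so a device-managed drive reports ZONED = 10b.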
Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-scsi.c | 19 ++++++++++--------- include/linux/ata.h | 5 +++++ 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 43403aa0801f..96abd42c9985 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -2082,14 +2082,14 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf) 0x00, 0xA0, /* SAM-5 (no version claimed) */ - 0x04, - 0xC0, /* SBC-3 (no version claimed) */ + 0x06, + 0x00, /* SBC-4 (no version claimed) */ - 0x04, - 0x60, /* SPC-4 (no version claimed) */ + 0x05, + 0xC0, /* SPC-5 (no version claimed) */ 0x60, - 0x20, /* ZBC (no version claimed) */ + 0x24, /* ZBC r05 */ }; u8 hdr[] = { @@ -2109,10 +2109,8 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf) (args->dev->link->ap->pflags & ATA_PFLAG_EXTERNAL)) hdr[1] |= (1 << 7); - if (args->dev->class == ATA_DEV_ZAC) { + if (args->dev->class == ATA_DEV_ZAC) hdr[0] = TYPE_ZBC; - hdr[2] = 0x6; /* ZBC is defined in SPC-4 */ - } memcpy(rbuf, hdr, sizeof(hdr)); memcpy(&rbuf[8], "ATA ", 8); @@ -2126,7 +2124,7 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf) if (rbuf[32] == 0 || rbuf[32] == ' ') memcpy(&rbuf[32], "n/a ", 4); - if (args->dev->class == ATA_DEV_ZAC) + if (ata_id_zoned_cap(args->id) || args->dev->class == ATA_DEV_ZAC) memcpy(rbuf + 58, versions_zbc, sizeof(versions_zbc)); else memcpy(rbuf + 58, versions, sizeof(versions)); @@ -2322,12 +2320,15 @@ static unsigned int ata_scsiop_inq_b1(struct ata_scsi_args *args, u8 *rbuf) { int form_factor = ata_id_form_factor(args->id); int media_rotation_rate = ata_id_rotation_rate(args->id); + u8 zoned = ata_id_zoned_cap(args->id); rbuf[1] = 0xb1; rbuf[3] = 0x3c; rbuf[4] = media_rotation_rate >> 8; rbuf[5] = media_rotation_rate; rbuf[7] = form_factor; + if (zoned) + rbuf[8] = (zoned << 4); return 0; } diff --git a/include/linux/ata.h b/include/linux/ata.h index e62703201e84..ac1cb9310dea 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -935,6 +935,11 @@ static inline bool ata_id_is_ssd(const u16 *id) return id[ATA_ID_ROT_SPEED] == 0x01; } +static inline u8 ata_id_zoned_cap(const u16 *id) +{ + return (id[ATA_ID_ADDITIONAL_SUPP] & 0x3); +} + static inline bool ata_id_pio_need_iordy(const u16 *id, const u8 pio) { /* CF spec. r4.1 Table 22 says no IORDY on PIO5 and PIO6. */ -- cgit v1.2.3 From 6d1003ae8db228b74ef61536364cd2a1bd973dd8 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Apr 2016 12:45:56 +0200 Subject: libata: support host-aware and host-managed ZAC devices Word 69 bits 0:1 in the IDENTIFY DEVICE data indicate a host-aware ZAC device. Host-managed ZAC devices have their own individual signature and do not set these bits in the IDENTIFY DEVICE data. Whenever we detect a ZAC-compatible device we should display the zoned block device characteristics VPD page.
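In short (restating the detection rule implemented in ata_dev_config_zac() below): a device is treated as ZAC, and ATA_DFLAG_ZAC is set, either when it identified itself with the host-managed signature (class ATA_DEV_ZAC) or when ata_id_zoned_cap() reads 01b (host-aware); only then are the zoned VPD page and the Zoned Information log consulted.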
Signed-off-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/ata/libata-scsi.c | 38 ++++++++++++++++++- include/linux/ata.h | 1 + include/linux/libata.h | 7 ++++ 4 files changed, 138 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 1528c7cc0089..97f31707b570 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2227,6 +2227,99 @@ static void ata_dev_config_sense_reporting(struct ata_device *dev) } } +static void ata_dev_config_zac(struct ata_device *dev) +{ + struct ata_port *ap = dev->link->ap; + unsigned int err_mask; + u8 *identify_buf = ap->sector_buf; + int log_index = ATA_LOG_SATA_ID_DEV_DATA * 2, i, found = 0; + u16 log_pages; + + dev->zac_zones_optimal_open = U32_MAX; + dev->zac_zones_optimal_nonseq = U32_MAX; + dev->zac_zones_max_open = U32_MAX; + + /* + * Always set the 'ZAC' flag for Host-managed devices. + */ + if (dev->class == ATA_DEV_ZAC) + dev->flags |= ATA_DFLAG_ZAC; + else if (ata_id_zoned_cap(dev->id) == 0x01) + /* + * Check for host-aware devices. + */ + dev->flags |= ATA_DFLAG_ZAC; + + if (!(dev->flags & ATA_DFLAG_ZAC)) + return; + + /* + * Read Log Directory to figure out if IDENTIFY DEVICE log + * is supported. + */ + err_mask = ata_read_log_page(dev, ATA_LOG_DIRECTORY, + 0, ap->sector_buf, 1); + if (err_mask) { + ata_dev_info(dev, + "failed to get Log Directory Emask 0x%x\n", + err_mask); + return; + } + log_pages = get_unaligned_le16(&ap->sector_buf[log_index]); + if (log_pages == 0) { + ata_dev_warn(dev, + "ATA Identify Device Log not supported\n"); + return; + } + /* + * Read IDENTIFY DEVICE data log, page 0, to figure out + * if page 9 is supported. 
+ */ + err_mask = ata_read_log_page(dev, ATA_LOG_SATA_ID_DEV_DATA, 0, + identify_buf, 1); + if (err_mask) { + ata_dev_info(dev, + "failed to get Device Identify Log Emask 0x%x\n", + err_mask); + return; + } + log_pages = identify_buf[8]; + for (i = 0; i < log_pages; i++) { + if (identify_buf[9 + i] == ATA_LOG_ZONED_INFORMATION) { + found++; + break; + } + } + if (!found) { + ata_dev_warn(dev, + "ATA Zoned Information Log not supported\n"); + return; + } + + /* + * Read IDENTIFY DEVICE data log, page 9 (Zoned-device information) + */ + err_mask = ata_read_log_page(dev, ATA_LOG_SATA_ID_DEV_DATA, + ATA_LOG_ZONED_INFORMATION, + identify_buf, 1); + if (!err_mask) { + u64 zoned_cap, opt_open, opt_nonseq, max_open; + + zoned_cap = get_unaligned_le64(&identify_buf[8]); + if ((zoned_cap >> 63)) + dev->zac_zoned_cap = (zoned_cap & 1); + opt_open = get_unaligned_le64(&identify_buf[24]); + if ((opt_open >> 63)) + dev->zac_zones_optimal_open = (u32)opt_open; + opt_nonseq = get_unaligned_le64(&identify_buf[32]); + if ((opt_nonseq >> 63)) + dev->zac_zones_optimal_nonseq = (u32)opt_nonseq; + max_open = get_unaligned_le64(&identify_buf[40]); + if ((max_open >> 63)) + dev->zac_zones_max_open = (u32)max_open; + } +} + /** * ata_dev_configure - Configure the specified ATA/ATAPI device * @dev: Target device to configure @@ -2450,6 +2543,7 @@ int ata_dev_configure(struct ata_device *dev) } } ata_dev_config_sense_reporting(dev); + ata_dev_config_zac(dev); dev->cdb_len = 16; } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 96abd42c9985..b86af1416dce 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -2144,6 +2144,7 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf) */ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf) { + int num_pages; const u8 pages[] = { 0x00, /* page 0x00, this page */ 0x80, /* page 0x80, unit serial no page */ @@ -2152,10 +2153,14 @@ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf) 0xb0, /* page 0xb0, block limits page */ 0xb1, /* page 0xb1, block device characteristics page */ 0xb2, /* page 0xb2, thin provisioning page */ + 0xb6, /* page 0xb6, zoned block device characteristics */ }; - rbuf[3] = sizeof(pages); /* number of supported VPD pages */ - memcpy(rbuf + 4, pages, sizeof(pages)); + num_pages = sizeof(pages); + if (!(args->dev->flags & ATA_DFLAG_ZAC)) + num_pages--; + rbuf[3] = num_pages; /* number of supported VPD pages */ + memcpy(rbuf + 4, pages, num_pages); return 0; } @@ -2343,6 +2348,26 @@ static unsigned int ata_scsiop_inq_b2(struct ata_scsi_args *args, u8 *rbuf) return 0; } +static unsigned int ata_scsiop_inq_b6(struct ata_scsi_args *args, u8 *rbuf) +{ + /* + * zbc-r05 SCSI Zoned Block device characteristics VPD page + */ + rbuf[1] = 0xb6; + rbuf[3] = 0x3C; + + /* + * URSWRZ bit is only meaningful for host-managed ZAC drives + */ + if (args->dev->zac_zoned_cap & 1) + rbuf[4] |= 1; + put_unaligned_be32(args->dev->zac_zones_optimal_open, &rbuf[8]); + put_unaligned_be32(args->dev->zac_zones_optimal_nonseq, &rbuf[12]); + put_unaligned_be32(args->dev->zac_zones_max_open, &rbuf[16]); + + return 0; +} + /** * ata_scsiop_noop - Command handler that simply returns success. * @args: device IDENTIFY data / SCSI command of interest. 
@@ -2661,6 +2686,9 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf) rbuf[14] |= 0x40; /* LBPRZ */ } } + if (ata_id_zoned_cap(args->id) || + args->dev->class == ATA_DEV_ZAC) + rbuf[12] = (1 << 4); /* RC_BASIS */ } return 0; } @@ -4046,6 +4074,12 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) case 0xb2: ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b2); break; + case 0xb6: + if (dev->flags & ATA_DFLAG_ZAC) { + ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b6); + break; + } + /* Fallthrough */ default: ata_scsi_invalid_field(dev, cmd, 2); break; diff --git a/include/linux/ata.h b/include/linux/ata.h index ac1cb9310dea..83e2a99866c2 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -338,6 +338,7 @@ enum { ATA_LOG_NCQ_SEND_RECV = 0x13, ATA_LOG_SATA_ID_DEV_DATA = 0x30, ATA_LOG_SATA_SETTINGS = 0x08, + ATA_LOG_ZONED_INFORMATION = 0x09, ATA_LOG_DEVSLP_OFFSET = 0x30, ATA_LOG_DEVSLP_SIZE = 0x08, ATA_LOG_DEVSLP_MDAT = 0x00, diff --git a/include/linux/libata.h b/include/linux/libata.h index 0019d4b51b11..d15c19e331d1 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -181,6 +181,7 @@ enum { ATA_DFLAG_DEVSLP = (1 << 27), /* device supports Device Sleep */ ATA_DFLAG_ACPI_DISABLED = (1 << 28), /* ACPI for the device is disabled */ ATA_DFLAG_D_SENSE = (1 << 29), /* Descriptor sense requested */ + ATA_DFLAG_ZAC = (1 << 30), /* ZAC device */ ATA_DEV_UNKNOWN = 0, /* unknown device */ ATA_DEV_ATA = 1, /* ATA device */ @@ -731,6 +732,12 @@ struct ata_device { u8 ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_SIZE]; u8 ncq_non_data_cmds[ATA_LOG_NCQ_NON_DATA_SIZE]; + /* ZAC zone configuration */ + u32 zac_zoned_cap; + u32 zac_zones_optimal_open; + u32 zac_zones_optimal_nonseq; + u32 zac_zones_max_open; + /* error history */ int spdn_cnt; /* ering is CLEAR_END, read comment above CLEAR_END */ -- cgit v1.2.3 From e5366a266a8cd4cd6b0fe66876462cca2e1c6a89 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 6 May 2016 08:37:41 -0700 Subject: mtd: spi-nor: support GigaDevice gd25lq64c Also note the GigaDevice JEDEC ID. No write-protect support yet, since this flash uses a different status register layout. Cc: Ezequiel Garcia Signed-off-by: Brian Norris Acked-by: Marek Vasut --- drivers/mtd/spi-nor/spi-nor.c | 1 + include/linux/mtd/spi-nor.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 157841dc3e99..c52e45594bfd 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -832,6 +832,7 @@ static const struct flash_info spi_nor_ids[] = { /* GigaDevice */ { "gd25q32", INFO(0xc84016, 0, 64 * 1024, 64, SECT_4K) }, { "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128, SECT_4K) }, + { "gd25lq64c", INFO(0xc86017, 0, 64 * 1024, 128, SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, { "gd25q128", INFO(0xc84018, 0, 64 * 1024, 256, SECT_4K) }, /* Intel/Numonyx -- xxxs33b */ diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 3c36113a88e1..7f041bd88b82 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -21,6 +21,7 @@ * Sometimes these are the same as CFI IDs, but sometimes they aren't. 
*/ #define SNOR_MFR_ATMEL CFI_MFR_ATMEL +#define SNOR_MFR_GIGADEVICE 0xc8 #define SNOR_MFR_INTEL CFI_MFR_INTEL #define SNOR_MFR_MICRON CFI_MFR_ST /* ST Micro <--> Micron */ #define SNOR_MFR_MACRONIX CFI_MFR_MACRONIX -- cgit v1.2.3 From 496aec577b5183716ed9d8bcc853ad9003485fe8 Mon Sep 17 00:00:00 2001 From: Christian Daudt Date: Wed, 4 May 2016 17:55:20 -0700 Subject: brcmfmac: Add 4356 sdio support This adds support for the 4356-sdio wireless chip. Signed-off-by: Christian Daudt Signed-off-by: Kalle Valo --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 1 + drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c | 1 + drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 4 +++- include/linux/mmc/sdio_ids.h | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index 2fc0597f2cd0..c7550dab6a23 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -1098,6 +1098,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = { BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43430), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4345), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4354), + BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4356), { /* end: all zeroes */ } }; MODULE_DEVICE_TABLE(sdio, brcmf_sdmmc_ids); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c index 0e8f2a079907..d3fd6b1db1d9 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c @@ -1333,6 +1333,7 @@ bool brcmf_chip_sr_capable(struct brcmf_chip *pub) switch (pub->chip) { case BRCM_CC_4354_CHIP_ID: + case BRCM_CC_4356_CHIP_ID: /* explicitly check SR engine enable bit */ pmu_cc3_mask = BIT(2); /* fall-through */ diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 4252fa82b89c..67e69bff2545 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -609,6 +609,7 @@ BRCMF_FW_NVRAM_DEF(4339, "brcmfmac4339-sdio.bin", "brcmfmac4339-sdio.txt"); BRCMF_FW_NVRAM_DEF(43430, "brcmfmac43430-sdio.bin", "brcmfmac43430-sdio.txt"); BRCMF_FW_NVRAM_DEF(43455, "brcmfmac43455-sdio.bin", "brcmfmac43455-sdio.txt"); BRCMF_FW_NVRAM_DEF(4354, "brcmfmac4354-sdio.bin", "brcmfmac4354-sdio.txt"); +BRCMF_FW_NVRAM_DEF(4356, "brcmfmac4356-sdio.bin", "brcmfmac4356-sdio.txt"); static struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = { BRCMF_FW_NVRAM_ENTRY(BRCM_CC_43143_CHIP_ID, 0xFFFFFFFF, 43143), @@ -624,7 +625,8 @@ static struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = { BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4339_CHIP_ID, 0xFFFFFFFF, 4339), BRCMF_FW_NVRAM_ENTRY(BRCM_CC_43430_CHIP_ID, 0xFFFFFFFF, 43430), BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4345_CHIP_ID, 0xFFFFFFC0, 43455), - BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4354_CHIP_ID, 0xFFFFFFFF, 4354) + BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4354_CHIP_ID, 0xFFFFFFFF, 4354), + BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4356_CHIP_ID, 0xFFFFFFFF, 4356) }; static void pkt_align(struct sk_buff *p, int len, int align) diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 83430f2ea757..0d126aeb3ec0 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -36,6 +36,7 @@ #define 
SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 #define SDIO_DEVICE_ID_BROADCOM_4345 0x4345 #define SDIO_DEVICE_ID_BROADCOM_4354 0x4354 +#define SDIO_DEVICE_ID_BROADCOM_4356 0x4356 #define SDIO_VENDOR_ID_INTEL 0x0089 #define SDIO_DEVICE_ID_INTEL_IWMC3200WIMAX 0x1402 -- cgit v1.2.3 From 5d749d0bbe811c10d9048cde6dfebc761713abfd Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Tue, 8 Mar 2016 09:13:52 -0800 Subject: platform/chrome: cros_ec_dev - Fix security issue Prevent memory scribble by checking that ioctl buffer size parameters are sane. Without this check, on a 32-bit system, if .insize = 0xffffffff - 20 and .outsize is the amount to scribble, the kmalloc() size computation would overflow, we would allocate a small amount and be able to write outside of the malloc'ed area. Adding a hard limit allows argument checking of the ioctl. With the current EC, .insize and .outsize are expected to be around 512 bytes or less. Signed-off-by: Gwendal Grignou Signed-off-by: Olof Johansson --- drivers/platform/chrome/cros_ec_dev.c | 4 ++++ drivers/platform/chrome/cros_ec_proto.c | 4 ++-- include/linux/mfd/cros_ec.h | 6 ++++-- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/chrome/cros_ec_dev.c b/drivers/platform/chrome/cros_ec_dev.c index d45cd254ed1c..187470c8a1f6 100644 --- a/drivers/platform/chrome/cros_ec_dev.c +++ b/drivers/platform/chrome/cros_ec_dev.c @@ -137,6 +137,10 @@ static long ec_device_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg) if (copy_from_user(&u_cmd, arg, sizeof(u_cmd))) return -EFAULT; + if ((u_cmd.outsize > EC_MAX_MSG_BYTES) || + (u_cmd.insize > EC_MAX_MSG_BYTES)) + return -EINVAL; + s_cmd = kmalloc(sizeof(*s_cmd) + max(u_cmd.outsize, u_cmd.insize), GFP_KERNEL); if (!s_cmd) diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index 990308ca384f..b6e161f71b26 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -298,8 +298,8 @@ int cros_ec_query_all(struct cros_ec_device *ec_dev) ec_dev->max_response = EC_PROTO2_MAX_PARAM_SIZE; ec_dev->max_passthru = 0; ec_dev->pkt_xfer = NULL; - ec_dev->din_size = EC_MSG_BYTES; - ec_dev->dout_size = EC_MSG_BYTES; + ec_dev->din_size = EC_PROTO2_MSG_BYTES; + ec_dev->dout_size = EC_PROTO2_MSG_BYTES; } else { /* * It's possible for a test to occur too early when diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index a677c2bd485c..64184d27e3cd 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -50,9 +50,11 @@ enum { EC_MSG_TX_TRAILER_BYTES, EC_MSG_RX_PROTO_BYTES = 3, - /* Max length of messages */ - EC_MSG_BYTES = EC_PROTO2_MAX_PARAM_SIZE + + /* Max length of messages for proto 2*/ + EC_PROTO2_MSG_BYTES = EC_PROTO2_MAX_PARAM_SIZE + EC_MSG_TX_PROTO_BYTES, + + EC_MAX_MSG_BYTES = 64 * 1024, }; /* -- cgit v1.2.3 From 9b9e3fc4d5a31f6050508f2404369beac4356867 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 9 May 2016 18:11:54 +0200 Subject: KVM: remove NULL return path for vcpu ids >= KVM_MAX_VCPUS Commit c896939f7cff ("KVM: use heuristic for fast VCPU lookup by id") added a return path that prevents vcpu ids from exceeding KVM_MAX_VCPUS. This is a problem for powerpc where vcpu ids can grow up to 8*KVM_MAX_VCPUS. This patch simply reverses the logic so that we only try the fast path if the vcpu id can be tried as an index in kvm->vcpus[]. The slow path is not affected by the change.
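To make the powerpc case concrete (a worked example of the commit's own worst case, with the id spacing implied by the KVM_MAX_VCPU_ID patch that follows): with 8 hardware threads per core, book3s_hv spaces vcpu ids core by core, so a single-threaded guest uses ids 0, 8, 16, ... and the largest id approaches 8 * KVM_MAX_VCPUS even though the number of vcpus never exceeds KVM_MAX_VCPUS.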
Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Greg Kurz Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ad40d44784c7..0a0e00d9c5da 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -447,12 +447,13 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) { - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu = NULL; int i; - if (id < 0 || id >= KVM_MAX_VCPUS) + if (id < 0) return NULL; - vcpu = kvm_get_vcpu(kvm, id); + if (id < KVM_MAX_VCPUS) + vcpu = kvm_get_vcpu(kvm, id); if (vcpu && vcpu->vcpu_id == id) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) -- cgit v1.2.3 From 0b1b1dfd52a67f4f09a18cb82337199bc90ad7fb Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 9 May 2016 18:13:37 +0200 Subject: kvm: introduce KVM_MAX_VCPU_ID The KVM_MAX_VCPUS define provides the maximum number of vCPUs per guest, and also the upper limit for vCPU ids. This is okay for all archs except PowerPC which can have higher ids, depending on the cpu/core/thread topology. In the worst case (single threaded guest, host with 8 threads per core), it limits the maximum number of vCPUS to KVM_MAX_VCPUS / 8. This patch separates the vCPU numbering from the total number of vCPUs, with the introduction of KVM_MAX_VCPU_ID, as the maximal valid value for vCPU ids plus one. The corresponding KVM_CAP_MAX_VCPU_ID allows userspace to validate vCPU ids before passing them to KVM_CREATE_VCPU. This patch only implements KVM_MAX_VCPU_ID with a specific value for PowerPC. Other archs continue to return KVM_MAX_VCPUS instead. Suggested-by: Radim Krcmar Signed-off-by: Greg Kurz Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 10 ++++++++-- arch/powerpc/include/asm/kvm_host.h | 3 +++ include/linux/kvm_host.h | 4 ++++ include/uapi/linux/kvm.h | 1 + virt/kvm/kvm_main.c | 4 +++- 5 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 8cc857fffcc7..a4482cce4bae 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -199,8 +199,8 @@ Type: vm ioctl Parameters: vcpu id (apic id on x86) Returns: vcpu fd on success, -1 on error -This API adds a vcpu to a virtual machine. The vcpu id is a small integer -in the range [0, max_vcpus). +This API adds a vcpu to a virtual machine. No more than max_vcpus may be added. +The vcpu id is an integer in the range [0, max_vcpu_id). The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of the KVM_CHECK_EXTENSION ioctl() at run-time. @@ -212,6 +212,12 @@ cpus max. If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is same as the value returned from KVM_CAP_NR_VCPUS. +The maximum possible value for max_vcpu_id can be retrieved using the +KVM_CAP_MAX_VCPU_ID of the KVM_CHECK_EXTENSION ioctl() at run-time. + +If the KVM_CAP_MAX_VCPU_ID does not exist, you should assume that max_vcpu_id +is the same as the value returned from KVM_CAP_MAX_VCPUS. + On powerpc using book3s_hv mode, the vcpus are mapped onto virtual threads in one or more virtual CPU cores. 
(This is because the hardware requires all the hardware threads in a CPU core to be in the diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d7b343170453..a07645c17818 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -40,6 +40,9 @@ #define KVM_MAX_VCORES NR_CPUS #define KVM_USER_MEM_SLOTS 512 +#include +#define KVM_MAX_VCPU_ID (threads_per_subcore * KVM_MAX_VCORES) + #ifdef CONFIG_KVM_MMIO #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0a0e00d9c5da..352889d6e322 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -35,6 +35,10 @@ #include +#ifndef KVM_MAX_VCPU_ID +#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS +#endif + /* * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used * in kvm, other bits are visible for userspace which are defined in diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a7f1f8032ec1..05ebf475104c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -865,6 +865,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_SPAPR_TCE_64 125 #define KVM_CAP_ARM_PMU_V3 126 #define KVM_CAP_VCPU_ATTRIBUTES 127 +#define KVM_CAP_MAX_VCPU_ID 128 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4fd482fb9260..ed3d9bb18a56 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2272,7 +2272,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) int r; struct kvm_vcpu *vcpu; - if (id >= KVM_MAX_VCPUS) + if (id >= KVM_MAX_VCPU_ID) return -EINVAL; vcpu = kvm_arch_vcpu_create(kvm, id); @@ -2746,6 +2746,8 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_MULTI_ADDRESS_SPACE: return KVM_ADDRESS_SPACE_NUM; #endif + case KVM_CAP_MAX_VCPU_ID: + return KVM_MAX_VCPU_ID; default: break; } -- cgit v1.2.3 From b52f3ed02221252d8ee2c7d756e76fad4a5e84f6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 5 May 2016 11:58:29 -0600 Subject: irqbypass: Disallow NULL token A NULL token is meaningless and can only lead to unintended problems. Error on registration with a NULL token, ignore de-registrations with a NULL token. 
Signed-off-by: Alex Williamson Signed-off-by: Paolo Bonzini --- include/linux/irqbypass.h | 4 ++-- virt/lib/irqbypass.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index 1551b5b2f4c2..f0f5d2671509 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -34,7 +34,7 @@ struct irq_bypass_consumer; /** * struct irq_bypass_producer - IRQ bypass producer definition * @node: IRQ bypass manager private list management - * @token: opaque token to match between producer and consumer + * @token: opaque token to match between producer and consumer (non-NULL) * @irq: Linux IRQ number for the producer device * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional) * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional) @@ -60,7 +60,7 @@ struct irq_bypass_producer { /** * struct irq_bypass_consumer - IRQ bypass consumer definition * @node: IRQ bypass manager private list management - * @token: opaque token to match between producer and consumer + * @token: opaque token to match between producer and consumer (non-NULL) * @add_producer: Connect the IRQ consumer to an IRQ producer * @del_producer: Disconnect the IRQ consumer from an IRQ producer * @stop: Perform any quiesce operations necessary prior to add/del (optional) diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 09a03b5a21ff..52abac4bb6a2 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -89,6 +89,9 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; + if (!producer->token) + return -EINVAL; + might_sleep(); if (!try_module_get(THIS_MODULE)) @@ -136,6 +139,9 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; + if (!producer->token) + return; + might_sleep(); if (!try_module_get(THIS_MODULE)) @@ -177,7 +183,8 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; - if (!consumer->add_producer || !consumer->del_producer) + if (!consumer->token || + !consumer->add_producer || !consumer->del_producer) return -EINVAL; might_sleep(); @@ -227,6 +234,9 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; + if (!consumer->token) + return; + might_sleep(); if (!try_module_get(THIS_MODULE)) -- cgit v1.2.3 From 14717e2031862d9aa2512b24a7df42cf68a977ec Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 5 May 2016 11:58:35 -0600 Subject: kvm: Conditionally register IRQ bypass consumer If we don't support a mechanism for bypassing IRQs, don't register as a consumer. 
This eliminates meaningless dev_info()s when the connect fails between producer and consumer, such as on AMD systems where kvm_x86_ops->update_pi_irte is not implemented Signed-off-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 19 ++++++++----------- include/linux/kvm_host.h | 1 + virt/kvm/eventfd.c | 18 ++++++++++-------- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 57a62d1e2f5d..6c774cdf553c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8358,19 +8358,21 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +bool kvm_arch_has_irq_bypass(void) +{ + return kvm_x86_ops->update_pi_irte != NULL; +} + int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); - if (kvm_x86_ops->update_pi_irte) { - irqfd->producer = prod; - return kvm_x86_ops->update_pi_irte(irqfd->kvm, - prod->irq, irqfd->gsi, 1); - } + irqfd->producer = prod; - return -EINVAL; + return kvm_x86_ops->update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); } void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, @@ -8380,11 +8382,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); - if (!kvm_x86_ops->update_pi_irte) { - WARN_ON(irqfd->producer != NULL); - return; - } - WARN_ON(irqfd->producer != prod); irqfd->producer = NULL; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 352889d6e322..92a0229044fb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1185,6 +1185,7 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 46dbc0a7dfc1..e469b6012471 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -408,15 +408,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) */ fdput(f); #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS - irqfd->consumer.token = (void *)irqfd->eventfd; - irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; - irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; - irqfd->consumer.stop = kvm_arch_irq_bypass_stop; - irqfd->consumer.start = kvm_arch_irq_bypass_start; - ret = irq_bypass_register_consumer(&irqfd->consumer); - if (ret) - pr_info("irq bypass consumer (token %p) registration fails: %d\n", + if (kvm_arch_has_irq_bypass()) { + irqfd->consumer.token = (void *)irqfd->eventfd; + irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; + irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; + irqfd->consumer.stop = kvm_arch_irq_bypass_stop; + irqfd->consumer.start = kvm_arch_irq_bypass_start; + ret = irq_bypass_register_consumer(&irqfd->consumer); + if (ret) + pr_info("irq bypass consumer (token %p) registration fails: %d\n", irqfd->consumer.token, ret); + } #endif return 0; -- cgit v1.2.3 From c0bba3a99f0709c24c2c7ada7cb098966b1d791f Mon Sep 17 00:00:00 2001 From: Kedareswara rao Appana Date: Thu, 7 Apr 2016 
10:59:43 +0530 Subject: dmaengine: vdma: Add Support for Xilinx AXI Direct Memory Access Engine This patch adds support for the AXI Direct Memory Access (AXI DMA) core in the existing vdma driver, AXI DMA Core is a soft Xilinx IP core that provides high-bandwidth direct memory access between memory and AXI4-Stream type target peripherals. Signed-off-by: Kedareswara rao Appana Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xilinx_vdma.c | 474 +++++++++++++++++++++++++++++++++++---- include/linux/dma/xilinx_dma.h | 12 + 2 files changed, 444 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/xilinx/xilinx_vdma.c b/drivers/dma/xilinx/xilinx_vdma.c index 57b85af4ed08..983e4bc88cd8 100644 --- a/drivers/dma/xilinx/xilinx_vdma.c +++ b/drivers/dma/xilinx/xilinx_vdma.c @@ -16,6 +16,11 @@ * video device (S2MM). Initialization, status, interrupt and management * registers are accessed through an AXI4-Lite slave interface. * + * The AXI Direct Memory Access (AXI DMA) core is a soft Xilinx IP core that + * provides high-bandwidth one dimensional direct memory access between memory + * and AXI4-Stream target peripherals. It supports one receive and one + * transmit channel, both of them optional at synthesis time. + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or @@ -140,6 +145,19 @@ /* Delay loop counter to prevent hardware failure */ #define XILINX_DMA_LOOP_COUNT 1000000 +/* AXI DMA Specific Registers/Offsets */ +#define XILINX_DMA_REG_SRCDSTADDR 0x18 +#define XILINX_DMA_REG_BTT 0x28 + +/* AXI DMA Specific Masks/Bit fields */ +#define XILINX_DMA_MAX_TRANS_LEN GENMASK(22, 0) +#define XILINX_DMA_CR_COALESCE_MAX GENMASK(23, 16) +#define XILINX_DMA_CR_COALESCE_SHIFT 16 +#define XILINX_DMA_BD_SOP BIT(27) +#define XILINX_DMA_BD_EOP BIT(26) +#define XILINX_DMA_COALESCE_MAX 255 +#define XILINX_DMA_NUM_APP_WORDS 5 + /** * struct xilinx_vdma_desc_hw - Hardware Descriptor * @next_desc: Next Descriptor Pointer @0x00 @@ -161,6 +179,30 @@ struct xilinx_vdma_desc_hw { u32 stride; } __aligned(64); +/** + * struct xilinx_axidma_desc_hw - Hardware Descriptor for AXI DMA + * @next_desc: Next Descriptor Pointer @0x00 + * @pad1: Reserved @0x04 + * @buf_addr: Buffer address @0x08 + * @pad2: Reserved @0x0C + * @pad3: Reserved @0x10 + * @pad4: Reserved @0x14 + * @control: Control field @0x18 + * @status: Status field @0x1C + * @app: APP Fields @0x20 - 0x30 + */ +struct xilinx_axidma_desc_hw { + u32 next_desc; + u32 pad1; + u32 buf_addr; + u32 pad2; + u32 pad3; + u32 pad4; + u32 control; + u32 status; + u32 app[XILINX_DMA_NUM_APP_WORDS]; +} __aligned(64); + /** * struct xilinx_vdma_tx_segment - Descriptor segment * @hw: Hardware descriptor @@ -173,6 +215,18 @@ struct xilinx_vdma_tx_segment { dma_addr_t phys; } __aligned(64); +/** + * struct xilinx_axidma_tx_segment - Descriptor segment + * @hw: Hardware descriptor + * @node: Node in the descriptor segments list + * @phys: Physical address of segment + */ +struct xilinx_axidma_tx_segment { + struct xilinx_axidma_desc_hw hw; + struct list_head node; + dma_addr_t phys; +} __aligned(64); + /** * struct xilinx_dma_tx_descriptor - Per Transaction structure * @async_tx: Async transaction descriptor @@ -210,6 +264,9 @@ struct xilinx_dma_tx_descriptor { * @desc_pendingcount: Descriptor pending count * @ext_addr: Indicates 64 bit addressing is supported by dma channel * @desc_submitcount: 
Descriptor h/w submitted count + * @residue: Residue for AXI DMA + * @seg_v: Statically allocated segments base + * @start_transfer: Differentiate b/w DMA IP's transfer */ struct xilinx_dma_chan { struct xilinx_dma_device *xdev; @@ -235,6 +292,9 @@ struct xilinx_dma_chan { u32 desc_pendingcount; bool ext_addr; u32 desc_submitcount; + u32 residue; + struct xilinx_axidma_tx_segment *seg_v; + void (*start_transfer)(struct xilinx_dma_chan *chan); }; /** @@ -246,6 +306,7 @@ struct xilinx_dma_chan { * @has_sg: Specifies whether Scatter-Gather is present or not * @flush_on_fsync: Flush on frame sync * @ext_addr: Indicates 64 bit addressing is supported by dma device + * @dmatype: DMA ip type */ struct xilinx_dma_device { void __iomem *regs; @@ -255,6 +316,7 @@ struct xilinx_dma_device { bool has_sg; u32 flush_on_fsync; bool ext_addr; + enum xdma_ip_type dmatype; }; /* Macros */ @@ -352,6 +414,39 @@ xilinx_vdma_alloc_tx_segment(struct xilinx_dma_chan *chan) return segment; } +/** + * xilinx_axidma_alloc_tx_segment - Allocate transaction segment + * @chan: Driver specific DMA channel + * + * Return: The allocated segment on success and NULL on failure. + */ +static struct xilinx_axidma_tx_segment * +xilinx_axidma_alloc_tx_segment(struct xilinx_dma_chan *chan) +{ + struct xilinx_axidma_tx_segment *segment; + dma_addr_t phys; + + segment = dma_pool_alloc(chan->desc_pool, GFP_ATOMIC, &phys); + if (!segment) + return NULL; + + memset(segment, 0, sizeof(*segment)); + segment->phys = phys; + + return segment; +} + +/** + * xilinx_dma_free_tx_segment - Free transaction segment + * @chan: Driver specific DMA channel + * @segment: DMA transaction segment + */ +static void xilinx_dma_free_tx_segment(struct xilinx_dma_chan *chan, + struct xilinx_axidma_tx_segment *segment) +{ + dma_pool_free(chan->desc_pool, segment, segment->phys); +} + /** * xilinx_vdma_free_tx_segment - Free transaction segment * @chan: Driver specific DMA channel @@ -393,13 +488,22 @@ xilinx_dma_free_tx_descriptor(struct xilinx_dma_chan *chan, struct xilinx_dma_tx_descriptor *desc) { struct xilinx_vdma_tx_segment *segment, *next; + struct xilinx_axidma_tx_segment *axidma_segment, *axidma_next; if (!desc) return; - list_for_each_entry_safe(segment, next, &desc->segments, node) { - list_del(&segment->node); - xilinx_vdma_free_tx_segment(chan, segment); + if (chan->xdev->dmatype == XDMA_TYPE_VDMA) { + list_for_each_entry_safe(segment, next, &desc->segments, node) { + list_del(&segment->node); + xilinx_vdma_free_tx_segment(chan, segment); + } + } else { + list_for_each_entry_safe(axidma_segment, axidma_next, + &desc->segments, node) { + list_del(&axidma_segment->node); + xilinx_dma_free_tx_segment(chan, axidma_segment); + } } kfree(desc); @@ -451,6 +555,8 @@ static void xilinx_dma_free_chan_resources(struct dma_chan *dchan) dev_dbg(chan->dev, "Free all channel resources.\n"); xilinx_dma_free_descriptors(chan); + if (chan->xdev->dmatype == XDMA_TYPE_AXIDMA) + xilinx_dma_free_tx_segment(chan, chan->seg_v); dma_pool_destroy(chan->desc_pool); chan->desc_pool = NULL; } @@ -519,10 +625,20 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan) * We need the descriptor to be aligned to 64bytes * for meeting Xilinx VDMA specification requirement. 
*/ - chan->desc_pool = dma_pool_create("xilinx_vdma_desc_pool", - chan->dev, - sizeof(struct xilinx_vdma_tx_segment), - __alignof__(struct xilinx_vdma_tx_segment), 0); + if (chan->xdev->dmatype == XDMA_TYPE_AXIDMA) { + chan->desc_pool = dma_pool_create("xilinx_dma_desc_pool", + chan->dev, + sizeof(struct xilinx_axidma_tx_segment), + __alignof__(struct xilinx_axidma_tx_segment), + 0); + } else { + chan->desc_pool = dma_pool_create("xilinx_vdma_desc_pool", + chan->dev, + sizeof(struct xilinx_vdma_tx_segment), + __alignof__(struct xilinx_vdma_tx_segment), + 0); + } + if (!chan->desc_pool) { dev_err(chan->dev, "unable to allocate channel %d descriptor pool\n", @@ -530,7 +646,27 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan) return -ENOMEM; } + if (chan->xdev->dmatype == XDMA_TYPE_AXIDMA) + /* + * For AXI DMA case after submitting a pending_list, keep + * an extra segment allocated so that the "next descriptor" + * pointer on the tail descriptor always points to a + * valid descriptor, even when paused after reaching taildesc. + * This way, it is possible to issue additional + * transfers without halting and restarting the channel. + */ + chan->seg_v = xilinx_axidma_alloc_tx_segment(chan); + dma_cookie_init(dchan); + + if (chan->xdev->dmatype == XDMA_TYPE_AXIDMA) { + /* For AXI DMA resetting once channel will reset the + * other channel as well so enable the interrupts here. + */ + dma_ctrl_set(chan, XILINX_DMA_REG_DMACR, + XILINX_DMA_DMAXR_ALL_IRQ_MASK); + } + return 0; } @@ -546,7 +682,37 @@ static enum dma_status xilinx_dma_tx_status(struct dma_chan *dchan, dma_cookie_t cookie, struct dma_tx_state *txstate) { - return dma_cookie_status(dchan, cookie, txstate); + struct xilinx_dma_chan *chan = to_xilinx_chan(dchan); + struct xilinx_dma_tx_descriptor *desc; + struct xilinx_axidma_tx_segment *segment; + struct xilinx_axidma_desc_hw *hw; + enum dma_status ret; + unsigned long flags; + u32 residue = 0; + + ret = dma_cookie_status(dchan, cookie, txstate); + if (ret == DMA_COMPLETE || !txstate) + return ret; + + if (chan->xdev->dmatype == XDMA_TYPE_AXIDMA) { + spin_lock_irqsave(&chan->lock, flags); + + desc = list_last_entry(&chan->active_list, + struct xilinx_dma_tx_descriptor, node); + if (chan->has_sg) { + list_for_each_entry(segment, &desc->segments, node) { + hw = &segment->hw; + residue += (hw->control - hw->status) & + XILINX_DMA_MAX_TRANS_LEN; + } + } + spin_unlock_irqrestore(&chan->lock, flags); + + chan->residue = residue; + dma_set_residue(txstate, chan->residue); + } + + return ret; } /** @@ -753,6 +919,91 @@ static void xilinx_vdma_start_transfer(struct xilinx_dma_chan *chan) } } +/** + * xilinx_dma_start_transfer - Starts DMA transfer + * @chan: Driver specific channel struct pointer + */ +static void xilinx_dma_start_transfer(struct xilinx_dma_chan *chan) +{ + struct xilinx_dma_tx_descriptor *head_desc, *tail_desc; + struct xilinx_axidma_tx_segment *tail_segment, *old_head, *new_head; + u32 reg; + + if (chan->err) + return; + + if (list_empty(&chan->pending_list)) + return; + + /* If it is SG mode and hardware is busy, cannot submit */ + if (chan->has_sg && xilinx_dma_is_running(chan) && + !xilinx_dma_is_idle(chan)) { + dev_dbg(chan->dev, "DMA controller still busy\n"); + return; + } + + head_desc = list_first_entry(&chan->pending_list, + struct xilinx_dma_tx_descriptor, node); + tail_desc = list_last_entry(&chan->pending_list, + struct xilinx_dma_tx_descriptor, node); + tail_segment = list_last_entry(&tail_desc->segments, + struct xilinx_axidma_tx_segment, 
node); + + old_head = list_first_entry(&head_desc->segments, + struct xilinx_axidma_tx_segment, node); + new_head = chan->seg_v; + /* Copy Buffer Descriptor fields. */ + new_head->hw = old_head->hw; + + /* Swap and save new reserve */ + list_replace_init(&old_head->node, &new_head->node); + chan->seg_v = old_head; + + tail_segment->hw.next_desc = chan->seg_v->phys; + head_desc->async_tx.phys = new_head->phys; + + reg = dma_ctrl_read(chan, XILINX_DMA_REG_DMACR); + + if (chan->desc_pendingcount <= XILINX_DMA_COALESCE_MAX) { + reg &= ~XILINX_DMA_CR_COALESCE_MAX; + reg |= chan->desc_pendingcount << + XILINX_DMA_CR_COALESCE_SHIFT; + dma_ctrl_write(chan, XILINX_DMA_REG_DMACR, reg); + } + + if (chan->has_sg) + dma_ctrl_write(chan, XILINX_DMA_REG_CURDESC, + head_desc->async_tx.phys); + + xilinx_dma_start(chan); + + if (chan->err) + return; + + /* Start the transfer */ + if (chan->has_sg) { + dma_ctrl_write(chan, XILINX_DMA_REG_TAILDESC, + tail_segment->phys); + } else { + struct xilinx_axidma_tx_segment *segment; + struct xilinx_axidma_desc_hw *hw; + + segment = list_first_entry(&head_desc->segments, + struct xilinx_axidma_tx_segment, + node); + hw = &segment->hw; + + dma_ctrl_write(chan, XILINX_DMA_REG_SRCDSTADDR, hw->buf_addr); + + /* Start the transfer */ + dma_ctrl_write(chan, XILINX_DMA_REG_BTT, + hw->control & XILINX_DMA_MAX_TRANS_LEN); + } + + list_splice_tail_init(&chan->pending_list, &chan->active_list); + chan->desc_pendingcount = 0; +} + /** * xilinx_dma_issue_pending - Issue pending transactions * @dchan: DMA channel @@ -763,7 +1014,7 @@ static void xilinx_dma_issue_pending(struct dma_chan *dchan) unsigned long flags; spin_lock_irqsave(&chan->lock, flags); - xilinx_vdma_start_transfer(chan); + chan->start_transfer(chan); spin_unlock_irqrestore(&chan->lock, flags); } @@ -895,7 +1146,7 @@ static irqreturn_t xilinx_dma_irq_handler(int irq, void *data) if (status & XILINX_DMA_DMASR_FRM_CNT_IRQ) { spin_lock(&chan->lock); xilinx_dma_complete_descriptor(chan); - xilinx_vdma_start_transfer(chan); + chan->start_transfer(chan); spin_unlock(&chan->lock); } @@ -913,6 +1164,7 @@ static void append_desc_queue(struct xilinx_dma_chan *chan, { struct xilinx_vdma_tx_segment *tail_segment; struct xilinx_dma_tx_descriptor *tail_desc; + struct xilinx_axidma_tx_segment *axidma_tail_segment; if (list_empty(&chan->pending_list)) goto append; @@ -923,9 +1175,17 @@ static void append_desc_queue(struct xilinx_dma_chan *chan, */ tail_desc = list_last_entry(&chan->pending_list, struct xilinx_dma_tx_descriptor, node); - tail_segment = list_last_entry(&tail_desc->segments, - struct xilinx_vdma_tx_segment, node); - tail_segment->hw.next_desc = (u32)desc->async_tx.phys; + if (chan->xdev->dmatype == XDMA_TYPE_VDMA) { + tail_segment = list_last_entry(&tail_desc->segments, + struct xilinx_vdma_tx_segment, + node); + tail_segment->hw.next_desc = (u32)desc->async_tx.phys; + } else { + axidma_tail_segment = list_last_entry(&tail_desc->segments, + struct xilinx_axidma_tx_segment, + node); + axidma_tail_segment->hw.next_desc = (u32)desc->async_tx.phys; + } /* * Add the software descriptor and all children to the list @@ -935,7 +1195,7 @@ append: list_add_tail(&desc->node, &chan->pending_list); chan->desc_pendingcount++; - if (chan->has_sg && + if (chan->has_sg && (chan->xdev->dmatype == XDMA_TYPE_VDMA) && unlikely(chan->desc_pendingcount > chan->num_frms)) { dev_dbg(chan->dev, "desc pendingcount is too high\n"); chan->desc_pendingcount = chan->num_frms; @@ -1062,6 +1322,109 @@ error: return NULL; } +/** + * 
xilinx_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction + * @dchan: DMA channel + * @sgl: scatterlist to transfer to/from + * @sg_len: number of entries in @scatterlist + * @direction: DMA direction + * @flags: transfer ack flags + * @context: APP words of the descriptor + * + * Return: Async transaction descriptor on success and NULL on failure + */ +static struct dma_async_tx_descriptor *xilinx_dma_prep_slave_sg( + struct dma_chan *dchan, struct scatterlist *sgl, unsigned int sg_len, + enum dma_transfer_direction direction, unsigned long flags, + void *context) +{ + struct xilinx_dma_chan *chan = to_xilinx_chan(dchan); + struct xilinx_dma_tx_descriptor *desc; + struct xilinx_axidma_tx_segment *segment = NULL, *prev = NULL; + u32 *app_w = (u32 *)context; + struct scatterlist *sg; + size_t copy; + size_t sg_used; + unsigned int i; + + if (!is_slave_direction(direction)) + return NULL; + + /* Allocate a transaction descriptor. */ + desc = xilinx_dma_alloc_tx_descriptor(chan); + if (!desc) + return NULL; + + dma_async_tx_descriptor_init(&desc->async_tx, &chan->common); + desc->async_tx.tx_submit = xilinx_dma_tx_submit; + + /* Build transactions using information in the scatter gather list */ + for_each_sg(sgl, sg, sg_len, i) { + sg_used = 0; + + /* Loop until the entire scatterlist entry is used */ + while (sg_used < sg_dma_len(sg)) { + struct xilinx_axidma_desc_hw *hw; + + /* Get a free segment */ + segment = xilinx_axidma_alloc_tx_segment(chan); + if (!segment) + goto error; + + /* + * Calculate the maximum number of bytes to transfer, + * making sure it is less than the hw limit + */ + copy = min_t(size_t, sg_dma_len(sg) - sg_used, + XILINX_DMA_MAX_TRANS_LEN); + hw = &segment->hw; + + /* Fill in the descriptor */ + hw->buf_addr = sg_dma_address(sg) + sg_used; + + hw->control = copy; + + if (chan->direction == DMA_MEM_TO_DEV) { + if (app_w) + memcpy(hw->app, app_w, sizeof(u32) * + XILINX_DMA_NUM_APP_WORDS); + } + + if (prev) + prev->hw.next_desc = segment->phys; + + prev = segment; + sg_used += copy; + + /* + * Insert the segment into the descriptor segments + * list. 
+ */ + list_add_tail(&segment->node, &desc->segments); + } + } + + segment = list_first_entry(&desc->segments, + struct xilinx_axidma_tx_segment, node); + desc->async_tx.phys = segment->phys; + prev->hw.next_desc = segment->phys; + + /* For the last DMA_MEM_TO_DEV transfer, set EOP */ + if (chan->direction == DMA_MEM_TO_DEV) { + segment->hw.control |= XILINX_DMA_BD_SOP; + segment = list_last_entry(&desc->segments, + struct xilinx_axidma_tx_segment, + node); + segment->hw.control |= XILINX_DMA_BD_EOP; + } + + return &desc->async_tx; + +error: + xilinx_dma_free_tx_descriptor(chan, desc); + return NULL; +} + /** * xilinx_dma_terminate_all - Halt the channel and free descriptors * @chan: Driver specific DMA Channel pointer @@ -1224,22 +1587,26 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev, chan->id = 0; chan->ctrl_offset = XILINX_DMA_MM2S_CTRL_OFFSET; - chan->desc_offset = XILINX_VDMA_MM2S_DESC_OFFSET; + if (xdev->dmatype == XDMA_TYPE_VDMA) { + chan->desc_offset = XILINX_VDMA_MM2S_DESC_OFFSET; - if (xdev->flush_on_fsync == XILINX_DMA_FLUSH_BOTH || - xdev->flush_on_fsync == XILINX_DMA_FLUSH_MM2S) - chan->flush_on_fsync = true; + if (xdev->flush_on_fsync == XILINX_DMA_FLUSH_BOTH || + xdev->flush_on_fsync == XILINX_DMA_FLUSH_MM2S) + chan->flush_on_fsync = true; + } } else if (of_device_is_compatible(node, "xlnx,axi-vdma-s2mm-channel")) { chan->direction = DMA_DEV_TO_MEM; chan->id = 1; chan->ctrl_offset = XILINX_DMA_S2MM_CTRL_OFFSET; - chan->desc_offset = XILINX_VDMA_S2MM_DESC_OFFSET; + if (xdev->dmatype == XDMA_TYPE_VDMA) { + chan->desc_offset = XILINX_VDMA_S2MM_DESC_OFFSET; - if (xdev->flush_on_fsync == XILINX_DMA_FLUSH_BOTH || - xdev->flush_on_fsync == XILINX_DMA_FLUSH_S2MM) - chan->flush_on_fsync = true; + if (xdev->flush_on_fsync == XILINX_DMA_FLUSH_BOTH || + xdev->flush_on_fsync == XILINX_DMA_FLUSH_S2MM) + chan->flush_on_fsync = true; + } } else { dev_err(xdev->dev, "Invalid channel compatible node\n"); return -EINVAL; @@ -1254,6 +1621,11 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev, return err; } + if (xdev->dmatype == XDMA_TYPE_AXIDMA) + chan->start_transfer = xilinx_dma_start_transfer; + else + chan->start_transfer = xilinx_vdma_start_transfer; + /* Initialize the tasklet */ tasklet_init(&chan->tasklet, xilinx_dma_do_tasklet, (unsigned long)chan); @@ -1296,6 +1668,15 @@ static struct dma_chan *of_dma_xilinx_xlate(struct of_phandle_args *dma_spec, return dma_get_slave_channel(&xdev->chan[chan_id]->common); } +static const struct of_device_id xilinx_dma_of_ids[] = { + { .compatible = "xlnx,axi-dma-1.00.a", + .data = (void *)XDMA_TYPE_AXIDMA }, + { .compatible = "xlnx,axi-vdma-1.00.a", + .data = (void *)XDMA_TYPE_VDMA }, + {} +}; +MODULE_DEVICE_TABLE(of, xilinx_dma_of_ids); + /** * xilinx_dma_probe - Driver probe function * @pdev: Pointer to the platform_device structure @@ -1317,6 +1698,7 @@ static int xilinx_dma_probe(struct platform_device *pdev) return -ENOMEM; xdev->dev = &pdev->dev; + xdev->dmatype = (enum xdma_ip_type)of_device_get_match_data(&pdev->dev); /* Request and map I/O memory */ io = platform_get_resource(pdev, IORESOURCE_MEM, 0); @@ -1327,16 +1709,21 @@ static int xilinx_dma_probe(struct platform_device *pdev) /* Retrieve the DMA engine properties from the device tree */ xdev->has_sg = of_property_read_bool(node, "xlnx,include-sg"); - err = of_property_read_u32(node, "xlnx,num-fstores", &num_frames); - if (err < 0) { - dev_err(xdev->dev, "missing xlnx,num-fstores property\n"); - return err; - } + if (xdev->dmatype == XDMA_TYPE_VDMA) 
{ + err = of_property_read_u32(node, "xlnx,num-fstores", + &num_frames); + if (err < 0) { + dev_err(xdev->dev, + "missing xlnx,num-fstores property\n"); + return err; + } - err = of_property_read_u32(node, "xlnx,flush-fsync", - &xdev->flush_on_fsync); - if (err < 0) - dev_warn(xdev->dev, "missing xlnx,flush-fsync property\n"); + err = of_property_read_u32(node, "xlnx,flush-fsync", + &xdev->flush_on_fsync); + if (err < 0) + dev_warn(xdev->dev, + "missing xlnx,flush-fsync property\n"); + } err = of_property_read_u32(node, "xlnx,addrwidth", &addr_width); if (err < 0) @@ -1361,11 +1748,18 @@ static int xilinx_dma_probe(struct platform_device *pdev) xilinx_dma_alloc_chan_resources; xdev->common.device_free_chan_resources = xilinx_dma_free_chan_resources; - xdev->common.device_prep_interleaved_dma = - xilinx_vdma_dma_prep_interleaved; xdev->common.device_terminate_all = xilinx_dma_terminate_all; xdev->common.device_tx_status = xilinx_dma_tx_status; xdev->common.device_issue_pending = xilinx_dma_issue_pending; + if (xdev->dmatype == XDMA_TYPE_AXIDMA) { + xdev->common.device_prep_slave_sg = xilinx_dma_prep_slave_sg; + /* Residue calculation is supported by only AXI DMA */ + xdev->common.residue_granularity = + DMA_RESIDUE_GRANULARITY_SEGMENT; + } else { + xdev->common.device_prep_interleaved_dma = + xilinx_vdma_dma_prep_interleaved; + } platform_set_drvdata(pdev, xdev); @@ -1376,9 +1770,11 @@ static int xilinx_dma_probe(struct platform_device *pdev) goto error; } - for (i = 0; i < XILINX_DMA_MAX_CHANS_PER_DEVICE; i++) - if (xdev->chan[i]) - xdev->chan[i]->num_frms = num_frames; + if (xdev->dmatype == XDMA_TYPE_VDMA) { + for (i = 0; i < XILINX_DMA_MAX_CHANS_PER_DEVICE; i++) + if (xdev->chan[i]) + xdev->chan[i]->num_frms = num_frames; + } /* Register the DMA engine with the core */ dma_async_device_register(&xdev->common); @@ -1425,12 +1821,6 @@ static int xilinx_dma_remove(struct platform_device *pdev) return 0; } -static const struct of_device_id xilinx_dma_of_ids[] = { - { .compatible = "xlnx,axi-vdma-1.00.a",}, - {} -}; -MODULE_DEVICE_TABLE(of, xilinx_dma_of_ids); - static struct platform_driver xilinx_vdma_driver = { .driver = { .name = "xilinx-vdma", diff --git a/include/linux/dma/xilinx_dma.h b/include/linux/dma/xilinx_dma.h index 34b98f276ed0..5db17ff8e254 100644 --- a/include/linux/dma/xilinx_dma.h +++ b/include/linux/dma/xilinx_dma.h @@ -41,6 +41,18 @@ struct xilinx_vdma_config { int ext_fsync; }; +/** + * enum xdma_ip_type: DMA IP type. + * + * XDMA_TYPE_AXIDMA: Axi dma ip. + * XDMA_TYPE_VDMA: Axi vdma ip. + * + */ +enum xdma_ip_type { + XDMA_TYPE_AXIDMA = 0, + XDMA_TYPE_VDMA, +}; + int xilinx_vdma_channel_set_config(struct dma_chan *dchan, struct xilinx_vdma_config *cfg); -- cgit v1.2.3 From 07b0e7d49cbcadebad9d3b986f3298e33286dea2 Mon Sep 17 00:00:00 2001 From: Kedareswara rao Appana Date: Thu, 7 Apr 2016 10:59:45 +0530 Subject: dmaengine: vdma: Add Support for Xilinx AXI Central Direct Memory Access Engine This patch adds support for the AXI Central Direct Memory Access (AXI CDMA) core to the existing vdma driver, AXI CDMA is a soft Xilinx IP core that provides high-bandwidth Direct Memory Access(DMA) between a memory-mapped source address and a memory-mapped destination address. 
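A hypothetical dmaengine client driving this memcpy path could look like the sketch below (names invented; assumes the generic dmaengine_prep_dma_memcpy() helper and leaves channel request/teardown out):

    #include <linux/dmaengine.h>

    /* Hypothetical sketch: queue one CDMA copy and kick the channel. */
    static int cdma_copy(struct dma_chan *chan, dma_addr_t dst,
                         dma_addr_t src, size_t len)
    {
            struct dma_async_tx_descriptor *tx;
            dma_cookie_t cookie;

            tx = dmaengine_prep_dma_memcpy(chan, dst, src, len,
                                           DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
            if (!tx)        /* e.g. len == 0 or len > XILINX_DMA_MAX_TRANS_LEN */
                    return -ENOMEM;

            cookie = dmaengine_submit(tx);
            if (dma_submit_error(cookie))
                    return -EIO;

            /* Ends up in xilinx_cdma_start_transfer() added below. */
            dma_async_issue_pending(chan);
            return 0;
    }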
Signed-off-by: Kedareswara rao Appana Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xilinx_vdma.c | 236 ++++++++++++++++++++++++++++++++++++++- include/linux/dma/xilinx_dma.h | 2 + 2 files changed, 236 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/xilinx/xilinx_vdma.c b/drivers/dma/xilinx/xilinx_vdma.c index 983e4bc88cd8..fb481135f27a 100644 --- a/drivers/dma/xilinx/xilinx_vdma.c +++ b/drivers/dma/xilinx/xilinx_vdma.c @@ -21,6 +21,10 @@ * and AXI4-Stream target peripherals. It supports one receive and one * transmit channel, both of them optional at synthesis time. * + * The AXI CDMA, is a soft IP, which provides high-bandwidth Direct Memory + * Access (DMA) between a memory-mapped source address and a memory-mapped + * destination address. + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or @@ -158,6 +162,13 @@ #define XILINX_DMA_COALESCE_MAX 255 #define XILINX_DMA_NUM_APP_WORDS 5 +/* AXI CDMA Specific Registers/Offsets */ +#define XILINX_CDMA_REG_SRCADDR 0x18 +#define XILINX_CDMA_REG_DSTADDR 0x20 + +/* AXI CDMA Specific Masks */ +#define XILINX_CDMA_CR_SGMODE BIT(3) + /** * struct xilinx_vdma_desc_hw - Hardware Descriptor * @next_desc: Next Descriptor Pointer @0x00 @@ -203,6 +214,28 @@ struct xilinx_axidma_desc_hw { u32 app[XILINX_DMA_NUM_APP_WORDS]; } __aligned(64); +/** + * struct xilinx_cdma_desc_hw - Hardware Descriptor + * @next_desc: Next Descriptor Pointer @0x00 + * @pad1: Reserved @0x04 + * @src_addr: Source address @0x08 + * @pad2: Reserved @0x0C + * @dest_addr: Destination address @0x10 + * @pad3: Reserved @0x14 + * @control: Control field @0x18 + * @status: Status field @0x1C + */ +struct xilinx_cdma_desc_hw { + u32 next_desc; + u32 pad1; + u32 src_addr; + u32 pad2; + u32 dest_addr; + u32 pad3; + u32 control; + u32 status; +} __aligned(64); + /** * struct xilinx_vdma_tx_segment - Descriptor segment * @hw: Hardware descriptor @@ -227,6 +260,18 @@ struct xilinx_axidma_tx_segment { dma_addr_t phys; } __aligned(64); +/** + * struct xilinx_cdma_tx_segment - Descriptor segment + * @hw: Hardware descriptor + * @node: Node in the descriptor segments list + * @phys: Physical address of segment + */ +struct xilinx_cdma_tx_segment { + struct xilinx_cdma_desc_hw hw; + struct list_head node; + dma_addr_t phys; +} __aligned(64); + /** * struct xilinx_dma_tx_descriptor - Per Transaction structure * @async_tx: Async transaction descriptor @@ -414,6 +459,28 @@ xilinx_vdma_alloc_tx_segment(struct xilinx_dma_chan *chan) return segment; } +/** + * xilinx_cdma_alloc_tx_segment - Allocate transaction segment + * @chan: Driver specific DMA channel + * + * Return: The allocated segment on success and NULL on failure. 
+ */ +static struct xilinx_cdma_tx_segment * +xilinx_cdma_alloc_tx_segment(struct xilinx_dma_chan *chan) +{ + struct xilinx_cdma_tx_segment *segment; + dma_addr_t phys; + + segment = dma_pool_alloc(chan->desc_pool, GFP_ATOMIC, &phys); + if (!segment) + return NULL; + + memset(segment, 0, sizeof(*segment)); + segment->phys = phys; + + return segment; +} + /** * xilinx_axidma_alloc_tx_segment - Allocate transaction segment * @chan: Driver specific DMA channel @@ -447,6 +514,17 @@ static void xilinx_dma_free_tx_segment(struct xilinx_dma_chan *chan, dma_pool_free(chan->desc_pool, segment, segment->phys); } +/** + * xilinx_cdma_free_tx_segment - Free transaction segment + * @chan: Driver specific DMA channel + * @segment: DMA transaction segment + */ +static void xilinx_cdma_free_tx_segment(struct xilinx_dma_chan *chan, + struct xilinx_cdma_tx_segment *segment) +{ + dma_pool_free(chan->desc_pool, segment, segment->phys); +} + /** * xilinx_vdma_free_tx_segment - Free transaction segment * @chan: Driver specific DMA channel @@ -488,6 +566,7 @@ xilinx_dma_free_tx_descriptor(struct xilinx_dma_chan *chan, struct xilinx_dma_tx_descriptor *desc) { struct xilinx_vdma_tx_segment *segment, *next; + struct xilinx_cdma_tx_segment *cdma_segment, *cdma_next; struct xilinx_axidma_tx_segment *axidma_segment, *axidma_next; if (!desc) @@ -498,6 +577,12 @@ xilinx_dma_free_tx_descriptor(struct xilinx_dma_chan *chan, list_del(&segment->node); xilinx_vdma_free_tx_segment(chan, segment); } + } else if (chan->xdev->dmatype == XDMA_TYPE_CDMA) { + list_for_each_entry_safe(cdma_segment, cdma_next, + &desc->segments, node) { + list_del(&cdma_segment->node); + xilinx_cdma_free_tx_segment(chan, cdma_segment); + } } else { list_for_each_entry_safe(axidma_segment, axidma_next, &desc->segments, node) { @@ -631,6 +716,12 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan) sizeof(struct xilinx_axidma_tx_segment), __alignof__(struct xilinx_axidma_tx_segment), 0); + } else if (chan->xdev->dmatype == XDMA_TYPE_CDMA) { + chan->desc_pool = dma_pool_create("xilinx_cdma_desc_pool", + chan->dev, + sizeof(struct xilinx_cdma_tx_segment), + __alignof__(struct xilinx_cdma_tx_segment), + 0); } else { chan->desc_pool = dma_pool_create("xilinx_vdma_desc_pool", chan->dev, @@ -667,6 +758,10 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan) XILINX_DMA_DMAXR_ALL_IRQ_MASK); } + if ((chan->xdev->dmatype == XDMA_TYPE_CDMA) && chan->has_sg) + dma_ctrl_set(chan, XILINX_DMA_REG_DMACR, + XILINX_CDMA_CR_SGMODE); + return 0; } @@ -919,6 +1014,66 @@ static void xilinx_vdma_start_transfer(struct xilinx_dma_chan *chan) } } +/** + * xilinx_cdma_start_transfer - Starts cdma transfer + * @chan: Driver specific channel struct pointer + */ +static void xilinx_cdma_start_transfer(struct xilinx_dma_chan *chan) +{ + struct xilinx_dma_tx_descriptor *head_desc, *tail_desc; + struct xilinx_cdma_tx_segment *tail_segment; + u32 ctrl_reg = dma_read(chan, XILINX_DMA_REG_DMACR); + + if (chan->err) + return; + + if (list_empty(&chan->pending_list)) + return; + + head_desc = list_first_entry(&chan->pending_list, + struct xilinx_dma_tx_descriptor, node); + tail_desc = list_last_entry(&chan->pending_list, + struct xilinx_dma_tx_descriptor, node); + tail_segment = list_last_entry(&tail_desc->segments, + struct xilinx_cdma_tx_segment, node); + + if (chan->desc_pendingcount <= XILINX_DMA_COALESCE_MAX) { + ctrl_reg &= ~XILINX_DMA_CR_COALESCE_MAX; + ctrl_reg |= chan->desc_pendingcount << + XILINX_DMA_CR_COALESCE_SHIFT; + dma_ctrl_write(chan, 
XILINX_DMA_REG_DMACR, ctrl_reg); + } + + if (chan->has_sg) { + dma_ctrl_write(chan, XILINX_DMA_REG_CURDESC, + head_desc->async_tx.phys); + + /* Update tail ptr register which will start the transfer */ + dma_ctrl_write(chan, XILINX_DMA_REG_TAILDESC, + tail_segment->phys); + } else { + /* In simple mode */ + struct xilinx_cdma_tx_segment *segment; + struct xilinx_cdma_desc_hw *hw; + + segment = list_first_entry(&head_desc->segments, + struct xilinx_cdma_tx_segment, + node); + + hw = &segment->hw; + + dma_ctrl_write(chan, XILINX_CDMA_REG_SRCADDR, hw->src_addr); + dma_ctrl_write(chan, XILINX_CDMA_REG_DSTADDR, hw->dest_addr); + + /* Start the transfer */ + dma_ctrl_write(chan, XILINX_DMA_REG_BTT, + hw->control & XILINX_DMA_MAX_TRANS_LEN); + } + + list_splice_tail_init(&chan->pending_list, &chan->active_list); + chan->desc_pendingcount = 0; +} + /** * xilinx_dma_start_transfer - Starts DMA transfer * @chan: Driver specific channel struct pointer @@ -1165,6 +1320,7 @@ static void append_desc_queue(struct xilinx_dma_chan *chan, struct xilinx_vdma_tx_segment *tail_segment; struct xilinx_dma_tx_descriptor *tail_desc; struct xilinx_axidma_tx_segment *axidma_tail_segment; + struct xilinx_cdma_tx_segment *cdma_tail_segment; if (list_empty(&chan->pending_list)) goto append; @@ -1180,6 +1336,11 @@ static void append_desc_queue(struct xilinx_dma_chan *chan, struct xilinx_vdma_tx_segment, node); tail_segment->hw.next_desc = (u32)desc->async_tx.phys; + } else if (chan->xdev->dmatype == XDMA_TYPE_CDMA) { + cdma_tail_segment = list_last_entry(&tail_desc->segments, + struct xilinx_cdma_tx_segment, + node); + cdma_tail_segment->hw.next_desc = (u32)desc->async_tx.phys; } else { axidma_tail_segment = list_last_entry(&tail_desc->segments, struct xilinx_axidma_tx_segment, @@ -1322,6 +1483,68 @@ error: return NULL; } +/** + * xilinx_cdma_prep_memcpy - prepare descriptors for a memcpy transaction + * @dchan: DMA channel + * @dma_dst: destination address + * @dma_src: source address + * @len: transfer length + * @flags: transfer ack flags + * + * Return: Async transaction descriptor on success and NULL on failure + */ +static struct dma_async_tx_descriptor * +xilinx_cdma_prep_memcpy(struct dma_chan *dchan, dma_addr_t dma_dst, + dma_addr_t dma_src, size_t len, unsigned long flags) +{ + struct xilinx_dma_chan *chan = to_xilinx_chan(dchan); + struct xilinx_dma_tx_descriptor *desc; + struct xilinx_cdma_tx_segment *segment, *prev; + struct xilinx_cdma_desc_hw *hw; + + if (!len || len > XILINX_DMA_MAX_TRANS_LEN) + return NULL; + + desc = xilinx_dma_alloc_tx_descriptor(chan); + if (!desc) + return NULL; + + dma_async_tx_descriptor_init(&desc->async_tx, &chan->common); + desc->async_tx.tx_submit = xilinx_dma_tx_submit; + + /* Allocate the link descriptor from DMA pool */ + segment = xilinx_cdma_alloc_tx_segment(chan); + if (!segment) + goto error; + + hw = &segment->hw; + hw->control = len; + hw->src_addr = dma_src; + hw->dest_addr = dma_dst; + + /* Fill the previous next descriptor with current */ + prev = list_last_entry(&desc->segments, + struct xilinx_cdma_tx_segment, node); + prev->hw.next_desc = segment->phys; + + /* Insert the segment into the descriptor segments list. */ + list_add_tail(&segment->node, &desc->segments); + + prev = segment; + + /* Link the last hardware descriptor with the first. 
*/ + segment = list_first_entry(&desc->segments, + struct xilinx_cdma_tx_segment, node); + desc->async_tx.phys = segment->phys; + prev->hw.next_desc = segment->phys; + + return &desc->async_tx; + +error: + xilinx_dma_free_tx_descriptor(chan, desc); + return NULL; +} + /** * xilinx_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction * @dchan: DMA channel @@ -1623,6 +1846,8 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev, if (xdev->dmatype == XDMA_TYPE_AXIDMA) chan->start_transfer = xilinx_dma_start_transfer; + else if (xdev->dmatype == XDMA_TYPE_CDMA) + chan->start_transfer = xilinx_cdma_start_transfer; else chan->start_transfer = xilinx_vdma_start_transfer; @@ -1671,6 +1896,8 @@ static struct dma_chan *of_dma_xilinx_xlate(struct of_phandle_args *dma_spec, static const struct of_device_id xilinx_dma_of_ids[] = { { .compatible = "xlnx,axi-dma-1.00.a", .data = (void *)XDMA_TYPE_AXIDMA }, + { .compatible = "xlnx,axi-cdma-1.00.a", + .data = (void *)XDMA_TYPE_CDMA }, { .compatible = "xlnx,axi-vdma-1.00.a", .data = (void *)XDMA_TYPE_VDMA }, {} @@ -1741,8 +1968,10 @@ static int xilinx_dma_probe(struct platform_device *pdev) xdev->common.dev = &pdev->dev; INIT_LIST_HEAD(&xdev->common.channels); - dma_cap_set(DMA_SLAVE, xdev->common.cap_mask); - dma_cap_set(DMA_PRIVATE, xdev->common.cap_mask); + if (!(xdev->dmatype == XDMA_TYPE_CDMA)) { + dma_cap_set(DMA_SLAVE, xdev->common.cap_mask); + dma_cap_set(DMA_PRIVATE, xdev->common.cap_mask); + } xdev->common.device_alloc_chan_resources = xilinx_dma_alloc_chan_resources; @@ -1756,6 +1985,9 @@ static int xilinx_dma_probe(struct platform_device *pdev) /* Residue calculation is supported by only AXI DMA */ xdev->common.residue_granularity = DMA_RESIDUE_GRANULARITY_SEGMENT; + } else if (xdev->dmatype == XDMA_TYPE_CDMA) { + dma_cap_set(DMA_MEMCPY, xdev->common.cap_mask); + xdev->common.device_prep_dma_memcpy = xilinx_cdma_prep_memcpy; } else { xdev->common.device_prep_interleaved_dma = xilinx_vdma_dma_prep_interleaved; diff --git a/include/linux/dma/xilinx_dma.h b/include/linux/dma/xilinx_dma.h index 5db17ff8e254..3ae300052553 100644 --- a/include/linux/dma/xilinx_dma.h +++ b/include/linux/dma/xilinx_dma.h @@ -45,11 +45,13 @@ struct xilinx_vdma_config { * enum xdma_ip_type: DMA IP type. * * XDMA_TYPE_AXIDMA: Axi dma ip. + * XDMA_TYPE_CDMA: Axi cdma ip. * XDMA_TYPE_VDMA: Axi vdma ip. * */ enum xdma_ip_type { XDMA_TYPE_AXIDMA = 0, + XDMA_TYPE_CDMA, XDMA_TYPE_VDMA, }; -- cgit v1.2.3 From c5076cfe768998e9d395bc8486b29b18b0f99fd9 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Wed, 11 May 2016 17:34:51 -0500 Subject: PCI, of: Move PCI I/O space management to PCI core code No functional changes in this patch. PCI I/O space mapping code does not depend on OF; therefore it can be moved to PCI core code. This way we will be able to use it, e.g., in ACPI PCI code. 
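A rough sketch of how a host bridge driver consumes the relocated API (hypothetical code, assuming an architecture that defines PCI_IOBASE):

    #include <linux/pci.h>

    /* Hypothetical bridge fragment: record a CPU-physical I/O window,
     * then map it behind the logical port range held in io_res. */
    static int bridge_map_io(struct resource *io_res, phys_addr_t cpu_addr)
    {
            int err;

            err = pci_register_io_range(cpu_addr, resource_size(io_res));
            if (err)
                    return err;

            return pci_remap_iospace(io_res, cpu_addr);
    }

pci_address_to_pio() and pci_pio_to_address() then translate between cpu_addr-based physical addresses and the logical port numbers held in io_res.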
Suggested-by: Lorenzo Pieralisi Signed-off-by: Tomasz Nowicki Signed-off-by: Bjorn Helgaas CC: Arnd Bergmann CC: Liviu Dudau --- drivers/of/address.c | 116 +-------------------------------------------- drivers/pci/pci.c | 115 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/of_address.h | 9 ---- include/linux/pci.h | 5 ++ 4 files changed, 121 insertions(+), 124 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/address.c b/drivers/of/address.c index 91a469d55b8f..0a553c084a81 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -673,121 +674,6 @@ const __be32 *of_get_address(struct device_node *dev, int index, u64 *size, } EXPORT_SYMBOL(of_get_address); -#ifdef PCI_IOBASE -struct io_range { - struct list_head list; - phys_addr_t start; - resource_size_t size; -}; - -static LIST_HEAD(io_range_list); -static DEFINE_SPINLOCK(io_range_lock); -#endif - -/* - * Record the PCI IO range (expressed as CPU physical address + size). - * Return a negative value if an error has occured, zero otherwise - */ -int __weak pci_register_io_range(phys_addr_t addr, resource_size_t size) -{ - int err = 0; - -#ifdef PCI_IOBASE - struct io_range *range; - resource_size_t allocated_size = 0; - - /* check if the range hasn't been previously recorded */ - spin_lock(&io_range_lock); - list_for_each_entry(range, &io_range_list, list) { - if (addr >= range->start && addr + size <= range->start + size) { - /* range already registered, bail out */ - goto end_register; - } - allocated_size += range->size; - } - - /* range not registed yet, check for available space */ - if (allocated_size + size - 1 > IO_SPACE_LIMIT) { - /* if it's too big check if 64K space can be reserved */ - if (allocated_size + SZ_64K - 1 > IO_SPACE_LIMIT) { - err = -E2BIG; - goto end_register; - } - - size = SZ_64K; - pr_warn("Requested IO range too big, new size set to 64K\n"); - } - - /* add the range to the list */ - range = kzalloc(sizeof(*range), GFP_ATOMIC); - if (!range) { - err = -ENOMEM; - goto end_register; - } - - range->start = addr; - range->size = size; - - list_add_tail(&range->list, &io_range_list); - -end_register: - spin_unlock(&io_range_lock); -#endif - - return err; -} - -phys_addr_t pci_pio_to_address(unsigned long pio) -{ - phys_addr_t address = (phys_addr_t)OF_BAD_ADDR; - -#ifdef PCI_IOBASE - struct io_range *range; - resource_size_t allocated_size = 0; - - if (pio > IO_SPACE_LIMIT) - return address; - - spin_lock(&io_range_lock); - list_for_each_entry(range, &io_range_list, list) { - if (pio >= allocated_size && pio < allocated_size + range->size) { - address = range->start + pio - allocated_size; - break; - } - allocated_size += range->size; - } - spin_unlock(&io_range_lock); -#endif - - return address; -} - -unsigned long __weak pci_address_to_pio(phys_addr_t address) -{ -#ifdef PCI_IOBASE - struct io_range *res; - resource_size_t offset = 0; - unsigned long addr = -1; - - spin_lock(&io_range_lock); - list_for_each_entry(res, &io_range_list, list) { - if (address >= res->start && address < res->start + res->size) { - addr = address - res->start + offset; - break; - } - offset += res->size; - } - spin_unlock(&io_range_lock); - - return addr; -#else - if (address > IO_SPACE_LIMIT) - return (unsigned long)-1; - - return (unsigned long) address; -#endif -} - static int __of_address_to_resource(struct device_node *dev, const __be32 *addrp, u64 size, unsigned int flags, const char *name, struct resource *r) diff 
--git a/drivers/pci/pci.c b/drivers/pci/pci.c index 25e0327d4429..bc0c914b8afc 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3021,6 +3021,121 @@ int pci_request_regions_exclusive(struct pci_dev *pdev, const char *res_name) } EXPORT_SYMBOL(pci_request_regions_exclusive); +#ifdef PCI_IOBASE +struct io_range { + struct list_head list; + phys_addr_t start; + resource_size_t size; +}; + +static LIST_HEAD(io_range_list); +static DEFINE_SPINLOCK(io_range_lock); +#endif + +/* + * Record the PCI IO range (expressed as CPU physical address + size). + * Return a negative value if an error has occured, zero otherwise + */ +int __weak pci_register_io_range(phys_addr_t addr, resource_size_t size) +{ + int err = 0; + +#ifdef PCI_IOBASE + struct io_range *range; + resource_size_t allocated_size = 0; + + /* check if the range hasn't been previously recorded */ + spin_lock(&io_range_lock); + list_for_each_entry(range, &io_range_list, list) { + if (addr >= range->start && addr + size <= range->start + size) { + /* range already registered, bail out */ + goto end_register; + } + allocated_size += range->size; + } + + /* range not registed yet, check for available space */ + if (allocated_size + size - 1 > IO_SPACE_LIMIT) { + /* if it's too big check if 64K space can be reserved */ + if (allocated_size + SZ_64K - 1 > IO_SPACE_LIMIT) { + err = -E2BIG; + goto end_register; + } + + size = SZ_64K; + pr_warn("Requested IO range too big, new size set to 64K\n"); + } + + /* add the range to the list */ + range = kzalloc(sizeof(*range), GFP_ATOMIC); + if (!range) { + err = -ENOMEM; + goto end_register; + } + + range->start = addr; + range->size = size; + + list_add_tail(&range->list, &io_range_list); + +end_register: + spin_unlock(&io_range_lock); +#endif + + return err; +} + +phys_addr_t pci_pio_to_address(unsigned long pio) +{ + phys_addr_t address = (phys_addr_t)OF_BAD_ADDR; + +#ifdef PCI_IOBASE + struct io_range *range; + resource_size_t allocated_size = 0; + + if (pio > IO_SPACE_LIMIT) + return address; + + spin_lock(&io_range_lock); + list_for_each_entry(range, &io_range_list, list) { + if (pio >= allocated_size && pio < allocated_size + range->size) { + address = range->start + pio - allocated_size; + break; + } + allocated_size += range->size; + } + spin_unlock(&io_range_lock); +#endif + + return address; +} + +unsigned long __weak pci_address_to_pio(phys_addr_t address) +{ +#ifdef PCI_IOBASE + struct io_range *res; + resource_size_t offset = 0; + unsigned long addr = -1; + + spin_lock(&io_range_lock); + list_for_each_entry(res, &io_range_list, list) { + if (address >= res->start && address < res->start + res->size) { + addr = address - res->start + offset; + break; + } + offset += res->size; + } + spin_unlock(&io_range_lock); + + return addr; +#else + if (address > IO_SPACE_LIMIT) + return (unsigned long)-1; + + return (unsigned long) address; +#endif +} + /** * pci_remap_iospace - Remap the memory mapped I/O space * @res: Resource describing the I/O space diff --git a/include/linux/of_address.h b/include/linux/of_address.h index 01c0a556448b..37864734ca50 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -47,10 +47,6 @@ void __iomem *of_io_request_and_map(struct device_node *device, extern const __be32 *of_get_address(struct device_node *dev, int index, u64 *size, unsigned int *flags); -extern int pci_register_io_range(phys_addr_t addr, resource_size_t size); -extern unsigned long pci_address_to_pio(phys_addr_t addr); -extern phys_addr_t pci_pio_to_address(unsigned long 
pio); - extern int of_pci_range_parser_init(struct of_pci_range_parser *parser, struct device_node *node); extern struct of_pci_range *of_pci_range_parser_one( @@ -86,11 +82,6 @@ static inline const __be32 *of_get_address(struct device_node *dev, int index, return NULL; } -static inline phys_addr_t pci_pio_to_address(unsigned long pio) -{ - return 0; -} - static inline int of_pci_range_parser_init(struct of_pci_range_parser *parser, struct device_node *node) { diff --git a/include/linux/pci.h b/include/linux/pci.h index 004b8133417d..1824ef80e10d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1164,6 +1164,9 @@ int __must_check pci_bus_alloc_resource(struct pci_bus *bus, void *alignf_data); +int pci_register_io_range(phys_addr_t addr, resource_size_t size); +unsigned long pci_address_to_pio(phys_addr_t addr); +phys_addr_t pci_pio_to_address(unsigned long pio); int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr); static inline pci_bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar) @@ -1480,6 +1483,8 @@ static inline int pci_request_regions(struct pci_dev *dev, const char *res_name) { return -EIO; } static inline void pci_release_regions(struct pci_dev *dev) { } +static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; } + static inline void pci_block_cfg_access(struct pci_dev *dev) { } static inline int pci_block_cfg_access_in_atomic(struct pci_dev *dev) { return 0; } -- cgit v1.2.3 From c16aea129bf788127465771cab97134a94af33e4 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 13 Apr 2016 19:11:03 +0300 Subject: net/mlx5: Fix mlx5 ifc cmd_hca_cap bad offsets All reserved fields after early_vf_enable are off by 1, since early_vf_enable was not explicitly declared as array of size 1. Reserved field before cqe_zip had a wrong size, it should be 0x80 + 0x3f. 
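To illustrate the convention being repaired (invented field names, not taken from the file): every member of an *_bits struct carries its width in bits, and each reserved gap is named for the bit offset at which it starts, so a single missing width shifts every later offset:

    struct example_bits {
            u8 flag_a[0x1];          /* bit 0x0, width 1 */
            u8 reserved_at_1[0x7];   /* bits 0x1..0x7 */
            u8 field_b[0x8];         /* bits 0x8..0xf */
            u8 reserved_at_10[0x10]; /* bits 0x10..0x1f */
    };

    /* Writing "u8 flag_a;" instead of "u8 flag_a[0x1];", as happened
     * with early_vf_enable, makes every reserved_at_* name after it
     * claim an offset one bit below where the field actually sits. */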
Fixes: b0844444590e ("net/mlx5_core: Introduce access function to read internal timer ") Fixes: b4ff3a36d3e4 ("net/mlx5: Use offset based reserved field names in the IFC header file") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Matan Barak Acked-by: Or Gerlitz Signed-off-by: Doug Ledford --- include/linux/mlx5/mlx5_ifc.h | 107 ++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c15b8a864937..c300e7491d80 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -750,21 +750,21 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 ets[0x1]; u8 nic_flow_table[0x1]; u8 eswitch_flow_table[0x1]; - u8 early_vf_enable; - u8 reserved_at_1a8[0x2]; + u8 early_vf_enable[0x1]; + u8 reserved_at_1a9[0x2]; u8 local_ca_ack_delay[0x5]; u8 reserved_at_1af[0x6]; u8 port_type[0x2]; u8 num_ports[0x8]; - u8 reserved_at_1bf[0x3]; + u8 reserved_at_1c0[0x3]; u8 log_max_msg[0x5]; - u8 reserved_at_1c7[0x4]; + u8 reserved_at_1c8[0x4]; u8 max_tc[0x4]; - u8 reserved_at_1cf[0x6]; + u8 reserved_at_1d0[0x6]; u8 rol_s[0x1]; u8 rol_g[0x1]; - u8 reserved_at_1d7[0x1]; + u8 reserved_at_1d8[0x1]; u8 wol_s[0x1]; u8 wol_g[0x1]; u8 wol_a[0x1]; @@ -774,47 +774,47 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 wol_p[0x1]; u8 stat_rate_support[0x10]; - u8 reserved_at_1ef[0xc]; + u8 reserved_at_1f0[0xc]; u8 cqe_version[0x4]; u8 compact_address_vector[0x1]; u8 reserved_at_200[0x3]; u8 ipoib_basic_offloads[0x1]; - u8 reserved_at_204[0xa]; + u8 reserved_at_205[0xa]; u8 drain_sigerr[0x1]; u8 cmdif_checksum[0x2]; u8 sigerr_cqe[0x1]; - u8 reserved_at_212[0x1]; + u8 reserved_at_213[0x1]; u8 wq_signature[0x1]; u8 sctr_data_cqe[0x1]; - u8 reserved_at_215[0x1]; + u8 reserved_at_216[0x1]; u8 sho[0x1]; u8 tph[0x1]; u8 rf[0x1]; u8 dct[0x1]; - u8 reserved_at_21a[0x1]; + u8 reserved_at_21b[0x1]; u8 eth_net_offloads[0x1]; u8 roce[0x1]; u8 atomic[0x1]; - u8 reserved_at_21e[0x1]; + u8 reserved_at_21f[0x1]; u8 cq_oi[0x1]; u8 cq_resize[0x1]; u8 cq_moderation[0x1]; - u8 reserved_at_222[0x3]; + u8 reserved_at_223[0x3]; u8 cq_eq_remap[0x1]; u8 pg[0x1]; u8 block_lb_mc[0x1]; - u8 reserved_at_228[0x1]; + u8 reserved_at_229[0x1]; u8 scqe_break_moderation[0x1]; u8 reserved_at_22a[0x1]; u8 cd[0x1]; - u8 reserved_at_22c[0x1]; + u8 reserved_at_22d[0x1]; u8 apm[0x1]; u8 vector_calc[0x1]; u8 reserved_at_22f[0x1]; u8 imaicl[0x1]; - u8 reserved_at_231[0x4]; + u8 reserved_at_232[0x4]; u8 qkv[0x1]; u8 pkv[0x1]; u8 set_deth_sqpn[0x1]; @@ -824,98 +824,101 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 uc[0x1]; u8 rc[0x1]; - u8 reserved_at_23f[0xa]; + u8 reserved_at_240[0xa]; u8 uar_sz[0x6]; - u8 reserved_at_24f[0x8]; + u8 reserved_at_250[0x8]; u8 log_pg_sz[0x8]; u8 bf[0x1]; - u8 reserved_at_260[0x1]; + u8 reserved_at_261[0x1]; u8 pad_tx_eth_packet[0x1]; - u8 reserved_at_262[0x8]; + u8 reserved_at_263[0x8]; u8 log_bf_reg_size[0x5]; - u8 reserved_at_26f[0x10]; + u8 reserved_at_270[0x10]; - u8 reserved_at_27f[0x10]; + u8 reserved_at_280[0x10]; u8 max_wqe_sz_sq[0x10]; - u8 reserved_at_29f[0x10]; + u8 reserved_at_2a0[0x10]; u8 max_wqe_sz_rq[0x10]; - u8 reserved_at_2bf[0x10]; + u8 reserved_at_2c0[0x10]; u8 max_wqe_sz_sq_dc[0x10]; - u8 reserved_at_2df[0x7]; + u8 reserved_at_2e0[0x7]; u8 max_qp_mcg[0x19]; - u8 reserved_at_2ff[0x18]; + u8 reserved_at_300[0x18]; u8 log_max_mcg[0x8]; - u8 reserved_at_31f[0x3]; + u8 reserved_at_320[0x3]; u8 log_max_transport_domain[0x5]; - u8 reserved_at_327[0x3]; + u8 
reserved_at_328[0x3]; u8 log_max_pd[0x5]; - u8 reserved_at_32f[0xb]; + u8 reserved_at_330[0xb]; u8 log_max_xrcd[0x5]; - u8 reserved_at_33f[0x20]; + u8 reserved_at_340[0x20]; - u8 reserved_at_35f[0x3]; + u8 reserved_at_360[0x3]; u8 log_max_rq[0x5]; - u8 reserved_at_367[0x3]; + u8 reserved_at_368[0x3]; u8 log_max_sq[0x5]; - u8 reserved_at_36f[0x3]; + u8 reserved_at_370[0x3]; u8 log_max_tir[0x5]; - u8 reserved_at_377[0x3]; + u8 reserved_at_378[0x3]; u8 log_max_tis[0x5]; u8 basic_cyclic_rcv_wqe[0x1]; - u8 reserved_at_380[0x2]; + u8 reserved_at_381[0x2]; u8 log_max_rmp[0x5]; - u8 reserved_at_387[0x3]; + u8 reserved_at_388[0x3]; u8 log_max_rqt[0x5]; - u8 reserved_at_38f[0x3]; + u8 reserved_at_390[0x3]; u8 log_max_rqt_size[0x5]; - u8 reserved_at_397[0x3]; + u8 reserved_at_398[0x3]; u8 log_max_tis_per_sq[0x5]; - u8 reserved_at_39f[0x3]; + u8 reserved_at_3a0[0x3]; u8 log_max_stride_sz_rq[0x5]; - u8 reserved_at_3a7[0x3]; + u8 reserved_at_3a8[0x3]; u8 log_min_stride_sz_rq[0x5]; - u8 reserved_at_3af[0x3]; + u8 reserved_at_3b0[0x3]; u8 log_max_stride_sz_sq[0x5]; - u8 reserved_at_3b7[0x3]; + u8 reserved_at_3b8[0x3]; u8 log_min_stride_sz_sq[0x5]; - u8 reserved_at_3bf[0x1b]; + u8 reserved_at_3c0[0x1b]; u8 log_max_wq_sz[0x5]; u8 nic_vport_change_event[0x1]; - u8 reserved_at_3e0[0xa]; + u8 reserved_at_3e1[0xa]; u8 log_max_vlan_list[0x5]; - u8 reserved_at_3ef[0x3]; + u8 reserved_at_3f0[0x3]; u8 log_max_current_mc_list[0x5]; - u8 reserved_at_3f7[0x3]; + u8 reserved_at_3f8[0x3]; u8 log_max_current_uc_list[0x5]; - u8 reserved_at_3ff[0x80]; + u8 reserved_at_400[0x80]; - u8 reserved_at_47f[0x3]; + u8 reserved_at_480[0x3]; u8 log_max_l2_table[0x5]; - u8 reserved_at_487[0x8]; + u8 reserved_at_488[0x8]; u8 log_uar_page_sz[0x10]; - u8 reserved_at_49f[0x20]; + u8 reserved_at_4a0[0x20]; u8 device_frequency_mhz[0x20]; u8 device_frequency_khz[0x20]; - u8 reserved_at_4ff[0x5f]; + + u8 reserved_at_500[0x80]; + + u8 reserved_at_580[0x3f]; u8 cqe_zip[0x1]; u8 cqe_zip_timeout[0x10]; u8 cqe_zip_max_num[0x10]; - u8 reserved_at_57f[0x220]; + u8 reserved_at_5e0[0x220]; }; enum mlx5_flow_destination_type { -- cgit v1.2.3 From 80835cba4b857485b068e1ce83512e896e6f001e Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 13 Apr 2016 19:11:04 +0300 Subject: net/mlx5: Update mlx5_ifc hardware features Adding the needed mlx5_ifc hardware bits and structs for the following features: * Add vport to steering commands for SRIOV ACL support * Add mlcr, pcmr and mcia registers for dump module EEPROM * Add support for FCS, beacon led and disable_link bits to hca caps * Add CQE period mode bit in CQ context for CQE based CQ moderation support * Add umr SQ bit for fragmented memory registration * Add needed bits and caps for Striding RQ support In-order to avoid possible future conflicts between rdma and net-next we added all expected updates to this file for this release. If more changes will be submitted, we plan to do it only through one of the subsystems, probably net-next. All updated bits in this patch will be later used in the up-coming submissions to net-next and rdma trees. 
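As one example of the intended use, the new other_vport/vport_number fields of the steering commands are filled through the existing MLX5_SET() accessors; a hypothetical fragment (usage invented for illustration, not part of this patch):

    #include <linux/mlx5/device.h>
    #include <linux/mlx5/mlx5_ifc.h>

    /* Hypothetical: aim CREATE_FLOW_TABLE at another vport, e.g. for
     * the SRIOV ACL support this series prepares for. */
    static void set_flow_table_vport(u32 *in, u16 vport)
    {
            MLX5_SET(create_flow_table_in, in, opcode,
                     MLX5_CMD_OP_CREATE_FLOW_TABLE);
            MLX5_SET(create_flow_table_in, in, other_vport, 1);
            MLX5_SET(create_flow_table_in, in, vport_number, vport);
    }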
Signed-off-by: Saeed Mahameed Signed-off-by: Matan Barak Acked-by: Or Gerlitz Signed-off-by: Doug Ledford --- include/linux/mlx5/mlx5_ifc.h | 146 +++++++++++++++++++++++++++++++++++------- 1 file changed, 124 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c300e7491d80..4ce4ea422a10 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -513,7 +513,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 max_lso_cap[0x5]; u8 reserved_at_10[0x4]; u8 rss_ind_tbl_cap[0x4]; - u8 reserved_at_18[0x3]; + u8 reg_umr_sq[0x1]; + u8 scatter_fcs[0x1]; + u8 reserved_at_1a[0x1]; u8 tunnel_lso_const_out_ip_id[0x1]; u8 reserved_at_1c[0x2]; u8 tunnel_statless_gre[0x1]; @@ -648,7 +650,7 @@ struct mlx5_ifc_vector_calc_cap_bits { enum { MLX5_WQ_TYPE_LINKED_LIST = 0x0, MLX5_WQ_TYPE_CYCLIC = 0x1, - MLX5_WQ_TYPE_STRQ = 0x2, + MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ = 0x2, }; enum { @@ -753,7 +755,11 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 early_vf_enable[0x1]; u8 reserved_at_1a9[0x2]; u8 local_ca_ack_delay[0x5]; - u8 reserved_at_1af[0x6]; + u8 reserved_at_1af[0x2]; + u8 ports_check[0x1]; + u8 reserved_at_1b2[0x1]; + u8 disable_link_up[0x1]; + u8 beacon_led[0x1]; u8 port_type[0x2]; u8 num_ports[0x8]; @@ -778,7 +784,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cqe_version[0x4]; u8 compact_address_vector[0x1]; - u8 reserved_at_200[0x3]; + u8 striding_rq[0x1]; + u8 reserved_at_201[0x2]; u8 ipoib_basic_offloads[0x1]; u8 reserved_at_205[0xa]; u8 drain_sigerr[0x1]; @@ -807,12 +814,12 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 block_lb_mc[0x1]; u8 reserved_at_229[0x1]; u8 scqe_break_moderation[0x1]; - u8 reserved_at_22a[0x1]; + u8 cq_period_start_from_cqe[0x1]; u8 cd[0x1]; u8 reserved_at_22d[0x1]; u8 apm[0x1]; u8 vector_calc[0x1]; - u8 reserved_at_22f[0x1]; + u8 umr_ptr_rlky[0x1]; u8 imaicl[0x1]; u8 reserved_at_232[0x4]; u8 qkv[0x1]; @@ -913,10 +920,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_500[0x80]; u8 reserved_at_580[0x3f]; - u8 cqe_zip[0x1]; + u8 cqe_compression[0x1]; - u8 cqe_zip_timeout[0x10]; - u8 cqe_zip_max_num[0x10]; + u8 cqe_compression_timeout[0x10]; + u8 cqe_compression_max_num[0x10]; u8 reserved_at_5e0[0x220]; }; @@ -1000,7 +1007,13 @@ struct mlx5_ifc_wq_bits { u8 reserved_at_118[0x3]; u8 log_wq_sz[0x5]; - u8 reserved_at_120[0x4e0]; + u8 reserved_at_120[0x15]; + u8 log_wqe_num_of_strides[0x3]; + u8 two_byte_shift_en[0x1]; + u8 reserved_at_139[0x4]; + u8 log_wqe_stride_size[0x3]; + + u8 reserved_at_140[0x4c0]; struct mlx5_ifc_cmd_pas_bits pas[0]; }; @@ -2199,7 +2212,8 @@ struct mlx5_ifc_sqc_bits { u8 flush_in_error_en[0x1]; u8 reserved_at_4[0x4]; u8 state[0x4]; - u8 reserved_at_c[0x14]; + u8 reg_umr[0x1]; + u8 reserved_at_d[0x13]; u8 reserved_at_20[0x8]; u8 user_index[0x18]; @@ -2247,7 +2261,8 @@ enum { struct mlx5_ifc_rqc_bits { u8 rlky[0x1]; - u8 reserved_at_1[0x2]; + u8 reserved_at_1[0x1]; + u8 scatter_fcs[0x1]; u8 vsd[0x1]; u8 mem_rq_type[0x4]; u8 state[0x4]; @@ -2604,6 +2619,11 @@ enum { MLX5_CQC_ST_FIRED = 0xa, }; +enum { + MLX5_CQ_PERIOD_MODE_START_FROM_EQE = 0x0, + MLX5_CQ_PERIOD_MODE_START_FROM_CQE = 0x1, +}; + struct mlx5_ifc_cqc_bits { u8 status[0x4]; u8 reserved_at_4[0x4]; @@ -2612,8 +2632,8 @@ struct mlx5_ifc_cqc_bits { u8 reserved_at_c[0x1]; u8 scqe_break_moderation_en[0x1]; u8 oi[0x1]; - u8 reserved_at_f[0x2]; - u8 cqe_zip_en[0x1]; + u8 cq_period_mode[0x2]; + u8 cqe_comp_en[0x1]; u8 mini_cqe_res_format[0x2]; u8 st[0x4]; u8 reserved_at_18[0x8]; @@ -2987,7 
+3007,11 @@ struct mlx5_ifc_set_fte_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x40]; + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; u8 table_type[0x8]; u8 reserved_at_88[0x18]; @@ -5181,7 +5205,11 @@ struct mlx5_ifc_destroy_flow_table_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x40]; + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; u8 table_type[0x8]; u8 reserved_at_88[0x18]; @@ -5208,7 +5236,11 @@ struct mlx5_ifc_destroy_flow_group_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x40]; + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; u8 table_type[0x8]; u8 reserved_at_88[0x18]; @@ -5349,7 +5381,11 @@ struct mlx5_ifc_delete_fte_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x40]; + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; u8 table_type[0x8]; u8 reserved_at_88[0x18]; @@ -5795,7 +5831,11 @@ struct mlx5_ifc_create_flow_table_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x40]; + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; u8 table_type[0x8]; u8 reserved_at_88[0x18]; @@ -5839,7 +5879,11 @@ struct mlx5_ifc_create_flow_group_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x40]; + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; u8 table_type[0x8]; u8 reserved_at_88[0x18]; @@ -6372,6 +6416,17 @@ struct mlx5_ifc_ptys_reg_bits { u8 reserved_at_1a0[0x60]; }; +struct mlx5_ifc_mlcr_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x20]; + + u8 beacon_duration[0x10]; + u8 reserved_at_40[0x10]; + + u8 beacon_remain[0x10]; +}; + struct mlx5_ifc_ptas_reg_bits { u8 reserved_at_0[0x20]; @@ -6781,6 +6836,16 @@ struct mlx5_ifc_pamp_reg_bits { u8 index_data[18][0x10]; }; +struct mlx5_ifc_pcmr_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x2e]; + u8 fcs_cap[0x1]; + u8 reserved_at_3f[0x1f]; + u8 fcs_chk[0x1]; + u8 reserved_at_5f[0x1]; +}; + struct mlx5_ifc_lane_2_module_mapping_bits { u8 reserved_at_0[0x6]; u8 rx_lane[0x2]; @@ -7117,6 +7182,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_pspa_reg_bits pspa_reg; struct mlx5_ifc_ptas_reg_bits ptas_reg; struct mlx5_ifc_ptys_reg_bits ptys_reg; + struct mlx5_ifc_mlcr_reg_bits mlcr_reg; struct mlx5_ifc_pude_reg_bits pude_reg; struct mlx5_ifc_pvlc_reg_bits pvlc_reg; struct mlx5_ifc_slrg_reg_bits slrg_reg; @@ -7150,7 +7216,11 @@ struct mlx5_ifc_set_flow_table_root_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x40]; + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; u8 table_type[0x8]; u8 reserved_at_88[0x18]; @@ -7181,7 +7251,9 @@ struct mlx5_ifc_modify_flow_table_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x20]; + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; u8 reserved_at_60[0x10]; u8 modify_field_select[0x10]; @@ -7247,4 +7319,34 @@ struct mlx5_ifc_qtct_reg_bits { u8 tclass[0x3]; }; +struct mlx5_ifc_mcia_reg_bits { + u8 l[0x1]; + u8 reserved_at_1[0x7]; + u8 module[0x8]; + u8 reserved_at_10[0x8]; + u8 status[0x8]; + + u8 i2c_device_address[0x8]; + u8 
page_number[0x8]; + u8 device_address[0x10]; + + u8 reserved_at_40[0x10]; + u8 size[0x10]; + + u8 reserved_at_60[0x20]; + + u8 dword_0[0x20]; + u8 dword_1[0x20]; + u8 dword_2[0x20]; + u8 dword_3[0x20]; + u8 dword_4[0x20]; + u8 dword_5[0x20]; + u8 dword_6[0x20]; + u8 dword_7[0x20]; + u8 dword_8[0x20]; + u8 dword_9[0x20]; + u8 dword_10[0x20]; + u8 dword_11[0x20]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 2ab71a02c56f8244ac611b5c6e6603c6fe83b966 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Mon, 25 Jan 2016 09:50:29 +0100 Subject: MIPS: BCM47xx: Move SPROM driver to drivers/firmware/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Broadcom ARM home routers store SPROM content in NVRAM just like MIPS ones. To share SPROM code we need to move it out of arch/mips/ to some common place. We already have bcm47xx_nvram in firmware path and SPROM should fit there as well. This driver is responsible for parsing SoC configuration data into a struct shared between ssb and bcma buses. This was tested with BCM4706 & BCM5357C0 (BCM47XX) and BCM4708A0 (ARCH_BCM_5301X). Signed-off-by: Rafał Miłecki Cc: Hauke Mehrtens Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/12210/ Signed-off-by: Ralf Baechle --- arch/mips/Kconfig | 1 + arch/mips/bcm47xx/Makefile | 2 +- arch/mips/bcm47xx/bcm47xx_private.h | 3 - arch/mips/bcm47xx/setup.c | 2 +- arch/mips/bcm47xx/sprom.c | 724 ----------------------------- drivers/firmware/broadcom/Kconfig | 11 + drivers/firmware/broadcom/Makefile | 1 + drivers/firmware/broadcom/bcm47xx_sprom.c | 737 ++++++++++++++++++++++++++++++ include/linux/bcm47xx_sprom.h | 24 + 9 files changed, 776 insertions(+), 729 deletions(-) delete mode 100644 arch/mips/bcm47xx/sprom.c create mode 100644 drivers/firmware/broadcom/bcm47xx_sprom.c create mode 100644 include/linux/bcm47xx_sprom.h (limited to 'include/linux') diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 2352917a47a1..882e73c3334e 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -197,6 +197,7 @@ config BCM47XX select GPIOLIB select LEDS_GPIO_REGISTER select BCM47XX_NVRAM + select BCM47XX_SPROM help Support for BCM47XX based boards diff --git a/arch/mips/bcm47xx/Makefile b/arch/mips/bcm47xx/Makefile index 66bea4ecf449..6d8615074075 100644 --- a/arch/mips/bcm47xx/Makefile +++ b/arch/mips/bcm47xx/Makefile @@ -3,5 +3,5 @@ # under Linux. 
# -obj-y += irq.o prom.o serial.o setup.o time.o sprom.o +obj-y += irq.o prom.o serial.o setup.o time.o obj-y += board.o buttons.o leds.o workarounds.o diff --git a/arch/mips/bcm47xx/bcm47xx_private.h b/arch/mips/bcm47xx/bcm47xx_private.h index 41796befa9df..0367ac7286fe 100644 --- a/arch/mips/bcm47xx/bcm47xx_private.h +++ b/arch/mips/bcm47xx/bcm47xx_private.h @@ -10,9 +10,6 @@ /* prom.c */ void __init bcm47xx_prom_highmem_init(void); -/* sprom.c */ -void bcm47xx_sprom_register_fallbacks(void); - /* buttons.c */ int __init bcm47xx_buttons_register(void); diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index c807e32d6d81..6054d49e608e 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -28,6 +28,7 @@ #include "bcm47xx_private.h" +#include #include #include #include @@ -151,7 +152,6 @@ void __init plat_mem_setup(void) pr_info("Using bcma bus\n"); #ifdef CONFIG_BCM47XX_BCMA bcm47xx_bus_type = BCM47XX_BUS_TYPE_BCMA; - bcm47xx_sprom_register_fallbacks(); bcm47xx_register_bcma(); bcm47xx_set_system_type(bcm47xx_bus.bcma.bus.chipinfo.id); #ifdef CONFIG_HIGHMEM diff --git a/arch/mips/bcm47xx/sprom.c b/arch/mips/bcm47xx/sprom.c deleted file mode 100644 index ca7ad131d057..000000000000 --- a/arch/mips/bcm47xx/sprom.c +++ /dev/null @@ -1,724 +0,0 @@ -/* - * Copyright (C) 2004 Florian Schirmer - * Copyright (C) 2006 Felix Fietkau - * Copyright (C) 2006 Michael Buesch - * Copyright (C) 2010 Waldemar Brodkorb - * Copyright (C) 2010-2012 Hauke Mehrtens - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN - * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include -#include -#include - -static void create_key(const char *prefix, const char *postfix, - const char *name, char *buf, int len) -{ - if (prefix && postfix) - snprintf(buf, len, "%s%s%s", prefix, name, postfix); - else if (prefix) - snprintf(buf, len, "%s%s", prefix, name); - else if (postfix) - snprintf(buf, len, "%s%s", name, postfix); - else - snprintf(buf, len, "%s", name); -} - -static int get_nvram_var(const char *prefix, const char *postfix, - const char *name, char *buf, int len, bool fallback) -{ - char key[40]; - int err; - - create_key(prefix, postfix, name, key, sizeof(key)); - - err = bcm47xx_nvram_getenv(key, buf, len); - if (fallback && err == -ENOENT && prefix) { - create_key(NULL, postfix, name, key, sizeof(key)); - err = bcm47xx_nvram_getenv(key, buf, len); - } - return err; -} - -#define NVRAM_READ_VAL(type) \ -static void nvram_read_ ## type(const char *prefix, \ - const char *postfix, const char *name, \ - type *val, type allset, bool fallback) \ -{ \ - char buf[100]; \ - int err; \ - type var; \ - \ - err = get_nvram_var(prefix, postfix, name, buf, sizeof(buf), \ - fallback); \ - if (err < 0) \ - return; \ - err = kstrto ## type(strim(buf), 0, &var); \ - if (err) { \ - pr_warn("can not parse nvram name %s%s%s with value %s got %i\n", \ - prefix, name, postfix, buf, err); \ - return; \ - } \ - if (allset && var == allset) \ - return; \ - *val = var; \ -} - -NVRAM_READ_VAL(u8) -NVRAM_READ_VAL(s8) -NVRAM_READ_VAL(u16) -NVRAM_READ_VAL(u32) - -#undef NVRAM_READ_VAL - -static void nvram_read_u32_2(const char *prefix, const char *name, - u16 *val_lo, u16 *val_hi, bool fallback) -{ - char buf[100]; - int err; - u32 val; - - err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback); - if (err < 0) - return; - err = kstrtou32(strim(buf), 0, &val); - if (err) { - pr_warn("can not parse nvram name %s%s with value %s got %i\n", - prefix, name, buf, err); - return; - } - *val_lo = (val & 0x0000FFFFU); - *val_hi = (val & 0xFFFF0000U) >> 16; -} - -static void nvram_read_leddc(const char *prefix, const char *name, - u8 *leddc_on_time, u8 *leddc_off_time, - bool fallback) -{ - char buf[100]; - int err; - u32 val; - - err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback); - if (err < 0) - return; - err = kstrtou32(strim(buf), 0, &val); - if (err) { - pr_warn("can not parse nvram name %s%s with value %s got %i\n", - prefix, name, buf, err); - return; - } - - if (val == 0xffff || val == 0xffffffff) - return; - - *leddc_on_time = val & 0xff; - *leddc_off_time = (val >> 16) & 0xff; -} - -static void bcm47xx_nvram_parse_macaddr(char *buf, u8 macaddr[6]) -{ - if (strchr(buf, ':')) - sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", &macaddr[0], - &macaddr[1], &macaddr[2], &macaddr[3], &macaddr[4], - &macaddr[5]); - else if (strchr(buf, '-')) - sscanf(buf, "%hhx-%hhx-%hhx-%hhx-%hhx-%hhx", &macaddr[0], - &macaddr[1], &macaddr[2], &macaddr[3], &macaddr[4], - &macaddr[5]); - else - pr_warn("Can not parse mac address: %s\n", buf); -} - -static void nvram_read_macaddr(const char *prefix, const char *name, - u8 val[6], bool fallback) -{ - char buf[100]; - int err; - - err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback); - if (err < 0) - return; - - bcm47xx_nvram_parse_macaddr(buf, val); -} - -static void nvram_read_alpha2(const char *prefix, const char *name, - char val[2], bool fallback) -{ - char buf[10]; - int err; - - err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback); - if (err < 0) - return; - if (buf[0] == '0') - return; - if 
(strlen(buf) > 2) { - pr_warn("alpha2 is too long %s\n", buf); - return; - } - memcpy(val, buf, 2); -} - -/* This is one-function-only macro, it uses local "sprom" variable! */ -#define ENTRY(_revmask, _type, _prefix, _name, _val, _allset, _fallback) \ - if (_revmask & BIT(sprom->revision)) \ - nvram_read_ ## _type(_prefix, NULL, _name, &sprom->_val, \ - _allset, _fallback) -/* - * Special version of filling function that can be safely called for any SPROM - * revision. For every NVRAM to SPROM mapping it contains bitmask of revisions - * for which the mapping is valid. - * It obviously requires some hexadecimal/bitmasks knowledge, but allows - * writing cleaner code (easy revisions handling). - * Note that while SPROM revision 0 was never used, we still keep BIT(0) - * reserved for it, just to keep numbering sane. - */ -static void bcm47xx_sprom_fill_auto(struct ssb_sprom *sprom, - const char *prefix, bool fallback) -{ - const char *pre = prefix; - bool fb = fallback; - - /* Broadcom extracts it for rev 8+ but it was found on 2 and 4 too */ - ENTRY(0xfffffffe, u16, pre, "devid", dev_id, 0, fallback); - - ENTRY(0xfffffffe, u16, pre, "boardrev", board_rev, 0, true); - ENTRY(0xfffffffe, u32, pre, "boardflags", boardflags, 0, fb); - ENTRY(0xfffffff0, u32, pre, "boardflags2", boardflags2, 0, fb); - ENTRY(0xfffff800, u32, pre, "boardflags3", boardflags3, 0, fb); - ENTRY(0x00000002, u16, pre, "boardflags", boardflags_lo, 0, fb); - ENTRY(0xfffffffc, u16, pre, "boardtype", board_type, 0, true); - ENTRY(0xfffffffe, u16, pre, "boardnum", board_num, 0, fb); - ENTRY(0x00000002, u8, pre, "cc", country_code, 0, fb); - ENTRY(0xfffffff8, u8, pre, "regrev", regrev, 0, fb); - - ENTRY(0xfffffffe, u8, pre, "ledbh0", gpio0, 0xff, fb); - ENTRY(0xfffffffe, u8, pre, "ledbh1", gpio1, 0xff, fb); - ENTRY(0xfffffffe, u8, pre, "ledbh2", gpio2, 0xff, fb); - ENTRY(0xfffffffe, u8, pre, "ledbh3", gpio3, 0xff, fb); - - ENTRY(0x0000070e, u16, pre, "pa0b0", pa0b0, 0, fb); - ENTRY(0x0000070e, u16, pre, "pa0b1", pa0b1, 0, fb); - ENTRY(0x0000070e, u16, pre, "pa0b2", pa0b2, 0, fb); - ENTRY(0x0000070e, u8, pre, "pa0itssit", itssi_bg, 0, fb); - ENTRY(0x0000070e, u8, pre, "pa0maxpwr", maxpwr_bg, 0, fb); - - ENTRY(0x0000070c, u8, pre, "opo", opo, 0, fb); - ENTRY(0xfffffffe, u8, pre, "aa2g", ant_available_bg, 0, fb); - ENTRY(0xfffffffe, u8, pre, "aa5g", ant_available_a, 0, fb); - ENTRY(0x000007fe, s8, pre, "ag0", antenna_gain.a0, 0, fb); - ENTRY(0x000007fe, s8, pre, "ag1", antenna_gain.a1, 0, fb); - ENTRY(0x000007f0, s8, pre, "ag2", antenna_gain.a2, 0, fb); - ENTRY(0x000007f0, s8, pre, "ag3", antenna_gain.a3, 0, fb); - - ENTRY(0x0000070e, u16, pre, "pa1b0", pa1b0, 0, fb); - ENTRY(0x0000070e, u16, pre, "pa1b1", pa1b1, 0, fb); - ENTRY(0x0000070e, u16, pre, "pa1b2", pa1b2, 0, fb); - ENTRY(0x0000070c, u16, pre, "pa1lob0", pa1lob0, 0, fb); - ENTRY(0x0000070c, u16, pre, "pa1lob1", pa1lob1, 0, fb); - ENTRY(0x0000070c, u16, pre, "pa1lob2", pa1lob2, 0, fb); - ENTRY(0x0000070c, u16, pre, "pa1hib0", pa1hib0, 0, fb); - ENTRY(0x0000070c, u16, pre, "pa1hib1", pa1hib1, 0, fb); - ENTRY(0x0000070c, u16, pre, "pa1hib2", pa1hib2, 0, fb); - ENTRY(0x0000070e, u8, pre, "pa1itssit", itssi_a, 0, fb); - ENTRY(0x0000070e, u8, pre, "pa1maxpwr", maxpwr_a, 0, fb); - ENTRY(0x0000070c, u8, pre, "pa1lomaxpwr", maxpwr_al, 0, fb); - ENTRY(0x0000070c, u8, pre, "pa1himaxpwr", maxpwr_ah, 0, fb); - - ENTRY(0x00000708, u8, pre, "bxa2g", bxa2g, 0, fb); - ENTRY(0x00000708, u8, pre, "rssisav2g", rssisav2g, 0, fb); - ENTRY(0x00000708, u8, pre, "rssismc2g", rssismc2g, 0, fb); 
- ENTRY(0x00000708, u8, pre, "rssismf2g", rssismf2g, 0, fb); - ENTRY(0x00000708, u8, pre, "bxa5g", bxa5g, 0, fb); - ENTRY(0x00000708, u8, pre, "rssisav5g", rssisav5g, 0, fb); - ENTRY(0x00000708, u8, pre, "rssismc5g", rssismc5g, 0, fb); - ENTRY(0x00000708, u8, pre, "rssismf5g", rssismf5g, 0, fb); - ENTRY(0x00000708, u8, pre, "tri2g", tri2g, 0, fb); - ENTRY(0x00000708, u8, pre, "tri5g", tri5g, 0, fb); - ENTRY(0x00000708, u8, pre, "tri5gl", tri5gl, 0, fb); - ENTRY(0x00000708, u8, pre, "tri5gh", tri5gh, 0, fb); - ENTRY(0x00000708, s8, pre, "rxpo2g", rxpo2g, 0, fb); - ENTRY(0x00000708, s8, pre, "rxpo5g", rxpo5g, 0, fb); - ENTRY(0xfffffff0, u8, pre, "txchain", txchain, 0xf, fb); - ENTRY(0xfffffff0, u8, pre, "rxchain", rxchain, 0xf, fb); - ENTRY(0xfffffff0, u8, pre, "antswitch", antswitch, 0xff, fb); - ENTRY(0x00000700, u8, pre, "tssipos2g", fem.ghz2.tssipos, 0, fb); - ENTRY(0x00000700, u8, pre, "extpagain2g", fem.ghz2.extpa_gain, 0, fb); - ENTRY(0x00000700, u8, pre, "pdetrange2g", fem.ghz2.pdet_range, 0, fb); - ENTRY(0x00000700, u8, pre, "triso2g", fem.ghz2.tr_iso, 0, fb); - ENTRY(0x00000700, u8, pre, "antswctl2g", fem.ghz2.antswlut, 0, fb); - ENTRY(0x00000700, u8, pre, "tssipos5g", fem.ghz5.tssipos, 0, fb); - ENTRY(0x00000700, u8, pre, "extpagain5g", fem.ghz5.extpa_gain, 0, fb); - ENTRY(0x00000700, u8, pre, "pdetrange5g", fem.ghz5.pdet_range, 0, fb); - ENTRY(0x00000700, u8, pre, "triso5g", fem.ghz5.tr_iso, 0, fb); - ENTRY(0x00000700, u8, pre, "antswctl5g", fem.ghz5.antswlut, 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid2ga0", txpid2g[0], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid2ga1", txpid2g[1], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid2ga2", txpid2g[2], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid2ga3", txpid2g[3], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5ga0", txpid5g[0], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5ga1", txpid5g[1], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5ga2", txpid5g[2], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5ga3", txpid5g[3], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5gla0", txpid5gl[0], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5gla1", txpid5gl[1], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5gla2", txpid5gl[2], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5gla3", txpid5gl[3], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5gha0", txpid5gh[0], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5gha1", txpid5gh[1], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5gha2", txpid5gh[2], 0, fb); - ENTRY(0x000000f0, u8, pre, "txpid5gha3", txpid5gh[3], 0, fb); - - ENTRY(0xffffff00, u8, pre, "tempthresh", tempthresh, 0, fb); - ENTRY(0xffffff00, u8, pre, "tempoffset", tempoffset, 0, fb); - ENTRY(0xffffff00, u16, pre, "rawtempsense", rawtempsense, 0, fb); - ENTRY(0xffffff00, u8, pre, "measpower", measpower, 0, fb); - ENTRY(0xffffff00, u8, pre, "tempsense_slope", tempsense_slope, 0, fb); - ENTRY(0xffffff00, u8, pre, "tempcorrx", tempcorrx, 0, fb); - ENTRY(0xffffff00, u8, pre, "tempsense_option", tempsense_option, 0, fb); - ENTRY(0x00000700, u8, pre, "freqoffset_corr", freqoffset_corr, 0, fb); - ENTRY(0x00000700, u8, pre, "iqcal_swp_dis", iqcal_swp_dis, 0, fb); - ENTRY(0x00000700, u8, pre, "hw_iqcal_en", hw_iqcal_en, 0, fb); - ENTRY(0x00000700, u8, pre, "elna2g", elna2g, 0, fb); - ENTRY(0x00000700, u8, pre, "elna5g", elna5g, 0, fb); - ENTRY(0xffffff00, u8, pre, "phycal_tempdelta", phycal_tempdelta, 0, fb); - ENTRY(0xffffff00, u8, pre, "temps_period", temps_period, 0, fb); - ENTRY(0xffffff00, u8, pre, "temps_hysteresis", temps_hysteresis, 0, fb); - ENTRY(0xffffff00, u8, pre, "measpower1", 
measpower1, 0, fb); - ENTRY(0xffffff00, u8, pre, "measpower2", measpower2, 0, fb); - - ENTRY(0x000001f0, u16, pre, "cck2gpo", cck2gpo, 0, fb); - ENTRY(0x000001f0, u32, pre, "ofdm2gpo", ofdm2gpo, 0, fb); - ENTRY(0x000001f0, u32, pre, "ofdm5gpo", ofdm5gpo, 0, fb); - ENTRY(0x000001f0, u32, pre, "ofdm5glpo", ofdm5glpo, 0, fb); - ENTRY(0x000001f0, u32, pre, "ofdm5ghpo", ofdm5ghpo, 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs2gpo0", mcs2gpo[0], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs2gpo1", mcs2gpo[1], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs2gpo2", mcs2gpo[2], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs2gpo3", mcs2gpo[3], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs2gpo4", mcs2gpo[4], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs2gpo5", mcs2gpo[5], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs2gpo6", mcs2gpo[6], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs2gpo7", mcs2gpo[7], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5gpo0", mcs5gpo[0], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5gpo1", mcs5gpo[1], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5gpo2", mcs5gpo[2], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5gpo3", mcs5gpo[3], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5gpo4", mcs5gpo[4], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5gpo5", mcs5gpo[5], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5gpo6", mcs5gpo[6], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5gpo7", mcs5gpo[7], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5glpo0", mcs5glpo[0], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5glpo1", mcs5glpo[1], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5glpo2", mcs5glpo[2], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5glpo3", mcs5glpo[3], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5glpo4", mcs5glpo[4], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5glpo5", mcs5glpo[5], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5glpo6", mcs5glpo[6], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5glpo7", mcs5glpo[7], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5ghpo0", mcs5ghpo[0], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5ghpo1", mcs5ghpo[1], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5ghpo2", mcs5ghpo[2], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5ghpo3", mcs5ghpo[3], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5ghpo4", mcs5ghpo[4], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5ghpo5", mcs5ghpo[5], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5ghpo6", mcs5ghpo[6], 0, fb); - ENTRY(0x000001f0, u16, pre, "mcs5ghpo7", mcs5ghpo[7], 0, fb); - ENTRY(0x000001f0, u16, pre, "cddpo", cddpo, 0, fb); - ENTRY(0x000001f0, u16, pre, "stbcpo", stbcpo, 0, fb); - ENTRY(0x000001f0, u16, pre, "bw40po", bw40po, 0, fb); - ENTRY(0x000001f0, u16, pre, "bwduppo", bwduppo, 0, fb); - - ENTRY(0xfffffe00, u16, pre, "cckbw202gpo", cckbw202gpo, 0, fb); - ENTRY(0xfffffe00, u16, pre, "cckbw20ul2gpo", cckbw20ul2gpo, 0, fb); - ENTRY(0x00000600, u32, pre, "legofdmbw202gpo", legofdmbw202gpo, 0, fb); - ENTRY(0x00000600, u32, pre, "legofdmbw20ul2gpo", legofdmbw20ul2gpo, 0, fb); - ENTRY(0x00000600, u32, pre, "legofdmbw205glpo", legofdmbw205glpo, 0, fb); - ENTRY(0x00000600, u32, pre, "legofdmbw20ul5glpo", legofdmbw20ul5glpo, 0, fb); - ENTRY(0x00000600, u32, pre, "legofdmbw205gmpo", legofdmbw205gmpo, 0, fb); - ENTRY(0x00000600, u32, pre, "legofdmbw20ul5gmpo", legofdmbw20ul5gmpo, 0, fb); - ENTRY(0x00000600, u32, pre, "legofdmbw205ghpo", legofdmbw205ghpo, 0, fb); - ENTRY(0x00000600, u32, pre, "legofdmbw20ul5ghpo", legofdmbw20ul5ghpo, 0, fb); - ENTRY(0xfffffe00, u32, pre, "mcsbw202gpo", mcsbw202gpo, 0, fb); - ENTRY(0x00000600, u32, pre, "mcsbw20ul2gpo", mcsbw20ul2gpo, 0, fb); - ENTRY(0xfffffe00, u32, pre, 
"mcsbw402gpo", mcsbw402gpo, 0, fb); - ENTRY(0xfffffe00, u32, pre, "mcsbw205glpo", mcsbw205glpo, 0, fb); - ENTRY(0x00000600, u32, pre, "mcsbw20ul5glpo", mcsbw20ul5glpo, 0, fb); - ENTRY(0xfffffe00, u32, pre, "mcsbw405glpo", mcsbw405glpo, 0, fb); - ENTRY(0xfffffe00, u32, pre, "mcsbw205gmpo", mcsbw205gmpo, 0, fb); - ENTRY(0x00000600, u32, pre, "mcsbw20ul5gmpo", mcsbw20ul5gmpo, 0, fb); - ENTRY(0xfffffe00, u32, pre, "mcsbw405gmpo", mcsbw405gmpo, 0, fb); - ENTRY(0xfffffe00, u32, pre, "mcsbw205ghpo", mcsbw205ghpo, 0, fb); - ENTRY(0x00000600, u32, pre, "mcsbw20ul5ghpo", mcsbw20ul5ghpo, 0, fb); - ENTRY(0xfffffe00, u32, pre, "mcsbw405ghpo", mcsbw405ghpo, 0, fb); - ENTRY(0x00000600, u16, pre, "mcs32po", mcs32po, 0, fb); - ENTRY(0x00000600, u16, pre, "legofdm40duppo", legofdm40duppo, 0, fb); - ENTRY(0x00000700, u8, pre, "pcieingress_war", pcieingress_war, 0, fb); - - /* TODO: rev 11 support */ - ENTRY(0x00000700, u8, pre, "rxgainerr2ga0", rxgainerr2ga[0], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr2ga1", rxgainerr2ga[1], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr2ga2", rxgainerr2ga[2], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gla0", rxgainerr5gla[0], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gla1", rxgainerr5gla[1], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gla2", rxgainerr5gla[2], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gma0", rxgainerr5gma[0], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gma1", rxgainerr5gma[1], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gma2", rxgainerr5gma[2], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gha0", rxgainerr5gha[0], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gha1", rxgainerr5gha[1], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gha2", rxgainerr5gha[2], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gua0", rxgainerr5gua[0], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gua1", rxgainerr5gua[1], 0, fb); - ENTRY(0x00000700, u8, pre, "rxgainerr5gua2", rxgainerr5gua[2], 0, fb); - - ENTRY(0xfffffe00, u8, pre, "sar2g", sar2g, 0, fb); - ENTRY(0xfffffe00, u8, pre, "sar5g", sar5g, 0, fb); - - /* TODO: rev 11 support */ - ENTRY(0x00000700, u8, pre, "noiselvl2ga0", noiselvl2ga[0], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl2ga1", noiselvl2ga[1], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl2ga2", noiselvl2ga[2], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gla0", noiselvl5gla[0], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gla1", noiselvl5gla[1], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gla2", noiselvl5gla[2], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gma0", noiselvl5gma[0], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gma1", noiselvl5gma[1], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gma2", noiselvl5gma[2], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gha0", noiselvl5gha[0], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gha1", noiselvl5gha[1], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gha2", noiselvl5gha[2], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gua0", noiselvl5gua[0], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gua1", noiselvl5gua[1], 0, fb); - ENTRY(0x00000700, u8, pre, "noiselvl5gua2", noiselvl5gua[2], 0, fb); -} -#undef ENTRY /* It's specififc, uses local variable, don't use it (again). 
*/ - -static void bcm47xx_fill_sprom_path_r4589(struct ssb_sprom *sprom, - const char *prefix, bool fallback) -{ - char postfix[2]; - int i; - - for (i = 0; i < ARRAY_SIZE(sprom->core_pwr_info); i++) { - struct ssb_sprom_core_pwr_info *pwr_info; - - pwr_info = &sprom->core_pwr_info[i]; - - snprintf(postfix, sizeof(postfix), "%i", i); - nvram_read_u8(prefix, postfix, "maxp2ga", - &pwr_info->maxpwr_2g, 0, fallback); - nvram_read_u8(prefix, postfix, "itt2ga", - &pwr_info->itssi_2g, 0, fallback); - nvram_read_u8(prefix, postfix, "itt5ga", - &pwr_info->itssi_5g, 0, fallback); - nvram_read_u16(prefix, postfix, "pa2gw0a", - &pwr_info->pa_2g[0], 0, fallback); - nvram_read_u16(prefix, postfix, "pa2gw1a", - &pwr_info->pa_2g[1], 0, fallback); - nvram_read_u16(prefix, postfix, "pa2gw2a", - &pwr_info->pa_2g[2], 0, fallback); - nvram_read_u8(prefix, postfix, "maxp5ga", - &pwr_info->maxpwr_5g, 0, fallback); - nvram_read_u8(prefix, postfix, "maxp5gha", - &pwr_info->maxpwr_5gh, 0, fallback); - nvram_read_u8(prefix, postfix, "maxp5gla", - &pwr_info->maxpwr_5gl, 0, fallback); - nvram_read_u16(prefix, postfix, "pa5gw0a", - &pwr_info->pa_5g[0], 0, fallback); - nvram_read_u16(prefix, postfix, "pa5gw1a", - &pwr_info->pa_5g[1], 0, fallback); - nvram_read_u16(prefix, postfix, "pa5gw2a", - &pwr_info->pa_5g[2], 0, fallback); - nvram_read_u16(prefix, postfix, "pa5glw0a", - &pwr_info->pa_5gl[0], 0, fallback); - nvram_read_u16(prefix, postfix, "pa5glw1a", - &pwr_info->pa_5gl[1], 0, fallback); - nvram_read_u16(prefix, postfix, "pa5glw2a", - &pwr_info->pa_5gl[2], 0, fallback); - nvram_read_u16(prefix, postfix, "pa5ghw0a", - &pwr_info->pa_5gh[0], 0, fallback); - nvram_read_u16(prefix, postfix, "pa5ghw1a", - &pwr_info->pa_5gh[1], 0, fallback); - nvram_read_u16(prefix, postfix, "pa5ghw2a", - &pwr_info->pa_5gh[2], 0, fallback); - } -} - -static void bcm47xx_fill_sprom_path_r45(struct ssb_sprom *sprom, - const char *prefix, bool fallback) -{ - char postfix[2]; - int i; - - for (i = 0; i < ARRAY_SIZE(sprom->core_pwr_info); i++) { - struct ssb_sprom_core_pwr_info *pwr_info; - - pwr_info = &sprom->core_pwr_info[i]; - - snprintf(postfix, sizeof(postfix), "%i", i); - nvram_read_u16(prefix, postfix, "pa2gw3a", - &pwr_info->pa_2g[3], 0, fallback); - nvram_read_u16(prefix, postfix, "pa5gw3a", - &pwr_info->pa_5g[3], 0, fallback); - nvram_read_u16(prefix, postfix, "pa5glw3a", - &pwr_info->pa_5gl[3], 0, fallback); - nvram_read_u16(prefix, postfix, "pa5ghw3a", - &pwr_info->pa_5gh[3], 0, fallback); - } -} - -static bool bcm47xx_is_valid_mac(u8 *mac) -{ - return mac && !(mac[0] == 0x00 && mac[1] == 0x90 && mac[2] == 0x4c); -} - -static int bcm47xx_increase_mac_addr(u8 *mac, u8 num) -{ - u8 *oui = mac + ETH_ALEN/2 - 1; - u8 *p = mac + ETH_ALEN - 1; - - do { - (*p) += num; - if (*p > num) - break; - p--; - num = 1; - } while (p != oui); - - if (p == oui) { - pr_err("unable to fetch mac address\n"); - return -ENOENT; - } - return 0; -} - -static int mac_addr_used = 2; - -static void bcm47xx_fill_sprom_ethernet(struct ssb_sprom *sprom, - const char *prefix, bool fallback) -{ - bool fb = fallback; - - nvram_read_macaddr(prefix, "et0macaddr", sprom->et0mac, fallback); - nvram_read_u8(prefix, NULL, "et0mdcport", &sprom->et0mdcport, 0, - fallback); - nvram_read_u8(prefix, NULL, "et0phyaddr", &sprom->et0phyaddr, 0, - fallback); - - nvram_read_macaddr(prefix, "et1macaddr", sprom->et1mac, fallback); - nvram_read_u8(prefix, NULL, "et1mdcport", &sprom->et1mdcport, 0, - fallback); - nvram_read_u8(prefix, NULL, "et1phyaddr", &sprom->et1phyaddr, 0, - 
fallback); - - nvram_read_macaddr(prefix, "et2macaddr", sprom->et2mac, fb); - nvram_read_u8(prefix, NULL, "et2mdcport", &sprom->et2mdcport, 0, fb); - nvram_read_u8(prefix, NULL, "et2phyaddr", &sprom->et2phyaddr, 0, fb); - - nvram_read_macaddr(prefix, "macaddr", sprom->il0mac, fallback); - nvram_read_macaddr(prefix, "il0macaddr", sprom->il0mac, fallback); - - /* The address prefix 00:90:4C is used by Broadcom in their initial - * configuration. When a mac address with the prefix 00:90:4C is used - * all devices from the same series are sharing the same mac address. - * To prevent mac address collisions we replace them with a mac address - * based on the base address. - */ - if (!bcm47xx_is_valid_mac(sprom->il0mac)) { - u8 mac[6]; - - nvram_read_macaddr(NULL, "et0macaddr", mac, false); - if (bcm47xx_is_valid_mac(mac)) { - int err = bcm47xx_increase_mac_addr(mac, mac_addr_used); - - if (!err) { - ether_addr_copy(sprom->il0mac, mac); - mac_addr_used++; - } - } - } -} - -static void bcm47xx_fill_board_data(struct ssb_sprom *sprom, const char *prefix, - bool fallback) -{ - nvram_read_u32_2(prefix, "boardflags", &sprom->boardflags_lo, - &sprom->boardflags_hi, fallback); - nvram_read_u32_2(prefix, "boardflags2", &sprom->boardflags2_lo, - &sprom->boardflags2_hi, fallback); -} - -void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix, - bool fallback) -{ - bcm47xx_fill_sprom_ethernet(sprom, prefix, fallback); - bcm47xx_fill_board_data(sprom, prefix, fallback); - - nvram_read_u8(prefix, NULL, "sromrev", &sprom->revision, 0, fallback); - - /* Entries requiring custom functions */ - nvram_read_alpha2(prefix, "ccode", sprom->alpha2, fallback); - if (sprom->revision >= 3) - nvram_read_leddc(prefix, "leddc", &sprom->leddc_on_time, - &sprom->leddc_off_time, fallback); - - switch (sprom->revision) { - case 4: - case 5: - bcm47xx_fill_sprom_path_r4589(sprom, prefix, fallback); - bcm47xx_fill_sprom_path_r45(sprom, prefix, fallback); - break; - case 8: - case 9: - bcm47xx_fill_sprom_path_r4589(sprom, prefix, fallback); - break; - } - - bcm47xx_sprom_fill_auto(sprom, prefix, fallback); -} - -#if defined(CONFIG_BCM47XX_SSB) -static int bcm47xx_get_sprom_ssb(struct ssb_bus *bus, struct ssb_sprom *out) -{ - char prefix[10]; - - switch (bus->bustype) { - case SSB_BUSTYPE_SSB: - bcm47xx_fill_sprom(out, NULL, false); - return 0; - case SSB_BUSTYPE_PCI: - memset(out, 0, sizeof(struct ssb_sprom)); - snprintf(prefix, sizeof(prefix), "pci/%u/%u/", - bus->host_pci->bus->number + 1, - PCI_SLOT(bus->host_pci->devfn)); - bcm47xx_fill_sprom(out, prefix, false); - return 0; - default: - pr_warn("Unable to fill SPROM for given bustype.\n"); - return -EINVAL; - } -} -#endif - -#if defined(CONFIG_BCM47XX_BCMA) -/* - * Having many NVRAM entries for PCI devices led to repeating prefixes like - * pci/1/1/ all the time and wasting flash space. So at some point Broadcom - * decided to introduce prefixes like 0: 1: 2: etc. - * If we find e.g. devpath0=pci/2/1 or devpath0=pci/2/1/ we should use 0: - * instead of pci/2/1/. 
- */ -static void bcm47xx_sprom_apply_prefix_alias(char *prefix, size_t prefix_size) -{ - size_t prefix_len = strlen(prefix); - size_t short_len = prefix_len - 1; - char nvram_var[10]; - char buf[20]; - int i; - - /* Passed prefix has to end with a slash */ - if (prefix_len <= 0 || prefix[prefix_len - 1] != '/') - return; - - for (i = 0; i < 3; i++) { - if (snprintf(nvram_var, sizeof(nvram_var), "devpath%d", i) <= 0) - continue; - if (bcm47xx_nvram_getenv(nvram_var, buf, sizeof(buf)) < 0) - continue; - if (!strcmp(buf, prefix) || - (short_len && strlen(buf) == short_len && !strncmp(buf, prefix, short_len))) { - snprintf(prefix, prefix_size, "%d:", i); - return; - } - } -} - -static int bcm47xx_get_sprom_bcma(struct bcma_bus *bus, struct ssb_sprom *out) -{ - struct bcma_boardinfo *binfo = &bus->boardinfo; - struct bcma_device *core; - char buf[10]; - char *prefix; - bool fallback = false; - - switch (bus->hosttype) { - case BCMA_HOSTTYPE_PCI: - memset(out, 0, sizeof(struct ssb_sprom)); - /* On BCM47XX all PCI buses share the same domain */ - if (config_enabled(CONFIG_BCM47XX)) - snprintf(buf, sizeof(buf), "pci/%u/%u/", - bus->host_pci->bus->number + 1, - PCI_SLOT(bus->host_pci->devfn)); - else - snprintf(buf, sizeof(buf), "pci/%u/%u/", - pci_domain_nr(bus->host_pci->bus) + 1, - bus->host_pci->bus->number); - bcm47xx_sprom_apply_prefix_alias(buf, sizeof(buf)); - prefix = buf; - break; - case BCMA_HOSTTYPE_SOC: - memset(out, 0, sizeof(struct ssb_sprom)); - core = bcma_find_core(bus, BCMA_CORE_80211); - if (core) { - snprintf(buf, sizeof(buf), "sb/%u/", - core->core_index); - prefix = buf; - fallback = true; - } else { - prefix = NULL; - } - break; - default: - pr_warn("Unable to fill SPROM for given bustype.\n"); - return -EINVAL; - } - - nvram_read_u16(prefix, NULL, "boardvendor", &binfo->vendor, 0, true); - if (!binfo->vendor) - binfo->vendor = SSB_BOARDVENDOR_BCM; - nvram_read_u16(prefix, NULL, "boardtype", &binfo->type, 0, true); - - bcm47xx_fill_sprom(out, prefix, fallback); - - return 0; -} -#endif - -/* - * On bcm47xx we need to register SPROM fallback handler very early, so we can't - * use anything like platform device / driver for this. - */ -void bcm47xx_sprom_register_fallbacks(void) -{ -#if defined(CONFIG_BCM47XX_SSB) - if (ssb_arch_register_fallback_sprom(&bcm47xx_get_sprom_ssb)) - pr_warn("Failed to register ssb SPROM handler\n"); -#endif - -#if defined(CONFIG_BCM47XX_BCMA) - if (bcma_arch_register_fallback_sprom(&bcm47xx_get_sprom_bcma)) - pr_warn("Failed to register bcma SPROM handler\n"); -#endif -} diff --git a/drivers/firmware/broadcom/Kconfig b/drivers/firmware/broadcom/Kconfig index 6bed119930dd..3c7e5b741e37 100644 --- a/drivers/firmware/broadcom/Kconfig +++ b/drivers/firmware/broadcom/Kconfig @@ -9,3 +9,14 @@ config BCM47XX_NVRAM This driver provides an easy way to get value of requested parameter. It simply reads content of NVRAM and parses it. It doesn't control any hardware part itself. + +config BCM47XX_SPROM + bool "Broadcom SPROM driver" + depends on BCM47XX_NVRAM + help + Broadcom devices store configuration data in SPROM. Accessing it is + specific to the bus host type, e.g. PCI(e) devices have it mapped in + a PCI BAR. + In case of SoC devices SPROM content is stored on a flash used by + bootloader firmware CFE. This driver provides method to ssb and bcma + drivers to read SPROM on SoC. 
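Aside (illustrative sketch, not part of the patch): the ssb/bcma fallback mechanism named in the Kconfig help text above is consumed exactly like the handlers defined in this driver. A hypothetical SoC platform could register its own bcma handler that parses NVRAM into the shared struct ssb_sprom via the driver's bcm47xx_fill_sprom() helper; the example_* names below are invented for illustration:

	/* Hypothetical consumer: register a bcma SPROM fallback that fills
	 * the shared struct ssb_sprom from NVRAM. On SoC there is no
	 * per-device NVRAM prefix and no fallback key lookup is needed.
	 */
	static int example_get_sprom(struct bcma_bus *bus, struct ssb_sprom *out)
	{
		memset(out, 0, sizeof(*out));
		bcm47xx_fill_sprom(out, NULL, false);
		return 0;
	}

	static int __init example_sprom_init(void)
	{
		return bcma_arch_register_fallback_sprom(&example_get_sprom);
	}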
diff --git a/drivers/firmware/broadcom/Makefile b/drivers/firmware/broadcom/Makefile index d0e683583cd6..f93efc479b8b 100644 --- a/drivers/firmware/broadcom/Makefile +++ b/drivers/firmware/broadcom/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_BCM47XX_NVRAM) += bcm47xx_nvram.o +obj-$(CONFIG_BCM47XX_SPROM) += bcm47xx_sprom.o diff --git a/drivers/firmware/broadcom/bcm47xx_sprom.c b/drivers/firmware/broadcom/bcm47xx_sprom.c new file mode 100644 index 000000000000..b6eb875d4af3 --- /dev/null +++ b/drivers/firmware/broadcom/bcm47xx_sprom.c @@ -0,0 +1,737 @@ +/* + * Copyright (C) 2004 Florian Schirmer + * Copyright (C) 2006 Felix Fietkau + * Copyright (C) 2006 Michael Buesch + * Copyright (C) 2010 Waldemar Brodkorb + * Copyright (C) 2010-2012 Hauke Mehrtens + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN + * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include + +static void create_key(const char *prefix, const char *postfix, + const char *name, char *buf, int len) +{ + if (prefix && postfix) + snprintf(buf, len, "%s%s%s", prefix, name, postfix); + else if (prefix) + snprintf(buf, len, "%s%s", prefix, name); + else if (postfix) + snprintf(buf, len, "%s%s", name, postfix); + else + snprintf(buf, len, "%s", name); +} + +static int get_nvram_var(const char *prefix, const char *postfix, + const char *name, char *buf, int len, bool fallback) +{ + char key[40]; + int err; + + create_key(prefix, postfix, name, key, sizeof(key)); + + err = bcm47xx_nvram_getenv(key, buf, len); + if (fallback && err == -ENOENT && prefix) { + create_key(NULL, postfix, name, key, sizeof(key)); + err = bcm47xx_nvram_getenv(key, buf, len); + } + return err; +} + +#define NVRAM_READ_VAL(type) \ +static void nvram_read_ ## type(const char *prefix, \ + const char *postfix, const char *name, \ + type *val, type allset, bool fallback) \ +{ \ + char buf[100]; \ + int err; \ + type var; \ + \ + err = get_nvram_var(prefix, postfix, name, buf, sizeof(buf), \ + fallback); \ + if (err < 0) \ + return; \ + err = kstrto ## type(strim(buf), 0, &var); \ + if (err) { \ + pr_warn("can not parse nvram name %s%s%s with value %s got %i\n", \ + prefix, name, postfix, buf, err); \ + return; \ + } \ + if (allset && var == allset) \ + return; \ + *val = var; \ +} + +NVRAM_READ_VAL(u8) +NVRAM_READ_VAL(s8) +NVRAM_READ_VAL(u16) +NVRAM_READ_VAL(u32) + +#undef NVRAM_READ_VAL + +static void nvram_read_u32_2(const char *prefix, const char *name, + u16 *val_lo, u16 *val_hi, bool fallback) +{ + char buf[100]; + int err; + u32 val; + + err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback); + if (err < 0) + return; + err = kstrtou32(strim(buf), 0, &val); + if (err) { + pr_warn("can not parse nvram name %s%s with value %s got %i\n", + prefix, name, buf, err); + return; + } + *val_lo = (val & 0x0000FFFFU); + *val_hi = (val & 0xFFFF0000U) >> 16; +} + +static void nvram_read_leddc(const char *prefix, const char *name, + u8 *leddc_on_time, u8 *leddc_off_time, + bool fallback) +{ + char buf[100]; + int err; + u32 val; + + err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback); + if (err < 0) + return; + err = kstrtou32(strim(buf), 0, &val); + if (err) { + pr_warn("can not parse nvram name %s%s with value %s got %i\n", + prefix, name, buf, err); + return; + } + + if (val == 0xffff || val == 0xffffffff) + return; + + *leddc_on_time = val & 0xff; + *leddc_off_time = (val >> 16) & 0xff; +} + +static void bcm47xx_nvram_parse_macaddr(char *buf, u8 macaddr[6]) +{ + if (strchr(buf, ':')) + sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", &macaddr[0], + &macaddr[1], &macaddr[2], &macaddr[3], &macaddr[4], + &macaddr[5]); + else if (strchr(buf, '-')) + sscanf(buf, "%hhx-%hhx-%hhx-%hhx-%hhx-%hhx", &macaddr[0], + &macaddr[1], &macaddr[2], &macaddr[3], &macaddr[4], + &macaddr[5]); + else + pr_warn("Can not parse mac address: %s\n", buf); +} + +static void nvram_read_macaddr(const char *prefix, const char *name, + u8 val[6], bool fallback) +{ + char buf[100]; + int err; + + err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback); + if (err < 0) + return; + + bcm47xx_nvram_parse_macaddr(buf, val); +} + +static void nvram_read_alpha2(const char *prefix, const char *name, + char val[2], bool fallback) +{ + char buf[10]; + int err; + + err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback); + if (err < 0) + return; + if (buf[0] == 
'0') + return; + if (strlen(buf) > 2) { + pr_warn("alpha2 is too long %s\n", buf); + return; + } + memcpy(val, buf, 2); +} + +/* This is one-function-only macro, it uses local "sprom" variable! */ +#define ENTRY(_revmask, _type, _prefix, _name, _val, _allset, _fallback) \ + if (_revmask & BIT(sprom->revision)) \ + nvram_read_ ## _type(_prefix, NULL, _name, &sprom->_val, \ + _allset, _fallback) +/* + * Special version of filling function that can be safely called for any SPROM + * revision. For every NVRAM to SPROM mapping it contains bitmask of revisions + * for which the mapping is valid. + * It obviously requires some hexadecimal/bitmasks knowledge, but allows + * writing cleaner code (easy revisions handling). + * Note that while SPROM revision 0 was never used, we still keep BIT(0) + * reserved for it, just to keep numbering sane. + */ +static void bcm47xx_sprom_fill_auto(struct ssb_sprom *sprom, + const char *prefix, bool fallback) +{ + const char *pre = prefix; + bool fb = fallback; + + /* Broadcom extracts it for rev 8+ but it was found on 2 and 4 too */ + ENTRY(0xfffffffe, u16, pre, "devid", dev_id, 0, fallback); + + ENTRY(0xfffffffe, u16, pre, "boardrev", board_rev, 0, true); + ENTRY(0xfffffffe, u32, pre, "boardflags", boardflags, 0, fb); + ENTRY(0xfffffff0, u32, pre, "boardflags2", boardflags2, 0, fb); + ENTRY(0xfffff800, u32, pre, "boardflags3", boardflags3, 0, fb); + ENTRY(0x00000002, u16, pre, "boardflags", boardflags_lo, 0, fb); + ENTRY(0xfffffffc, u16, pre, "boardtype", board_type, 0, true); + ENTRY(0xfffffffe, u16, pre, "boardnum", board_num, 0, fb); + ENTRY(0x00000002, u8, pre, "cc", country_code, 0, fb); + ENTRY(0xfffffff8, u8, pre, "regrev", regrev, 0, fb); + + ENTRY(0xfffffffe, u8, pre, "ledbh0", gpio0, 0xff, fb); + ENTRY(0xfffffffe, u8, pre, "ledbh1", gpio1, 0xff, fb); + ENTRY(0xfffffffe, u8, pre, "ledbh2", gpio2, 0xff, fb); + ENTRY(0xfffffffe, u8, pre, "ledbh3", gpio3, 0xff, fb); + + ENTRY(0x0000070e, u16, pre, "pa0b0", pa0b0, 0, fb); + ENTRY(0x0000070e, u16, pre, "pa0b1", pa0b1, 0, fb); + ENTRY(0x0000070e, u16, pre, "pa0b2", pa0b2, 0, fb); + ENTRY(0x0000070e, u8, pre, "pa0itssit", itssi_bg, 0, fb); + ENTRY(0x0000070e, u8, pre, "pa0maxpwr", maxpwr_bg, 0, fb); + + ENTRY(0x0000070c, u8, pre, "opo", opo, 0, fb); + ENTRY(0xfffffffe, u8, pre, "aa2g", ant_available_bg, 0, fb); + ENTRY(0xfffffffe, u8, pre, "aa5g", ant_available_a, 0, fb); + ENTRY(0x000007fe, s8, pre, "ag0", antenna_gain.a0, 0, fb); + ENTRY(0x000007fe, s8, pre, "ag1", antenna_gain.a1, 0, fb); + ENTRY(0x000007f0, s8, pre, "ag2", antenna_gain.a2, 0, fb); + ENTRY(0x000007f0, s8, pre, "ag3", antenna_gain.a3, 0, fb); + + ENTRY(0x0000070e, u16, pre, "pa1b0", pa1b0, 0, fb); + ENTRY(0x0000070e, u16, pre, "pa1b1", pa1b1, 0, fb); + ENTRY(0x0000070e, u16, pre, "pa1b2", pa1b2, 0, fb); + ENTRY(0x0000070c, u16, pre, "pa1lob0", pa1lob0, 0, fb); + ENTRY(0x0000070c, u16, pre, "pa1lob1", pa1lob1, 0, fb); + ENTRY(0x0000070c, u16, pre, "pa1lob2", pa1lob2, 0, fb); + ENTRY(0x0000070c, u16, pre, "pa1hib0", pa1hib0, 0, fb); + ENTRY(0x0000070c, u16, pre, "pa1hib1", pa1hib1, 0, fb); + ENTRY(0x0000070c, u16, pre, "pa1hib2", pa1hib2, 0, fb); + ENTRY(0x0000070e, u8, pre, "pa1itssit", itssi_a, 0, fb); + ENTRY(0x0000070e, u8, pre, "pa1maxpwr", maxpwr_a, 0, fb); + ENTRY(0x0000070c, u8, pre, "pa1lomaxpwr", maxpwr_al, 0, fb); + ENTRY(0x0000070c, u8, pre, "pa1himaxpwr", maxpwr_ah, 0, fb); + + ENTRY(0x00000708, u8, pre, "bxa2g", bxa2g, 0, fb); + ENTRY(0x00000708, u8, pre, "rssisav2g", rssisav2g, 0, fb); + ENTRY(0x00000708, u8, pre, "rssismc2g", 
rssismc2g, 0, fb); + ENTRY(0x00000708, u8, pre, "rssismf2g", rssismf2g, 0, fb); + ENTRY(0x00000708, u8, pre, "bxa5g", bxa5g, 0, fb); + ENTRY(0x00000708, u8, pre, "rssisav5g", rssisav5g, 0, fb); + ENTRY(0x00000708, u8, pre, "rssismc5g", rssismc5g, 0, fb); + ENTRY(0x00000708, u8, pre, "rssismf5g", rssismf5g, 0, fb); + ENTRY(0x00000708, u8, pre, "tri2g", tri2g, 0, fb); + ENTRY(0x00000708, u8, pre, "tri5g", tri5g, 0, fb); + ENTRY(0x00000708, u8, pre, "tri5gl", tri5gl, 0, fb); + ENTRY(0x00000708, u8, pre, "tri5gh", tri5gh, 0, fb); + ENTRY(0x00000708, s8, pre, "rxpo2g", rxpo2g, 0, fb); + ENTRY(0x00000708, s8, pre, "rxpo5g", rxpo5g, 0, fb); + ENTRY(0xfffffff0, u8, pre, "txchain", txchain, 0xf, fb); + ENTRY(0xfffffff0, u8, pre, "rxchain", rxchain, 0xf, fb); + ENTRY(0xfffffff0, u8, pre, "antswitch", antswitch, 0xff, fb); + ENTRY(0x00000700, u8, pre, "tssipos2g", fem.ghz2.tssipos, 0, fb); + ENTRY(0x00000700, u8, pre, "extpagain2g", fem.ghz2.extpa_gain, 0, fb); + ENTRY(0x00000700, u8, pre, "pdetrange2g", fem.ghz2.pdet_range, 0, fb); + ENTRY(0x00000700, u8, pre, "triso2g", fem.ghz2.tr_iso, 0, fb); + ENTRY(0x00000700, u8, pre, "antswctl2g", fem.ghz2.antswlut, 0, fb); + ENTRY(0x00000700, u8, pre, "tssipos5g", fem.ghz5.tssipos, 0, fb); + ENTRY(0x00000700, u8, pre, "extpagain5g", fem.ghz5.extpa_gain, 0, fb); + ENTRY(0x00000700, u8, pre, "pdetrange5g", fem.ghz5.pdet_range, 0, fb); + ENTRY(0x00000700, u8, pre, "triso5g", fem.ghz5.tr_iso, 0, fb); + ENTRY(0x00000700, u8, pre, "antswctl5g", fem.ghz5.antswlut, 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid2ga0", txpid2g[0], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid2ga1", txpid2g[1], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid2ga2", txpid2g[2], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid2ga3", txpid2g[3], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5ga0", txpid5g[0], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5ga1", txpid5g[1], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5ga2", txpid5g[2], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5ga3", txpid5g[3], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5gla0", txpid5gl[0], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5gla1", txpid5gl[1], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5gla2", txpid5gl[2], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5gla3", txpid5gl[3], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5gha0", txpid5gh[0], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5gha1", txpid5gh[1], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5gha2", txpid5gh[2], 0, fb); + ENTRY(0x000000f0, u8, pre, "txpid5gha3", txpid5gh[3], 0, fb); + + ENTRY(0xffffff00, u8, pre, "tempthresh", tempthresh, 0, fb); + ENTRY(0xffffff00, u8, pre, "tempoffset", tempoffset, 0, fb); + ENTRY(0xffffff00, u16, pre, "rawtempsense", rawtempsense, 0, fb); + ENTRY(0xffffff00, u8, pre, "measpower", measpower, 0, fb); + ENTRY(0xffffff00, u8, pre, "tempsense_slope", tempsense_slope, 0, fb); + ENTRY(0xffffff00, u8, pre, "tempcorrx", tempcorrx, 0, fb); + ENTRY(0xffffff00, u8, pre, "tempsense_option", tempsense_option, 0, fb); + ENTRY(0x00000700, u8, pre, "freqoffset_corr", freqoffset_corr, 0, fb); + ENTRY(0x00000700, u8, pre, "iqcal_swp_dis", iqcal_swp_dis, 0, fb); + ENTRY(0x00000700, u8, pre, "hw_iqcal_en", hw_iqcal_en, 0, fb); + ENTRY(0x00000700, u8, pre, "elna2g", elna2g, 0, fb); + ENTRY(0x00000700, u8, pre, "elna5g", elna5g, 0, fb); + ENTRY(0xffffff00, u8, pre, "phycal_tempdelta", phycal_tempdelta, 0, fb); + ENTRY(0xffffff00, u8, pre, "temps_period", temps_period, 0, fb); + ENTRY(0xffffff00, u8, pre, "temps_hysteresis", temps_hysteresis, 0, fb); + ENTRY(0xffffff00, u8, 
pre, "measpower1", measpower1, 0, fb); + ENTRY(0xffffff00, u8, pre, "measpower2", measpower2, 0, fb); + + ENTRY(0x000001f0, u16, pre, "cck2gpo", cck2gpo, 0, fb); + ENTRY(0x000001f0, u32, pre, "ofdm2gpo", ofdm2gpo, 0, fb); + ENTRY(0x000001f0, u32, pre, "ofdm5gpo", ofdm5gpo, 0, fb); + ENTRY(0x000001f0, u32, pre, "ofdm5glpo", ofdm5glpo, 0, fb); + ENTRY(0x000001f0, u32, pre, "ofdm5ghpo", ofdm5ghpo, 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs2gpo0", mcs2gpo[0], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs2gpo1", mcs2gpo[1], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs2gpo2", mcs2gpo[2], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs2gpo3", mcs2gpo[3], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs2gpo4", mcs2gpo[4], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs2gpo5", mcs2gpo[5], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs2gpo6", mcs2gpo[6], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs2gpo7", mcs2gpo[7], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5gpo0", mcs5gpo[0], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5gpo1", mcs5gpo[1], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5gpo2", mcs5gpo[2], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5gpo3", mcs5gpo[3], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5gpo4", mcs5gpo[4], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5gpo5", mcs5gpo[5], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5gpo6", mcs5gpo[6], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5gpo7", mcs5gpo[7], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5glpo0", mcs5glpo[0], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5glpo1", mcs5glpo[1], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5glpo2", mcs5glpo[2], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5glpo3", mcs5glpo[3], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5glpo4", mcs5glpo[4], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5glpo5", mcs5glpo[5], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5glpo6", mcs5glpo[6], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5glpo7", mcs5glpo[7], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5ghpo0", mcs5ghpo[0], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5ghpo1", mcs5ghpo[1], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5ghpo2", mcs5ghpo[2], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5ghpo3", mcs5ghpo[3], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5ghpo4", mcs5ghpo[4], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5ghpo5", mcs5ghpo[5], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5ghpo6", mcs5ghpo[6], 0, fb); + ENTRY(0x000001f0, u16, pre, "mcs5ghpo7", mcs5ghpo[7], 0, fb); + ENTRY(0x000001f0, u16, pre, "cddpo", cddpo, 0, fb); + ENTRY(0x000001f0, u16, pre, "stbcpo", stbcpo, 0, fb); + ENTRY(0x000001f0, u16, pre, "bw40po", bw40po, 0, fb); + ENTRY(0x000001f0, u16, pre, "bwduppo", bwduppo, 0, fb); + + ENTRY(0xfffffe00, u16, pre, "cckbw202gpo", cckbw202gpo, 0, fb); + ENTRY(0xfffffe00, u16, pre, "cckbw20ul2gpo", cckbw20ul2gpo, 0, fb); + ENTRY(0x00000600, u32, pre, "legofdmbw202gpo", legofdmbw202gpo, 0, fb); + ENTRY(0x00000600, u32, pre, "legofdmbw20ul2gpo", legofdmbw20ul2gpo, 0, fb); + ENTRY(0x00000600, u32, pre, "legofdmbw205glpo", legofdmbw205glpo, 0, fb); + ENTRY(0x00000600, u32, pre, "legofdmbw20ul5glpo", legofdmbw20ul5glpo, 0, fb); + ENTRY(0x00000600, u32, pre, "legofdmbw205gmpo", legofdmbw205gmpo, 0, fb); + ENTRY(0x00000600, u32, pre, "legofdmbw20ul5gmpo", legofdmbw20ul5gmpo, 0, fb); + ENTRY(0x00000600, u32, pre, "legofdmbw205ghpo", legofdmbw205ghpo, 0, fb); + ENTRY(0x00000600, u32, pre, "legofdmbw20ul5ghpo", legofdmbw20ul5ghpo, 0, fb); + ENTRY(0xfffffe00, u32, pre, "mcsbw202gpo", mcsbw202gpo, 0, fb); + ENTRY(0x00000600, u32, pre, "mcsbw20ul2gpo", mcsbw20ul2gpo, 0, fb); + 
ENTRY(0xfffffe00, u32, pre, "mcsbw402gpo", mcsbw402gpo, 0, fb); + ENTRY(0xfffffe00, u32, pre, "mcsbw205glpo", mcsbw205glpo, 0, fb); + ENTRY(0x00000600, u32, pre, "mcsbw20ul5glpo", mcsbw20ul5glpo, 0, fb); + ENTRY(0xfffffe00, u32, pre, "mcsbw405glpo", mcsbw405glpo, 0, fb); + ENTRY(0xfffffe00, u32, pre, "mcsbw205gmpo", mcsbw205gmpo, 0, fb); + ENTRY(0x00000600, u32, pre, "mcsbw20ul5gmpo", mcsbw20ul5gmpo, 0, fb); + ENTRY(0xfffffe00, u32, pre, "mcsbw405gmpo", mcsbw405gmpo, 0, fb); + ENTRY(0xfffffe00, u32, pre, "mcsbw205ghpo", mcsbw205ghpo, 0, fb); + ENTRY(0x00000600, u32, pre, "mcsbw20ul5ghpo", mcsbw20ul5ghpo, 0, fb); + ENTRY(0xfffffe00, u32, pre, "mcsbw405ghpo", mcsbw405ghpo, 0, fb); + ENTRY(0x00000600, u16, pre, "mcs32po", mcs32po, 0, fb); + ENTRY(0x00000600, u16, pre, "legofdm40duppo", legofdm40duppo, 0, fb); + ENTRY(0x00000700, u8, pre, "pcieingress_war", pcieingress_war, 0, fb); + + /* TODO: rev 11 support */ + ENTRY(0x00000700, u8, pre, "rxgainerr2ga0", rxgainerr2ga[0], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr2ga1", rxgainerr2ga[1], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr2ga2", rxgainerr2ga[2], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gla0", rxgainerr5gla[0], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gla1", rxgainerr5gla[1], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gla2", rxgainerr5gla[2], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gma0", rxgainerr5gma[0], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gma1", rxgainerr5gma[1], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gma2", rxgainerr5gma[2], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gha0", rxgainerr5gha[0], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gha1", rxgainerr5gha[1], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gha2", rxgainerr5gha[2], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gua0", rxgainerr5gua[0], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gua1", rxgainerr5gua[1], 0, fb); + ENTRY(0x00000700, u8, pre, "rxgainerr5gua2", rxgainerr5gua[2], 0, fb); + + ENTRY(0xfffffe00, u8, pre, "sar2g", sar2g, 0, fb); + ENTRY(0xfffffe00, u8, pre, "sar5g", sar5g, 0, fb); + + /* TODO: rev 11 support */ + ENTRY(0x00000700, u8, pre, "noiselvl2ga0", noiselvl2ga[0], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl2ga1", noiselvl2ga[1], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl2ga2", noiselvl2ga[2], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gla0", noiselvl5gla[0], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gla1", noiselvl5gla[1], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gla2", noiselvl5gla[2], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gma0", noiselvl5gma[0], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gma1", noiselvl5gma[1], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gma2", noiselvl5gma[2], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gha0", noiselvl5gha[0], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gha1", noiselvl5gha[1], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gha2", noiselvl5gha[2], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gua0", noiselvl5gua[0], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gua1", noiselvl5gua[1], 0, fb); + ENTRY(0x00000700, u8, pre, "noiselvl5gua2", noiselvl5gua[2], 0, fb); +} +#undef ENTRY /* It's specififc, uses local variable, don't use it (again). 
*/ + +static void bcm47xx_fill_sprom_path_r4589(struct ssb_sprom *sprom, + const char *prefix, bool fallback) +{ + char postfix[2]; + int i; + + for (i = 0; i < ARRAY_SIZE(sprom->core_pwr_info); i++) { + struct ssb_sprom_core_pwr_info *pwr_info; + + pwr_info = &sprom->core_pwr_info[i]; + + snprintf(postfix, sizeof(postfix), "%i", i); + nvram_read_u8(prefix, postfix, "maxp2ga", + &pwr_info->maxpwr_2g, 0, fallback); + nvram_read_u8(prefix, postfix, "itt2ga", + &pwr_info->itssi_2g, 0, fallback); + nvram_read_u8(prefix, postfix, "itt5ga", + &pwr_info->itssi_5g, 0, fallback); + nvram_read_u16(prefix, postfix, "pa2gw0a", + &pwr_info->pa_2g[0], 0, fallback); + nvram_read_u16(prefix, postfix, "pa2gw1a", + &pwr_info->pa_2g[1], 0, fallback); + nvram_read_u16(prefix, postfix, "pa2gw2a", + &pwr_info->pa_2g[2], 0, fallback); + nvram_read_u8(prefix, postfix, "maxp5ga", + &pwr_info->maxpwr_5g, 0, fallback); + nvram_read_u8(prefix, postfix, "maxp5gha", + &pwr_info->maxpwr_5gh, 0, fallback); + nvram_read_u8(prefix, postfix, "maxp5gla", + &pwr_info->maxpwr_5gl, 0, fallback); + nvram_read_u16(prefix, postfix, "pa5gw0a", + &pwr_info->pa_5g[0], 0, fallback); + nvram_read_u16(prefix, postfix, "pa5gw1a", + &pwr_info->pa_5g[1], 0, fallback); + nvram_read_u16(prefix, postfix, "pa5gw2a", + &pwr_info->pa_5g[2], 0, fallback); + nvram_read_u16(prefix, postfix, "pa5glw0a", + &pwr_info->pa_5gl[0], 0, fallback); + nvram_read_u16(prefix, postfix, "pa5glw1a", + &pwr_info->pa_5gl[1], 0, fallback); + nvram_read_u16(prefix, postfix, "pa5glw2a", + &pwr_info->pa_5gl[2], 0, fallback); + nvram_read_u16(prefix, postfix, "pa5ghw0a", + &pwr_info->pa_5gh[0], 0, fallback); + nvram_read_u16(prefix, postfix, "pa5ghw1a", + &pwr_info->pa_5gh[1], 0, fallback); + nvram_read_u16(prefix, postfix, "pa5ghw2a", + &pwr_info->pa_5gh[2], 0, fallback); + } +} + +static void bcm47xx_fill_sprom_path_r45(struct ssb_sprom *sprom, + const char *prefix, bool fallback) +{ + char postfix[2]; + int i; + + for (i = 0; i < ARRAY_SIZE(sprom->core_pwr_info); i++) { + struct ssb_sprom_core_pwr_info *pwr_info; + + pwr_info = &sprom->core_pwr_info[i]; + + snprintf(postfix, sizeof(postfix), "%i", i); + nvram_read_u16(prefix, postfix, "pa2gw3a", + &pwr_info->pa_2g[3], 0, fallback); + nvram_read_u16(prefix, postfix, "pa5gw3a", + &pwr_info->pa_5g[3], 0, fallback); + nvram_read_u16(prefix, postfix, "pa5glw3a", + &pwr_info->pa_5gl[3], 0, fallback); + nvram_read_u16(prefix, postfix, "pa5ghw3a", + &pwr_info->pa_5gh[3], 0, fallback); + } +} + +static bool bcm47xx_is_valid_mac(u8 *mac) +{ + return mac && !(mac[0] == 0x00 && mac[1] == 0x90 && mac[2] == 0x4c); +} + +static int bcm47xx_increase_mac_addr(u8 *mac, u8 num) +{ + u8 *oui = mac + ETH_ALEN/2 - 1; + u8 *p = mac + ETH_ALEN - 1; + + do { + (*p) += num; + if (*p > num) + break; + p--; + num = 1; + } while (p != oui); + + if (p == oui) { + pr_err("unable to fetch mac address\n"); + return -ENOENT; + } + return 0; +} + +static int mac_addr_used = 2; + +static void bcm47xx_fill_sprom_ethernet(struct ssb_sprom *sprom, + const char *prefix, bool fallback) +{ + bool fb = fallback; + + nvram_read_macaddr(prefix, "et0macaddr", sprom->et0mac, fallback); + nvram_read_u8(prefix, NULL, "et0mdcport", &sprom->et0mdcport, 0, + fallback); + nvram_read_u8(prefix, NULL, "et0phyaddr", &sprom->et0phyaddr, 0, + fallback); + + nvram_read_macaddr(prefix, "et1macaddr", sprom->et1mac, fallback); + nvram_read_u8(prefix, NULL, "et1mdcport", &sprom->et1mdcport, 0, + fallback); + nvram_read_u8(prefix, NULL, "et1phyaddr", &sprom->et1phyaddr, 0, + 
fallback); + + nvram_read_macaddr(prefix, "et2macaddr", sprom->et2mac, fb); + nvram_read_u8(prefix, NULL, "et2mdcport", &sprom->et2mdcport, 0, fb); + nvram_read_u8(prefix, NULL, "et2phyaddr", &sprom->et2phyaddr, 0, fb); + + nvram_read_macaddr(prefix, "macaddr", sprom->il0mac, fallback); + nvram_read_macaddr(prefix, "il0macaddr", sprom->il0mac, fallback); + + /* The address prefix 00:90:4C is used by Broadcom in their initial + * configuration. When a mac address with the prefix 00:90:4C is used + * all devices from the same series are sharing the same mac address. + * To prevent mac address collisions we replace them with a mac address + * based on the base address. + */ + if (!bcm47xx_is_valid_mac(sprom->il0mac)) { + u8 mac[6]; + + nvram_read_macaddr(NULL, "et0macaddr", mac, false); + if (bcm47xx_is_valid_mac(mac)) { + int err = bcm47xx_increase_mac_addr(mac, mac_addr_used); + + if (!err) { + ether_addr_copy(sprom->il0mac, mac); + mac_addr_used++; + } + } + } +} + +static void bcm47xx_fill_board_data(struct ssb_sprom *sprom, const char *prefix, + bool fallback) +{ + nvram_read_u32_2(prefix, "boardflags", &sprom->boardflags_lo, + &sprom->boardflags_hi, fallback); + nvram_read_u32_2(prefix, "boardflags2", &sprom->boardflags2_lo, + &sprom->boardflags2_hi, fallback); +} + +void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix, + bool fallback) +{ + bcm47xx_fill_sprom_ethernet(sprom, prefix, fallback); + bcm47xx_fill_board_data(sprom, prefix, fallback); + + nvram_read_u8(prefix, NULL, "sromrev", &sprom->revision, 0, fallback); + + /* Entries requiring custom functions */ + nvram_read_alpha2(prefix, "ccode", sprom->alpha2, fallback); + if (sprom->revision >= 3) + nvram_read_leddc(prefix, "leddc", &sprom->leddc_on_time, + &sprom->leddc_off_time, fallback); + + switch (sprom->revision) { + case 4: + case 5: + bcm47xx_fill_sprom_path_r4589(sprom, prefix, fallback); + bcm47xx_fill_sprom_path_r45(sprom, prefix, fallback); + break; + case 8: + case 9: + bcm47xx_fill_sprom_path_r4589(sprom, prefix, fallback); + break; + } + + bcm47xx_sprom_fill_auto(sprom, prefix, fallback); +} + +#if IS_BUILTIN(CONFIG_SSB) && IS_ENABLED(CONFIG_SSB_SPROM) +static int bcm47xx_get_sprom_ssb(struct ssb_bus *bus, struct ssb_sprom *out) +{ + char prefix[10]; + + switch (bus->bustype) { + case SSB_BUSTYPE_SSB: + bcm47xx_fill_sprom(out, NULL, false); + return 0; + case SSB_BUSTYPE_PCI: + memset(out, 0, sizeof(struct ssb_sprom)); + snprintf(prefix, sizeof(prefix), "pci/%u/%u/", + bus->host_pci->bus->number + 1, + PCI_SLOT(bus->host_pci->devfn)); + bcm47xx_fill_sprom(out, prefix, false); + return 0; + default: + pr_warn("Unable to fill SPROM for given bustype.\n"); + return -EINVAL; + } +} +#endif + +#if IS_BUILTIN(CONFIG_BCMA) +/* + * Having many NVRAM entries for PCI devices led to repeating prefixes like + * pci/1/1/ all the time and wasting flash space. So at some point Broadcom + * decided to introduce prefixes like 0: 1: 2: etc. + * If we find e.g. devpath0=pci/2/1 or devpath0=pci/2/1/ we should use 0: + * instead of pci/2/1/. 
+ */ +static void bcm47xx_sprom_apply_prefix_alias(char *prefix, size_t prefix_size) +{ + size_t prefix_len = strlen(prefix); + size_t short_len = prefix_len - 1; + char nvram_var[10]; + char buf[20]; + int i; + + /* Passed prefix has to end with a slash */ + if (prefix_len <= 0 || prefix[prefix_len - 1] != '/') + return; + + for (i = 0; i < 3; i++) { + if (snprintf(nvram_var, sizeof(nvram_var), "devpath%d", i) <= 0) + continue; + if (bcm47xx_nvram_getenv(nvram_var, buf, sizeof(buf)) < 0) + continue; + if (!strcmp(buf, prefix) || + (short_len && strlen(buf) == short_len && !strncmp(buf, prefix, short_len))) { + snprintf(prefix, prefix_size, "%d:", i); + return; + } + } +} + +static int bcm47xx_get_sprom_bcma(struct bcma_bus *bus, struct ssb_sprom *out) +{ + struct bcma_boardinfo *binfo = &bus->boardinfo; + struct bcma_device *core; + char buf[10]; + char *prefix; + bool fallback = false; + + switch (bus->hosttype) { + case BCMA_HOSTTYPE_PCI: + memset(out, 0, sizeof(struct ssb_sprom)); + /* On BCM47XX all PCI buses share the same domain */ + if (config_enabled(CONFIG_BCM47XX)) + snprintf(buf, sizeof(buf), "pci/%u/%u/", + bus->host_pci->bus->number + 1, + PCI_SLOT(bus->host_pci->devfn)); + else + snprintf(buf, sizeof(buf), "pci/%u/%u/", + pci_domain_nr(bus->host_pci->bus) + 1, + bus->host_pci->bus->number); + bcm47xx_sprom_apply_prefix_alias(buf, sizeof(buf)); + prefix = buf; + break; + case BCMA_HOSTTYPE_SOC: + memset(out, 0, sizeof(struct ssb_sprom)); + core = bcma_find_core(bus, BCMA_CORE_80211); + if (core) { + snprintf(buf, sizeof(buf), "sb/%u/", + core->core_index); + prefix = buf; + fallback = true; + } else { + prefix = NULL; + } + break; + default: + pr_warn("Unable to fill SPROM for given bustype.\n"); + return -EINVAL; + } + + nvram_read_u16(prefix, NULL, "boardvendor", &binfo->vendor, 0, true); + if (!binfo->vendor) + binfo->vendor = SSB_BOARDVENDOR_BCM; + nvram_read_u16(prefix, NULL, "boardtype", &binfo->type, 0, true); + + bcm47xx_fill_sprom(out, prefix, fallback); + + return 0; +} +#endif + +static unsigned int bcm47xx_sprom_registered; + +/* + * On bcm47xx we need to register SPROM fallback handler very early, so we can't + * use anything like platform device / driver for this. + */ +int bcm47xx_sprom_register_fallbacks(void) +{ + if (bcm47xx_sprom_registered) + return 0; + +#if IS_BUILTIN(CONFIG_SSB) && IS_ENABLED(CONFIG_SSB_SPROM) + if (ssb_arch_register_fallback_sprom(&bcm47xx_get_sprom_ssb)) + pr_warn("Failed to register ssb SPROM handler\n"); +#endif + +#if IS_BUILTIN(CONFIG_BCMA) + if (bcma_arch_register_fallback_sprom(&bcm47xx_get_sprom_bcma)) + pr_warn("Failed to register bcma SPROM handler\n"); +#endif + + bcm47xx_sprom_registered = 1; + + return 0; +} + +fs_initcall(bcm47xx_sprom_register_fallbacks); diff --git a/include/linux/bcm47xx_sprom.h b/include/linux/bcm47xx_sprom.h new file mode 100644 index 000000000000..c06b47c84e1a --- /dev/null +++ b/include/linux/bcm47xx_sprom.h @@ -0,0 +1,24 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ */ + +#ifndef __BCM47XX_SPROM_H +#define __BCM47XX_SPROM_H + +#include +#include +#include + +#ifdef CONFIG_BCM47XX_SPROM +int bcm47xx_sprom_register_fallbacks(void); +#else +static inline int bcm47xx_sprom_register_fallbacks(void) +{ + return -ENOTSUPP; +}; +#endif + +#endif /* __BCM47XX_SPROM_H */ -- cgit v1.2.3 From 835d2b452969820fd67a755a2c01fb6e12822448 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 3 Feb 2016 03:15:28 +0000 Subject: irqchip: mips-gic: Provide VP ID accessor Provide a gic_read_local_vp_id() function to read the VCNUM field of the GIC's local VP_IDENT register. This will be used by a further patch to check that the value reported by the GIC matches up with the kernel's calculation. Signed-off-by: Paul Burton Acked-by: Jason Cooper Cc: Andrew Bresticker Cc: Jason Cooper Cc: Thomas Gleixner Cc: James Hogan Cc: Marc Zyngier Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/12334/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 8 ++++++++ include/linux/irqchip/mips-gic.h | 17 +++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index e28311f237f5..c089f49b63fb 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -246,6 +246,14 @@ void gic_stop_count(void) #endif +unsigned gic_read_local_vp_id(void) +{ + unsigned long ident; + + ident = gic_read(GIC_REG(VPE_LOCAL, GIC_VP_IDENT)); + return ident & GIC_VP_IDENT_VCNUM_MSK; +} + static bool gic_local_irq_is_routable(int intr) { u32 vpe_ctl; diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 80f89e4a29ac..81f930b0bca9 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -103,6 +103,7 @@ #define GIC_VPE_SWINT0_MAP_OFS 0x0054 #define GIC_VPE_SWINT1_MAP_OFS 0x0058 #define GIC_VPE_OTHER_ADDR_OFS 0x0080 +#define GIC_VP_IDENT_OFS 0x0088 #define GIC_VPE_WD_CONFIG0_OFS 0x0090 #define GIC_VPE_WD_COUNT0_OFS 0x0094 #define GIC_VPE_WD_INITIAL0_OFS 0x0098 @@ -211,6 +212,10 @@ #define GIC_VPE_SMASK_FDC_SHF 6 #define GIC_VPE_SMASK_FDC_MSK (MSK(1) << GIC_VPE_SMASK_FDC_SHF) +/* GIC_VP_IDENT fields */ +#define GIC_VP_IDENT_VCNUM_SHF 0 +#define GIC_VP_IDENT_VCNUM_MSK (MSK(6) << GIC_VP_IDENT_VCNUM_SHF) + /* GIC nomenclature for Core Interrupt Pins. */ #define GIC_CPU_INT0 0 /* Core Interrupt 2 */ #define GIC_CPU_INT1 1 /* . */ @@ -278,4 +283,16 @@ static inline int gic_get_usm_range(struct resource *gic_usm_res) #endif /* CONFIG_MIPS_GIC */ +/** + * gic_read_local_vp_id() - read the local VP's VCNUM + * + * Read the VCNUM of the local VP from the GIC_VP_IDENT register and + * return it to the caller. This ID should be used to refer to the VP + * via the GIC's VP-other region, or when calculating an offset to a + * bit representing the VP in interrupt masks. + * + * Return: The VCNUM value for the local VP. + */ +extern unsigned gic_read_local_vp_id(void); + #endif /* __LINUX_IRQCHIP_MIPS_GIC_H */ -- cgit v1.2.3 From e55d5312444087eb6bfb34c1cd5f6e0bf626cf26 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 11 Feb 2016 13:36:54 +0100 Subject: crash_dump: Add vmcore_elf32_check_arch parse_crash_elf{32|64}_headers will check the headers via the elf_check_arch and vmcore_elf64_check_arch macros, respectively. The MIPS architecture implements those two macros differently.
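As an aside on the mechanism being added: because include/linux/crash_dump.h only defines the macro under #ifndef, an architecture overrides it simply by providing its own definition in a header that is seen first. A minimal hypothetical sketch follows; it is not the actual MIPS definition, which lands in a later patch:

	/* In an arch header included before <linux/crash_dump.h>,
	 * e.g. arch/foo/include/asm/elf.h (path illustrative). */
	#define vmcore_elf32_check_arch(x) \
		(elf_check_arch(x) || vmcore_elf_check_arch_cross(x))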
In order to make the differentiation more explicit, let's introduce a vmcore_elf32_check_arch macro to allow the archs to override it. Signed-off-by: Daniel Wagner Suggested-by: Maciej W. Rozycki Reviewed-by: Maciej W. Rozycki Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/12535/ Signed-off-by: Ralf Baechle --- fs/proc/vmcore.c | 2 +- include/linux/crash_dump.h | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 8afe10cf7df8..8ab782d8b33d 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1071,7 +1071,7 @@ static int __init parse_crash_elf32_headers(void) /* Do some basic Verification. */ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || (ehdr.e_type != ET_CORE) || - !elf_check_arch(&ehdr) || + !vmcore_elf32_check_arch(&ehdr) || ehdr.e_ident[EI_CLASS] != ELFCLASS32|| ehdr.e_ident[EI_VERSION] != EV_CURRENT || ehdr.e_version != EV_CURRENT || diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 3849fce7ecfe..3873697ba21c 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -34,9 +34,13 @@ void vmcore_cleanup(void); /* * Architecture code can redefine this if there are any special checks - * needed for 64-bit ELF vmcores. In case of 32-bit only architecture, - * this can be set to zero. + * needed for 32-bit ELF or 64-bit ELF vmcores. In case of 32-bit + * only architecture, vmcore_elf64_check_arch can be set to zero. */ +#ifndef vmcore_elf32_check_arch +#define vmcore_elf32_check_arch(x) elf_check_arch(x) +#endif + #ifndef vmcore_elf64_check_arch #define vmcore_elf64_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x)) #endif -- cgit v1.2.3 From ca9eb49aa9562eaadf3cea071ec7018ad6800425 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 8 Feb 2016 18:43:50 +0000 Subject: SIGNAL: Move generic copy_siginfo() to signal.h The generic copy_siginfo() is currently defined in asm-generic/siginfo.h, after including uapi/asm-generic/siginfo.h which defines the generic struct siginfo. However this makes it awkward for an architecture to use it if it has to define its own struct siginfo (e.g. MIPS and potentially IA64), since it means that asm-generic/siginfo.h can only be included after defining the arch-specific siginfo, which may be problematic if the arch-specific definition needs definitions from uapi/asm-generic/siginfo.h. It is possible to work around this by first including uapi/asm-generic/siginfo.h to get the constants before defining the arch-specific siginfo, and include asm-generic/siginfo.h after. However uapi headers can't be included by other uapi headers, so that first include has to be in an ifdef __KERNEL__, with the non-__KERNEL__ case including the non-UAPI header instead. Instead of that mess, move the generic copy_siginfo() definition into linux/signal.h, which allows an arch-specific uapi/asm/siginfo.h to include asm-generic/siginfo.h and define the arch-specific siginfo, and for the generic copy_siginfo() to see that arch-specific definition.
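A hedged sketch of the include ordering this makes possible; the arch name and struct layout below are illustrative, not the real MIPS change:

	/* arch/foo/include/uapi/asm/siginfo.h (hypothetical) */
	#define HAVE_ARCH_SIGINFO_T
	#include <asm-generic/siginfo.h>	/* constants first, no struct yet */

	typedef struct siginfo {
		int si_signo;
		int si_code;	/* the arch orders fields differently */
		int si_errno;
		/* ... a _sifields union laid out per the arch ABI ... */
	} siginfo_t;

Kernel code then just includes <linux/signal.h>, and the generic copy_siginfo() moved there compiles against whichever struct siginfo the arch headers provided.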
Signed-off-by: James Hogan Cc: Arnd Bergmann Cc: Ralf Baechle Cc: Petr Malat Cc: Tony Luck Cc: Fenghua Yu Cc: Christopher Ferris Cc: linux-arch@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linux-ia64@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: # 4.0- Patchwork: https://patchwork.linux-mips.org/patch/12478/ Signed-off-by: Ralf Baechle --- include/asm-generic/siginfo.h | 15 --------------- include/linux/signal.h | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h index 3d1a3af5cf59..a2508a8f9a9c 100644 --- a/include/asm-generic/siginfo.h +++ b/include/asm-generic/siginfo.h @@ -17,21 +17,6 @@ struct siginfo; void do_schedule_next_timer(struct siginfo *info); -#ifndef HAVE_ARCH_COPY_SIGINFO - -#include - -static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) -{ - if (from->si_code < 0) - memcpy(to, from, sizeof(*to)); - else - /* _sigchld is currently the largest known union member */ - memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld)); -} - -#endif - extern int copy_siginfo_to_user(struct siginfo __user *to, const struct siginfo *from); #endif diff --git a/include/linux/signal.h b/include/linux/signal.h index 92557bbce7e7..d80259afb9e5 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -28,6 +28,21 @@ struct sigpending { sigset_t signal; }; +#ifndef HAVE_ARCH_COPY_SIGINFO + +#include + +static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) +{ + if (from->si_code < 0) + memcpy(to, from, sizeof(*to)); + else + /* _sigchld is currently the largest known union member */ + memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld)); +} + +#endif + /* * Define some primitives to manipulate sigset_t. */ -- cgit v1.2.3 From 3491caf2755e9f312666712510d80b00c81ff247 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 13 May 2016 12:16:35 +0200 Subject: KVM: halt_polling: provide a way to qualify wakeups during poll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some wakeups should not be considered a successful poll. For example on s390 I/O interrupts are usually floating, which means that _ALL_ CPUs would be considered runnable - letting all vCPUs poll all the time for transaction-like workloads, even if one vCPU would be enough. This can result in huge CPU usage for large guests. This patch lets architectures provide a way to qualify wakeups as good or bad in regard to polls. For s390 the implementation will fence off halt polling for anything but known good, single vCPU events. The s390 implementation for floating interrupts does a wakeup for one vCPU, but the interrupt will be delivered by whatever CPU checks first for a pending interrupt. We prefer the woken up CPU by marking the poll of this CPU as a "good" poll. This code will also mark several other wakeup reasons like IPI or expired timers as "good". This will of course also mark some events as not successful. As KVM on z always runs as a 2nd-level hypervisor, we prefer not to poll unless we are really sure, though. This patch successfully limits the CPU usage for cases like uperf 1-byte transactional ping-pong workloads or wakeup-heavy workloads like OLTP while still providing a proper speedup. This also introduces a new vcpu stat "halt_poll_invalid" that marks wakeups that are considered not good for polling.
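A minimal sketch of the arch contract introduced here, assuming an architecture that selects HAVE_KVM_INVALID_WAKEUPS; apart from kvm_arch_vcpu_block_finish(), the names are illustrative:

	#include <linux/kvm_host.h>

	/* Wakeup path: flag this wakeup as one that justifies polling. */
	static void myarch_vcpu_wakeup(struct kvm_vcpu *vcpu)
	{
		vcpu->valid_wakeup = true;	/* read via vcpu_valid_wakeup() */
		swake_up(&vcpu->wq);
	}

	/* Called at the end of kvm_vcpu_block(): rearm for the next halt. */
	void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu)
	{
		vcpu->valid_wakeup = false;
	}

Architectures that do not select the config symbol keep the old behaviour, since vcpu_valid_wakeup() then always returns true.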
Signed-off-by: Christian Borntraeger Acked-by: Radim Krčmář (for an earlier version) Cc: David Matlack Cc: Wanpeng Li [Rename config symbol. - Paolo] Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 2 ++ arch/mips/include/asm/kvm_host.h | 2 ++ arch/mips/kvm/mips.c | 1 + arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/kvm/book3s.c | 1 + arch/powerpc/kvm/booke.c | 1 + arch/s390/include/asm/kvm_host.h | 3 +++ arch/s390/kvm/Kconfig | 1 + arch/s390/kvm/interrupt.c | 5 +++++ arch/s390/kvm/kvm-s390.c | 6 ++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 1 + include/linux/kvm_host.h | 15 +++++++++++++++ include/trace/events/kvm.h | 11 +++++++---- virt/kvm/Kconfig | 3 +++ virt/kvm/kvm_main.c | 8 ++++++-- 17 files changed, 60 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 385070180c25..4cd8732796ab 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -187,6 +187,7 @@ struct kvm_vm_stat { struct kvm_vcpu_stat { u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 hvc_exit_stat; u64 wfe_exit_stat; @@ -282,6 +283,7 @@ static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_init_debug(void) {} static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f5c6bd2541ef..d49399d9890d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -293,6 +293,7 @@ struct kvm_vm_stat { struct kvm_vcpu_stat { u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 hvc_exit_stat; u64 wfe_exit_stat; @@ -357,6 +358,7 @@ static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} void kvm_arm_init_debug(void); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 942b8f6bf35b..9a37a1044032 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -122,6 +122,7 @@ struct kvm_vcpu_stat { u32 flush_dcache_exits; u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; }; @@ -812,5 +813,6 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} #endif /* __MIPS_KVM_HOST_H__ */ diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 23b209463238..dc052fb5c7a2 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -56,6 +56,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "flush_dcache", 
VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), KVM_STAT_VCPU }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid), KVM_STAT_VCPU }, { "halt_wakeup", VCPU_STAT(halt_wakeup), KVM_STAT_VCPU }, {NULL} }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index a07645c17818..ec35af34a3fb 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -116,6 +116,7 @@ struct kvm_vcpu_stat { u32 ext_intr_exits; u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 dbell_exits; u32 gdbell_exits; @@ -727,5 +728,6 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index b34220d2aa42..47018fcbf7d6 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -54,6 +54,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "queue_intr", VCPU_STAT(queue_intr) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "pf_storage", VCPU_STAT(pf_storage) }, { "sp_storage", VCPU_STAT(sp_storage) }, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 4d66f44a1657..4afae695899a 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -64,6 +64,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "doorbell", VCPU_STAT(dbell_exits) }, { "guest doorbell", VCPU_STAT(gdbell_exits) }, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 9282ccf1d136..53d794538067 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -247,6 +247,7 @@ struct kvm_vcpu_stat { u32 exit_instruction; u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 instruction_lctl; u32 instruction_lctlg; @@ -696,4 +697,6 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 5ea5af3c7db7..b1900239b0ab 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_INVALID_WAKEUPS select SRCU select KVM_VFIO ---help--- diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index e55040467eb5..5a80af740d3e 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -977,6 +977,11 @@ no_timer: void kvm_s390_vcpu_wakeup(struct 
kvm_vcpu *vcpu) { + /* + * We cannot move this into the if, as the CPU might be already + * in kvm_vcpu_block without having the waitqueue set (polling) + */ + vcpu->valid_wakeup = true; if (swait_active(&vcpu->wq)) { /* * The vcpu gave up the cpu voluntarily, mark it as a good diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index c597201a5ca9..6d8ec3ac9dd8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -65,6 +65,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, { "instruction_lctl", VCPU_STAT(instruction_lctl) }, @@ -2992,6 +2993,11 @@ static inline unsigned long nonhyp_mask(int i) return 0x0000ffffffffffffUL >> (nonhyp_fai << 4); } +void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) +{ + vcpu->valid_wakeup = false; +} + static int __init kvm_s390_init(void) { int i; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c66e26280707..c99494b4bdf7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -803,6 +803,7 @@ struct kvm_vcpu_stat { u32 halt_exits; u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; @@ -1342,5 +1343,6 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c774cdf553c..bcef92fc41d8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -161,6 +161,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_exits", VCPU_STAT(halt_exits) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 92a0229044fb..bbcd921d7cb0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -229,6 +229,7 @@ struct kvm_vcpu { sigset_t sigset; struct kvm_vcpu_stat stat; unsigned int halt_poll_ns; + bool valid_wakeup; #ifdef CONFIG_HAS_IOMEM int mmio_needed; @@ -1196,4 +1197,18 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ +#ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS +/* If we wake up during the poll time, was it a successful poll?
*/ +static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) +{ + return vcpu->valid_wakeup; +} + +#else +static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) +{ + return true; +} +#endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */ + #endif diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index aa69253ecc7d..526fb3d2e43a 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -38,22 +38,25 @@ TRACE_EVENT(kvm_userspace_exit, ); TRACE_EVENT(kvm_vcpu_wakeup, - TP_PROTO(__u64 ns, bool waited), - TP_ARGS(ns, waited), + TP_PROTO(__u64 ns, bool waited, bool valid), + TP_ARGS(ns, waited, valid), TP_STRUCT__entry( __field( __u64, ns ) __field( bool, waited ) + __field( bool, valid ) ), TP_fast_assign( __entry->ns = ns; __entry->waited = waited; + __entry->valid = valid; ), - TP_printk("%s time %lld ns", + TP_printk("%s time %lld ns, polling %s", __entry->waited ? "wait" : "poll", - __entry->ns) + __entry->ns, + __entry->valid ? "valid" : "invalid") ); #if defined(CONFIG_HAVE_KVM_IRQFD) diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 7a79b6853583..e5d6108f5e85 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -41,6 +41,9 @@ config KVM_VFIO config HAVE_KVM_ARCH_TLB_FLUSH_ALL bool +config HAVE_KVM_INVALID_WAKEUPS + bool + config KVM_GENERIC_DIRTYLOG_READ_PROTECT bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ed3d9bb18a56..21f6498d52e3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2028,6 +2028,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) */ if (kvm_vcpu_check_block(vcpu) < 0) { ++vcpu->stat.halt_successful_poll; + if (!vcpu_valid_wakeup(vcpu)) + ++vcpu->stat.halt_poll_invalid; goto out; } cur = ktime_get(); @@ -2057,7 +2059,8 @@ out: if (block_ns <= vcpu->halt_poll_ns) ; /* we had a long block, shrink polling */ - else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) + else if (!vcpu_valid_wakeup(vcpu) || + (vcpu->halt_poll_ns && block_ns > halt_poll_ns)) shrink_halt_poll_ns(vcpu); /* we had a short halt and our poll time is too small */ else if (vcpu->halt_poll_ns < halt_poll_ns && @@ -2066,7 +2069,8 @@ out: } else vcpu->halt_poll_ns = 0; - trace_kvm_vcpu_wakeup(block_ns, waited); + trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); + kvm_arch_vcpu_block_finish(vcpu); } EXPORT_SYMBOL_GPL(kvm_vcpu_block); -- cgit v1.2.3 From d9e4084f6c9746e51a78a4d7ebf4983023289b32 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 4 May 2016 10:53:47 -0400 Subject: svcrdma: Generalize svc_rdma_xdr_decode_req() Clean up: Pass in just the piece of the svc_rqst that is needed here. While we're in the area, add an informative documenting comment. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- net/sunrpc/xprtrdma/svc_rdma_marshal.c | 32 ++++++++++++++++++++++---------- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +- 3 files changed, 24 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 3081339968c3..d6917b896d3a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -199,7 +199,7 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct xdr_buf *rcvbuf); /* svc_rdma_marshal.c */ -extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg *, struct svc_rqst *); +extern int svc_rdma_xdr_decode_req(struct xdr_buf *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode, __be32 *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 765bca47c74d..0ba9887f3e22 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -145,19 +145,32 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) return (__be32 *)&ary->wc_array[nchunks]; } -int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp) +/** + * svc_rdma_xdr_decode_req - Parse incoming RPC-over-RDMA header + * @rq_arg: Receive buffer + * + * On entry, xdr->head[0].iov_base points to first byte in the + * RPC-over-RDMA header. + * + * On successful exit, head[0] points to first byte past the + * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. + * The length of the RPC-over-RDMA header is returned. + */ +int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) { + struct rpcrdma_msg *rmsgp; __be32 *va, *vaend; unsigned int len; u32 hdr_len; /* Verify that there's enough bytes for header + something */ - if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_ERR) { + if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) { dprintk("svcrdma: header too short = %d\n", - rqstp->rq_arg.len); + rq_arg->len); return -EINVAL; } + rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base; if (rmsgp->rm_vers != rpcrdma_version) { dprintk("%s: bad version %u\n", __func__, be32_to_cpu(rmsgp->rm_vers)); @@ -189,10 +202,10 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp) be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh); va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; - rqstp->rq_arg.head[0].iov_base = va; + rq_arg->head[0].iov_base = va; len = (u32)((unsigned long)va - (unsigned long)rmsgp); - rqstp->rq_arg.head[0].iov_len -= len; - if (len > rqstp->rq_arg.len) + rq_arg->head[0].iov_len -= len; + if (len > rq_arg->len) return -EINVAL; return len; default: @@ -205,7 +218,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp) * chunk list and a reply chunk list. 
*/ va = &rmsgp->rm_body.rm_chunks[0]; - vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); + vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len); va = decode_read_list(va, vaend); if (!va) { dprintk("svcrdma: failed to decode read list\n"); @@ -222,10 +235,9 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp) return -EINVAL; } - rqstp->rq_arg.head[0].iov_base = va; + rq_arg->head[0].iov_base = va; hdr_len = (unsigned long)va - (unsigned long)rmsgp; - rqstp->rq_arg.head[0].iov_len -= hdr_len; - + rq_arg->head[0].iov_len -= hdr_len; return hdr_len; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 1b72f351fbd3..c984b0aaecb1 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -636,7 +636,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) /* Decode the RDMA header. */ rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; - ret = svc_rdma_xdr_decode_req(rmsgp, rqstp); + ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); if (ret < 0) goto out_err; if (ret == 0) -- cgit v1.2.3 From c4263233f30e72f2645ff83c9074c994f88b015a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 23:22:50 +1000 Subject: drivers/of: Specify parent node in of_fdt_unflatten_tree() This adds one more argument to of_fdt_unflatten_tree() to specify the parent node of the FDT blob that is going to be unflattened. In the result, the function can be used to unflatten FDT blob that represents device sub-tree in PowerNV PCI hotplug driver. Cc: Jyri Sarha Signed-off-by: Gavin Shan Acked-by: Rob Herring Acked-by: Jyri Sarha Signed-off-by: Rob Herring --- drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c | 2 +- drivers/of/fdt.c | 14 ++++++++++---- drivers/of/unittest.c | 2 +- include/linux/of_fdt.h | 1 + 4 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c index 106679bca6cb..f9c79dabce20 100644 --- a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c +++ b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c @@ -157,7 +157,7 @@ struct device_node * __init tilcdc_get_overlay(struct kfree_table *kft) if (!overlay_data || kfree_table_add(kft, overlay_data)) return NULL; - of_fdt_unflatten_tree(overlay_data, &overlay); + of_fdt_unflatten_tree(overlay_data, NULL, &overlay); if (!overlay) { pr_warn("%s: Unfattening overlay tree failed\n", __func__); return NULL; diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 9c3e52d0d570..0d53687d4b8c 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -450,11 +450,13 @@ static int unflatten_dt_nodes(const void *blob, * pointers of the nodes so the normal device-tree walking functions * can be used. 
* @blob: The blob to expand + * @dad: Parent device node * @mynodes: The device_node tree created by the call * @dt_alloc: An allocator that provides a virtual address to memory * for the resulting tree */ static void __unflatten_device_tree(const void *blob, + struct device_node *dad, struct device_node **mynodes, void * (*dt_alloc)(u64 size, u64 align)) { @@ -479,7 +481,7 @@ static void __unflatten_device_tree(const void *blob, } /* First pass, scan for size */ - size = unflatten_dt_nodes(blob, NULL, NULL, NULL); + size = unflatten_dt_nodes(blob, NULL, dad, NULL); if (size < 0) return; @@ -495,7 +497,7 @@ static void __unflatten_device_tree(const void *blob, pr_debug(" unflattening %p...\n", mem); /* Second pass, do actual unflattening */ - unflatten_dt_nodes(blob, mem, NULL, mynodes); + unflatten_dt_nodes(blob, mem, dad, mynodes); if (be32_to_cpup(mem + size) != 0xdeadbeef) pr_warning("End of tree marker overwritten: %08x\n", be32_to_cpup(mem + size)); @@ -512,6 +514,9 @@ static DEFINE_MUTEX(of_fdt_unflatten_mutex); /** * of_fdt_unflatten_tree - create tree of device_nodes from flat blob + * @blob: Flat device tree blob + * @dad: Parent device node + * @mynodes: The device tree created by the call * * unflattens the device-tree passed by the firmware, creating the * tree of struct device_node. It also fills the "name" and "type" @@ -519,10 +524,11 @@ static DEFINE_MUTEX(of_fdt_unflatten_mutex); * can be used. */ void of_fdt_unflatten_tree(const unsigned long *blob, + struct device_node *dad, struct device_node **mynodes) { mutex_lock(&of_fdt_unflatten_mutex); - __unflatten_device_tree(blob, mynodes, &kernel_tree_alloc); + __unflatten_device_tree(blob, dad, mynodes, &kernel_tree_alloc); mutex_unlock(&of_fdt_unflatten_mutex); } EXPORT_SYMBOL_GPL(of_fdt_unflatten_tree); @@ -1195,7 +1201,7 @@ bool __init early_init_dt_scan(void *params) */ void __init unflatten_device_tree(void) { - __unflatten_device_tree(initial_boot_params, &of_root, + __unflatten_device_tree(initial_boot_params, NULL, &of_root, early_init_dt_alloc_memory_arch); /* Get pointer to "/chosen" and "/aliases" nodes for use everywhere */ diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index e986e6ee52e0..8c0f11c3153c 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -921,7 +921,7 @@ static int __init unittest_data_add(void) "not running tests\n", __func__); return -ENOMEM; } - of_fdt_unflatten_tree(unittest_data, &unittest_data_node); + of_fdt_unflatten_tree(unittest_data, NULL, &unittest_data_node); if (!unittest_data_node) { pr_warn("%s: No tree to attach; not running tests\n", __func__); return -ENODATA; diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 2fbe8682a66f..1bffcbd56f8e 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -38,6 +38,7 @@ extern bool of_fdt_is_big_endian(const void *blob, extern int of_fdt_match(const void *blob, unsigned long node, const char *const *compat); extern void of_fdt_unflatten_tree(const unsigned long *blob, + struct device_node *dad, struct device_node **mynodes); /* TBD: Temporary export of fdt globals - remove when code fully merged */ -- cgit v1.2.3 From 83262418b0ef8bda66eca7c72d4c24ae6f7b230e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 23:22:51 +1000 Subject: drivers/of: Return allocated memory from of_fdt_unflatten_tree() This returns the allocate memory chunk, storing the unflattened device tree, from of_fdt_unflatten_tree() so that memory chunk can be released on demand in PowerNV PCI hotplug driver. 
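Taken together, the two of_fdt_unflatten_tree() changes let a caller unflatten a blob under an existing parent node and keep the backing allocation around for later release. A hedged usage sketch, using the signature as it stands after both patches (the function name and error handling are placeholders):

	#include <linux/of_fdt.h>

	static void *attach_subtree(const unsigned long *blob,
				    struct device_node *parent,
				    struct device_node **np)
	{
		/* Nodes are created as children of "parent". */
		void *mem = of_fdt_unflatten_tree(blob, parent, np);

		if (!mem || !*np)
			pr_warn("sub-tree unflattening failed\n");
		return mem;	/* owner releases this once the nodes are gone */
	}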
Signed-off-by: Gavin Shan Acked-by: Rob Herring Signed-off-by: Rob Herring --- drivers/of/fdt.c | 33 ++++++++++++++++++++++----------- include/linux/of_fdt.h | 6 +++--- 2 files changed, 25 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 0d53687d4b8c..ef1ccee51af8 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -454,11 +454,14 @@ static int unflatten_dt_nodes(const void *blob, * @mynodes: The device_node tree created by the call * @dt_alloc: An allocator that provides a virtual address to memory * for the resulting tree + * + * Returns NULL on failure or the memory chunk containing the unflattened + * device tree on success. */ -static void __unflatten_device_tree(const void *blob, - struct device_node *dad, - struct device_node **mynodes, - void * (*dt_alloc)(u64 size, u64 align)) +static void *__unflatten_device_tree(const void *blob, + struct device_node *dad, + struct device_node **mynodes, + void *(*dt_alloc)(u64 size, u64 align)) { int size; void *mem; @@ -467,7 +470,7 @@ static void __unflatten_device_tree(const void *blob, if (!blob) { pr_debug("No device tree pointer\n"); - return; + return NULL; } pr_debug("Unflattening device tree:\n"); @@ -477,13 +480,13 @@ static void __unflatten_device_tree(const void *blob, if (fdt_check_header(blob)) { pr_err("Invalid device tree blob header\n"); - return; + return NULL; } /* First pass, scan for size */ size = unflatten_dt_nodes(blob, NULL, dad, NULL); if (size < 0) - return; + return NULL; size = ALIGN(size, 4); pr_debug(" size is %d, allocating...\n", size); @@ -503,6 +506,7 @@ static void __unflatten_device_tree(const void *blob, be32_to_cpup(mem + size)); pr_debug(" <- unflatten_device_tree()\n"); + return mem; } static void *kernel_tree_alloc(u64 size, u64 align) @@ -522,14 +526,21 @@ static DEFINE_MUTEX(of_fdt_unflatten_mutex); * tree of struct device_node. It also fills the "name" and "type" * pointers of the nodes so the normal device-tree walking functions * can be used. + * + * Returns NULL on failure or the memory chunk containing the unflattened + * device tree on success. 
*/ -void of_fdt_unflatten_tree(const unsigned long *blob, - struct device_node *dad, - struct device_node **mynodes) +void *of_fdt_unflatten_tree(const unsigned long *blob, + struct device_node *dad, + struct device_node **mynodes) { + void *mem; + mutex_lock(&of_fdt_unflatten_mutex); - __unflatten_device_tree(blob, dad, mynodes, &kernel_tree_alloc); + mem = __unflatten_device_tree(blob, dad, mynodes, &kernel_tree_alloc); mutex_unlock(&of_fdt_unflatten_mutex); + + return mem; } EXPORT_SYMBOL_GPL(of_fdt_unflatten_tree); diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 1bffcbd56f8e..901ec01c9fba 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -37,9 +37,9 @@ extern bool of_fdt_is_big_endian(const void *blob, unsigned long node); extern int of_fdt_match(const void *blob, unsigned long node, const char *const *compat); -extern void of_fdt_unflatten_tree(const unsigned long *blob, - struct device_node *dad, - struct device_node **mynodes); +extern void *of_fdt_unflatten_tree(const unsigned long *blob, + struct device_node *dad, + struct device_node **mynodes); /* TBD: Temporary export of fdt globals - remove when code fully merged */ extern int __initdata dt_root_addr_cells; -- cgit v1.2.3 From 92efb1bd9bcbdf83cc0e6cfead752d0c82f63677 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 16 May 2016 15:12:02 -0500 Subject: PCI: Identify Enhanced Allocation (EA) BAR Equivalent resources in sysfs Resource flags are exposed to userspace via the sysfs "resource" file. lspci reads the sysfs file to determine resource properties. Add a "BAR Equivalent Indicator" flag so lspci can distinguish between [virtual] and [enhanced] resources. Signed-off-by: Alex Williamson Signed-off-by: Sean O. Stalley Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 2 +- include/linux/ioport.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 25e0327d4429..7b008c7cac35 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2228,7 +2228,7 @@ void pci_pm_init(struct pci_dev *dev) static unsigned long pci_ea_flags(struct pci_dev *dev, u8 prop) { - unsigned long flags = IORESOURCE_PCI_FIXED; + unsigned long flags = IORESOURCE_PCI_FIXED | IORESOURCE_PCI_EA_BEI; switch (prop) { case PCI_EA_P_MEM: diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 0b65543dc6cf..6230064d7f95 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -26,6 +26,9 @@ struct resource { /* * IO resources have these defined flags. + * + * PCI devices expose these flags to userspace in the "resource" sysfs file, + * so don't move them. */ #define IORESOURCE_BITS 0x000000ff /* Bus-specific bits */ @@ -110,6 +113,7 @@ struct resource { /* PCI control bits. Shares IORESOURCE_BITS with above PCI ROM. */ #define IORESOURCE_PCI_FIXED (1<<4) /* Do not move resource */ +#define IORESOURCE_PCI_EA_BEI (1<<5) /* BAR Equivalent Indicator */ /* * I/O Resource Descriptors -- cgit v1.2.3 From e4b2749158631e6d74bf14d2ef3804d780e2f770 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 11 May 2016 11:58:47 +0200 Subject: DAX: move RADIX_DAX_ definitions to dax.c These don't belong in radix-tree.c any more than PAGECACHE_TAG_* do. Let's try to maintain the idea that radix-tree simply implements an abstract data type. 
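The moved macros pack an entry type and a sector number into a single exceptional radix-tree slot; a quick worked example with an arbitrary sector, using only the definitions visible in the fs/dax.c hunk that follows:

	/* Sector 42 as a PTE-sized entry: (42 << 4) | 0x6 == 0x2a6. */
	void *slot = RADIX_DAX_ENTRY(42, false);

	BUG_ON(RADIX_DAX_TYPE(slot) != RADIX_DAX_PTE);	/* low nibble: 0x4 | 2 */
	BUG_ON(RADIX_DAX_SECTOR(slot) != 42);		/* upper bits: sector */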
Acked-by: Ross Zwisler Reviewed-by: Matthew Wilcox Signed-off-by: NeilBrown Signed-off-by: Jan Kara Signed-off-by: Vishal Verma --- fs/dax.c | 9 +++++++++ include/linux/radix-tree.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 2494255c5785..7ef5aef78442 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -32,6 +32,15 @@ #include #include +#define RADIX_DAX_MASK 0xf +#define RADIX_DAX_SHIFT 4 +#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) +#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) + static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 51a97ac8bfbf..d08d6ec3bf53 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -52,15 +52,6 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 -#define RADIX_DAX_MASK 0xf -#define RADIX_DAX_SHIFT 4 -#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) -#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) -#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) - static inline int radix_tree_is_indirect_ptr(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); -- cgit v1.2.3 From 02fbd139759feb1f331cebd858523b5d774082e6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 May 2016 11:58:48 +0200 Subject: dax: Remove complete_unwritten argument Fault handlers currently take complete_unwritten argument to convert unwritten extents after PTEs are updated. However no filesystem uses this anymore as the code is racy. Remove the unused argument. 
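With the argument gone, a filesystem fault handler shrinks to passing just its get_block callback; a hedged sketch of an updated caller (the myfs names are hypothetical, mirroring the ext2/ext4 hunks below):

	static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		/* No dax_iodone_t any more: get_block must not hand back
		 * unwritten buffers (__dax_fault now WARNs if it does). */
		return dax_fault(vma, vmf, myfs_get_block);
	}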
Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Vishal Verma --- fs/block_dev.c | 4 ++-- fs/dax.c | 43 +++++++++---------------------------------- fs/ext2/file.c | 4 ++-- fs/ext4/file.c | 4 ++-- fs/xfs/xfs_file.c | 7 +++---- include/linux/dax.h | 17 +++++++---------- include/linux/fs.h | 1 - 7 files changed, 25 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 20a2c02b77c4..b25bb230b28a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1746,7 +1746,7 @@ static const struct address_space_operations def_blk_aops = { */ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return __dax_fault(vma, vmf, blkdev_get_block, NULL); + return __dax_fault(vma, vmf, blkdev_get_block); } static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma, @@ -1758,7 +1758,7 @@ static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma, static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { - return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); + return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block); } static const struct vm_operations_struct blkdev_dax_vm_ops = { diff --git a/fs/dax.c b/fs/dax.c index 7ef5aef78442..83e7894d86d8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -612,19 +612,13 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks - * @complete_unwritten: The filesystem method used to convert unwritten blocks - * to written so the data written to them is exposed. This is required for - * required by write faults for filesystems that will return unwritten - * extent mappings from @get_block, but it is optional for reads as - * dax_insert_mapping() will always zero unwritten blocks. If the fs does - * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. __dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) + get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -727,23 +721,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, page = NULL; } - /* - * If we successfully insert the new mapping over an unwritten extent, - * we need to ensure we convert the unwritten extent. If there is an - * error inserting the mapping, the filesystem needs to leave it as - * unwritten to prevent exposure of the stale underlying data to - * userspace, but we still need to call the completion function so - * the private resources on the mapping buffer can be released. We - * indicate what the callback should do via the uptodate variable, same - * as for normal BH based IO completions. - */ + /* Filesystem should not return unwritten buffers to us! 
*/ + WARN_ON_ONCE(buffer_unwritten(&bh)); error = dax_insert_mapping(inode, &bh, vma, vmf); - if (buffer_unwritten(&bh)) { - if (complete_unwritten) - complete_unwritten(&bh, !error); - else - WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); - } out: if (error == -ENOMEM) @@ -772,7 +752,7 @@ EXPORT_SYMBOL(__dax_fault); * fault handler for DAX files. */ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) + get_block_t get_block) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -781,7 +761,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = __dax_fault(vma, vmf, get_block, complete_unwritten); + result = __dax_fault(vma, vmf, get_block); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); @@ -815,8 +795,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address, #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) + pmd_t *pmd, unsigned int flags, get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -875,6 +854,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (get_block(inode, block, &bh, 1) != 0) return VM_FAULT_SIGBUS; alloc = true; + WARN_ON_ONCE(buffer_unwritten(&bh)); } bdev = bh.b_bdev; @@ -1020,9 +1000,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, out: i_mmap_unlock_read(mapping); - if (buffer_unwritten(&bh)) - complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); - return result; fallback: @@ -1042,8 +1019,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault); * pmd_fault handler for DAX files. 
*/ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) + pmd_t *pmd, unsigned int flags, get_block_t get_block) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -1052,8 +1028,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = __dax_pmd_fault(vma, address, pmd, flags, get_block, - complete_unwritten); + result = __dax_pmd_fault(vma, address, pmd, flags, get_block); if (flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index c1400b109805..868c02317b05 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = __dax_fault(vma, vmf, ext2_get_block, NULL); + ret = __dax_fault(vma, vmf, ext2_get_block); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, } down_read(&ei->dax_sem); - ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); + ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); up_read(&ei->dax_sem); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 37e28082885a..7a6398867ff2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -207,7 +207,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL); + result = __dax_fault(vma, vmf, ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) @@ -243,7 +243,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_get_block, NULL); + ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 569938a4a357..c2946f436a3a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1558,7 +1558,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); ret = block_page_mkwrite_return(ret); @@ -1592,7 +1592,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. 
*/ - ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1629,8 +1629,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, - NULL); + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) diff --git a/include/linux/dax.h b/include/linux/dax.h index 636dd59ab505..7c45ac7ea1d1 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -10,10 +10,8 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, - dax_iodone_t); -int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, - dax_iodone_t); +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); @@ -27,21 +25,20 @@ static inline struct page *read_dax_sector(struct block_device *bdev, #ifdef CONFIG_TRANSPARENT_HUGEPAGE int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, - unsigned int flags, get_block_t, dax_iodone_t); + unsigned int flags, get_block_t); int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, - unsigned int flags, get_block_t, dax_iodone_t); + unsigned int flags, get_block_t); #else static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags, get_block_t gb, - dax_iodone_t di) + pmd_t *pmd, unsigned int flags, get_block_t gb) { return VM_FAULT_FALLBACK; } #define __dax_pmd_fault dax_pmd_fault #endif int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) -#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) +#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) +#define __dax_mkwrite(vma, vmf, gb) __dax_fault(vma, vmf, gb) static inline bool vma_is_dax(struct vm_area_struct *vma) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 70e61b58baaf..9f2813090d1b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -74,7 +74,6 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); -typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 -- cgit v1.2.3 From cfbcf468454ab4b20f0b4b62da51920b99fdb19e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 28 Apr 2016 12:30:53 -0300 Subject: perf core: Pass max stack as a perf_callchain_entry context This makes perf_callchain_{user,kernel}() receive the max stack as context for the perf_callchain_entry, instead of accessing the global sysctl_perf_event_max_stack. 
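For illustration, here is the shape of the conversion in an arch stack walker, reduced to a minimal sketch rather than any single arch's real unwinder (first_frame() is a made-up stand-in for each arch's frame setup; struct frame_tail and user_backtrace() are the arm names used below):

#include <linux/perf_event.h>

/*
 * Sketch only. Before this patch a walker bounded itself with the
 * global sysctl:
 *
 *	while (entry->nr < sysctl_perf_event_max_stack)
 *		tail = user_backtrace(tail, entry);
 *
 * After it, the bound travels with the context wrapping the entry:
 */
void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
			 struct pt_regs *regs)
{
	struct frame_tail __user *tail = first_frame(regs);

	while (entry->entry->nr < entry->max_stack &&
	       tail && !((unsigned long)tail & 0x3))
		tail = user_backtrace(tail, entry);
}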
Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: David Ahern Cc: Frederic Weisbecker Cc: He Kuang Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: Zefan Li Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- arch/arc/kernel/perf_event.c | 6 +++--- arch/arm/kernel/perf_callchain.c | 10 +++++----- arch/arm64/kernel/perf_callchain.c | 14 +++++++------- arch/metag/kernel/perf_callchain.c | 10 +++++----- arch/mips/kernel/perf_event.c | 12 ++++++------ arch/powerpc/perf/callchain.c | 14 +++++++------- arch/s390/kernel/perf_event.c | 4 ++-- arch/sh/kernel/perf_callchain.c | 4 ++-- arch/sparc/kernel/perf_event.c | 14 +++++++------- arch/tile/kernel/perf_event.c | 6 +++--- arch/x86/events/core.c | 14 +++++++------- arch/xtensa/kernel/perf_event.c | 10 +++++----- include/linux/perf_event.h | 16 +++++++++++----- kernel/bpf/stackmap.c | 3 ++- kernel/events/callchain.c | 20 ++++++++++++-------- 15 files changed, 84 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 8b134cfe5e1f..6fd48021324b 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -48,7 +48,7 @@ struct arc_callchain_trace { static int callchain_trace(unsigned int addr, void *data) { struct arc_callchain_trace *ctrl = data; - struct perf_callchain_entry *entry = ctrl->perf_stuff; + struct perf_callchain_entry_ctx *entry = ctrl->perf_stuff; perf_callchain_store(entry, addr); if (ctrl->depth++ < 3) @@ -58,7 +58,7 @@ static int callchain_trace(unsigned int addr, void *data) } void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct arc_callchain_trace ctrl = { .depth = 0, @@ -69,7 +69,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) } void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { /* * User stack can't be unwound trivially with kernel dwarf unwinder diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 27563befa8a2..bc552e813e7b 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -31,7 +31,7 @@ struct frame_tail { */ static struct frame_tail __user * user_backtrace(struct frame_tail __user *tail, - struct perf_callchain_entry *entry) + struct perf_callchain_entry_ctx *entry) { struct frame_tail buftail; unsigned long err; @@ -59,7 +59,7 @@ user_backtrace(struct frame_tail __user *tail, } void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct frame_tail __user *tail; @@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) tail = (struct frame_tail __user *)regs->ARM_fp - 1; - while ((entry->nr < sysctl_perf_event_max_stack) && + while ((entry->entry->nr < entry->max_stack) && tail && !((unsigned long)tail & 0x3)) tail = user_backtrace(tail, entry); } @@ -89,13 +89,13 @@ static int callchain_trace(struct stackframe *fr, void *data) { - struct perf_callchain_entry *entry = data; + struct 
perf_callchain_entry_ctx *entry = data; perf_callchain_store(entry, fr->pc); return 0; } void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stackframe fr; diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 32c3c6e70119..0d60150057cf 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -31,7 +31,7 @@ struct frame_tail { */ static struct frame_tail __user * user_backtrace(struct frame_tail __user *tail, - struct perf_callchain_entry *entry) + struct perf_callchain_entry_ctx *entry) { struct frame_tail buftail; unsigned long err; @@ -76,7 +76,7 @@ struct compat_frame_tail { static struct compat_frame_tail __user * compat_user_backtrace(struct compat_frame_tail __user *tail, - struct perf_callchain_entry *entry) + struct perf_callchain_entry_ctx *entry) { struct compat_frame_tail buftail; unsigned long err; @@ -106,7 +106,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail, } #endif /* CONFIG_COMPAT */ -void perf_callchain_user(struct perf_callchain_entry *entry, +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { @@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct frame_tail __user *)regs->regs[29]; - while (entry->nr < sysctl_perf_event_max_stack && + while (entry->entry->nr < entry->max_stack && tail && !((unsigned long)tail & 0xf)) tail = user_backtrace(tail, entry); } else { @@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - while ((entry->nr < sysctl_perf_event_max_stack) && + while ((entry->entry->nr < entry->max_stack) && tail && !((unsigned long)tail & 0x3)) tail = compat_user_backtrace(tail, entry); #endif @@ -146,12 +146,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry, */ static int callchain_trace(struct stackframe *frame, void *data) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; perf_callchain_store(entry, frame->pc); return 0; } -void perf_callchain_kernel(struct perf_callchain_entry *entry, +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stackframe frame; diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c index 252abc12a5a3..b3261a98b15b 100644 --- a/arch/metag/kernel/perf_callchain.c +++ b/arch/metag/kernel/perf_callchain.c @@ -29,7 +29,7 @@ static bool is_valid_call(unsigned long calladdr) static struct metag_frame __user * user_backtrace(struct metag_frame __user *user_frame, - struct perf_callchain_entry *entry) + struct perf_callchain_entry_ctx *entry) { struct metag_frame frame; unsigned long calladdr; @@ -56,7 +56,7 @@ user_backtrace(struct metag_frame __user *user_frame, } void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long sp = regs->ctx.AX[0].U0; struct metag_frame __user *frame; @@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) --frame; - while ((entry->nr < sysctl_perf_event_max_stack) && frame) + while ((entry->entry->nr < entry->max_stack) && frame) frame = user_backtrace(frame, entry); } @@ 
-78,13 +78,13 @@ static int callchain_trace(struct stackframe *fr, void *data) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; perf_callchain_store(entry, fr->pc); return 0; } void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stackframe fr; diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index 5021c546ad07..22395c7d7030 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -25,8 +25,8 @@ * the user stack callchains, we will add it here. */ -static void save_raw_perf_callchain(struct perf_callchain_entry *entry, - unsigned long reg29) +static void save_raw_perf_callchain(struct perf_callchain_entry_ctx *entry, + unsigned long reg29) { unsigned long *sp = (unsigned long *)reg29; unsigned long addr; @@ -35,14 +35,14 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry, addr = *sp++; if (__kernel_text_address(addr)) { perf_callchain_store(entry, addr); - if (entry->nr >= sysctl_perf_event_max_stack) + if (entry->entry->nr >= entry->max_stack) break; } } } -void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) { unsigned long sp = regs->regs[29]; #ifdef CONFIG_KALLSYMS @@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } do { perf_callchain_store(entry, pc); - if (entry->nr >= sysctl_perf_event_max_stack) + if (entry->entry->nr >= entry->max_stack) break; pc = unwind_stack(current, &sp, pc, &ra); } while (pc); diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 22d9015c1acc..c9260c1dfdbc 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -47,7 +47,7 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp) } void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long sp, next_sp; unsigned long next_ip; @@ -232,7 +232,7 @@ static int sane_signal_64_frame(unsigned long sp) puc == (unsigned long) &sf->uc; } -static void perf_callchain_user_64(struct perf_callchain_entry *entry, +static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long sp, next_sp; @@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->entry->nr < entry->max_stack) { fp = (unsigned long __user *) sp; if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) return; @@ -319,7 +319,7 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) return rc; } -static inline void perf_callchain_user_64(struct perf_callchain_entry *entry, +static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } @@ -439,7 +439,7 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp, return mctx->mc_gregs; } -static void perf_callchain_user_32(struct perf_callchain_entry *entry, +static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned int sp, next_sp; @@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct 
perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->entry->nr < entry->max_stack) { fp = (unsigned int __user *) (unsigned long) sp; if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) return; @@ -487,7 +487,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, } void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (current_is_64bit()) perf_callchain_user_64(entry, regs); diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index c3e4099b60a5..87035fa58bbe 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -224,13 +224,13 @@ arch_initcall(service_level_perf_register); static int __perf_callchain_kernel(void *data, unsigned long address) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; perf_callchain_store(entry, address); return 0; } -void perf_callchain_kernel(struct perf_callchain_entry *entry, +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (user_mode(regs)) diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c index cc80b614b5fa..fa2c0cd23eaa 100644 --- a/arch/sh/kernel/perf_callchain.c +++ b/arch/sh/kernel/perf_callchain.c @@ -21,7 +21,7 @@ static int callchain_stack(void *data, char *name) static void callchain_address(void *data, unsigned long addr, int reliable) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; if (reliable) perf_callchain_store(entry, addr); @@ -33,7 +33,7 @@ static const struct stacktrace_ops callchain_ops = { }; void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { perf_callchain_store(entry, regs->pc); diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index a4b8b5aed21c..bcc5376db74b 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1711,7 +1711,7 @@ static int __init init_hw_perf_events(void) } pure_initcall(init_hw_perf_events); -void perf_callchain_kernel(struct perf_callchain_entry *entry, +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long ksp, fp; @@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } } #endif - } while (entry->nr < sysctl_perf_event_max_stack); + } while (entry->entry->nr < entry->max_stack); } static inline int @@ -1769,7 +1769,7 @@ valid_user_frame(const void __user *fp, unsigned long size) return (__range_not_ok(fp, size, TASK_SIZE) == 0); } -static void perf_callchain_user_64(struct perf_callchain_entry *entry, +static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long ufp; @@ -1790,10 +1790,10 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, pc = sf.callers_pc; ufp = (unsigned long)sf.fp + STACK_BIAS; perf_callchain_store(entry, pc); - } while (entry->nr < sysctl_perf_event_max_stack); + } while (entry->entry->nr < entry->max_stack); } -static void perf_callchain_user_32(struct perf_callchain_entry *entry, +static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long ufp; @@ 
-1822,11 +1822,11 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, ufp = (unsigned long)sf.fp; } perf_callchain_store(entry, pc); - } while (entry->nr < sysctl_perf_event_max_stack); + } while (entry->entry->nr < entry->max_stack); } void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { u64 saved_fault_address = current_thread_info()->fault_address; u8 saved_fault_code = get_thread_fault_code(); diff --git a/arch/tile/kernel/perf_event.c b/arch/tile/kernel/perf_event.c index 8767060d70fb..6394c1ccb68e 100644 --- a/arch/tile/kernel/perf_event.c +++ b/arch/tile/kernel/perf_event.c @@ -941,7 +941,7 @@ arch_initcall(init_hw_perf_events); /* * Tile specific backtracing code for perf_events. */ -static inline void perf_callchain(struct perf_callchain_entry *entry, +static inline void perf_callchain(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct KBacktraceIterator kbt; @@ -992,13 +992,13 @@ static inline void perf_callchain(struct perf_callchain_entry *entry, } } -void perf_callchain_user(struct perf_callchain_entry *entry, +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { perf_callchain(entry, regs); } -void perf_callchain_kernel(struct perf_callchain_entry *entry, +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { perf_callchain(entry, regs); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 5e5e76a52f58..07f2b01cfb72 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2202,7 +2202,7 @@ static int backtrace_stack(void *data, char *name) static int backtrace_address(void *data, unsigned long addr, int reliable) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; return perf_callchain_store(entry, addr); } @@ -2214,7 +2214,7 @@ static const struct stacktrace_ops backtrace_ops = { }; void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ @@ -2268,7 +2268,7 @@ static unsigned long get_segment_base(unsigned int segment) #include static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { /* 32-bit process in 64-bit kernel. 
*/ unsigned long ss_base, cs_base; @@ -2283,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->entry->nr < entry->max_stack) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; @@ -2309,14 +2309,14 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) } #else static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { return 0; } #endif void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; const void __user *fp; @@ -2343,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; pagefault_disable(); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->entry->nr < entry->max_stack) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c index a6b00b3af429..ef90479e0397 100644 --- a/arch/xtensa/kernel/perf_event.c +++ b/arch/xtensa/kernel/perf_event.c @@ -323,23 +323,23 @@ static void xtensa_pmu_read(struct perf_event *event) static int callchain_trace(struct stackframe *frame, void *data) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; perf_callchain_store(entry, frame->pc); return 0; } -void perf_callchain_kernel(struct perf_callchain_entry *entry, +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack, + xtensa_backtrace_kernel(regs, entry->max_stack, callchain_trace, NULL, entry); } -void perf_callchain_user(struct perf_callchain_entry *entry, +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - xtensa_backtrace_user(regs, sysctl_perf_event_max_stack, + xtensa_backtrace_user(regs, entry->max_stack, callchain_trace, entry); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9e1c3ada91c4..dbd18246b36e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -61,6 +61,11 @@ struct perf_callchain_entry { __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */ }; +struct perf_callchain_entry_ctx { + struct perf_callchain_entry *entry; + u32 max_stack; +}; + struct perf_raw_record { u32 size; void *data; @@ -1063,19 +1068,20 @@ extern void perf_event_fork(struct task_struct *tsk); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); -extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs); -extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs); +extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); +extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, - bool crosstask, bool add_mark); + u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); extern int 
sysctl_perf_event_max_stack; -static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) +static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { - if (entry->nr < sysctl_perf_event_max_stack) { + struct perf_callchain_entry *entry = ctx->entry; + if (entry->nr < ctx->max_stack) { entry->ip[entry->nr++] = ip; return 0; } else { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index f5a19548be12..a82d7605db3f 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -136,7 +136,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - trace = get_perf_callchain(regs, init_nr, kernel, user, false, false); + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 7fc89939ede9..af95ad92893a 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -32,12 +32,12 @@ static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, +__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } -__weak void perf_callchain_user(struct perf_callchain_entry *entry, +__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } @@ -176,14 +176,15 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return NULL; - return get_perf_callchain(regs, 0, kernel, user, crosstask, true); + return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, - bool crosstask, bool add_mark) + u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; + struct perf_callchain_entry_ctx ctx; int rctx; entry = get_callchain_entry(&rctx); @@ -193,12 +194,15 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (!entry) goto exit_put; + ctx.entry = entry; + ctx.max_stack = max_stack; + entry->nr = init_nr; if (kernel && !user_mode(regs)) { if (add_mark) - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); + perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(&ctx, regs); } if (user) { @@ -214,8 +218,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, goto exit_put; if (add_mark) - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); + perf_callchain_store(&ctx, PERF_CONTEXT_USER); + perf_callchain_user(&ctx, regs); } } -- cgit v1.2.3 From 3b1fff08038bd0792b1aa1e9703b2dd0512a3fd0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 May 2016 18:08:32 -0300 Subject: perf core: Add a 'nr' field to perf_event_callchain_context We will use it to count how many addresses are in the entry->ip[] array, excluding PERF_CONTEXT_{KERNEL,USER,etc} entries, so that we can really return the number of entries specified by the user via the relevant sysctl, kernel.perf_event_max_stack, or via the per event perf_event_attr.sample_max_stack knob.
This way we keep the perf_sample->ip_callchain->nr meaning, that is the number of entries, be it real addresses or PERF_CONTEXT_ entries, while honouring the max_stack knobs, i.e. the end result will be max_stack entries if we have at least that many entries in a given stack trace. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-s8teto51tdqvlfhefndtat9r@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- arch/arm/kernel/perf_callchain.c | 2 +- arch/arm64/kernel/perf_callchain.c | 4 ++-- arch/metag/kernel/perf_callchain.c | 2 +- arch/mips/kernel/perf_event.c | 4 ++-- arch/powerpc/perf/callchain.c | 4 ++-- arch/sparc/kernel/perf_event.c | 6 +++--- arch/x86/events/core.c | 4 ++-- include/linux/perf_event.h | 6 ++++-- kernel/events/callchain.c | 3 +-- 9 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index bc552e813e7b..22bf1f64d99a 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs tail = (struct frame_tail __user *)regs->ARM_fp - 1; - while ((entry->entry->nr < entry->max_stack) && + while ((entry->nr < entry->max_stack) && tail && !((unsigned long)tail & 0x3)) tail = user_backtrace(tail, entry); } diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 0d60150057cf..713ca824f266 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, tail = (struct frame_tail __user *)regs->regs[29]; - while (entry->entry->nr < entry->max_stack && + while (entry->nr < entry->max_stack && tail && !((unsigned long)tail & 0xf)) tail = user_backtrace(tail, entry); } else { @@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - while ((entry->entry->nr < entry->max_stack) && + while ((entry->nr < entry->max_stack) && tail && !((unsigned long)tail & 0x3)) tail = compat_user_backtrace(tail, entry); #endif diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c index b3261a98b15b..3e8e048040df 100644 --- a/arch/metag/kernel/perf_callchain.c +++ b/arch/metag/kernel/perf_callchain.c @@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs --frame; - while ((entry->entry->nr < entry->max_stack) && frame) + while ((entry->nr < entry->max_stack) && frame) frame = user_backtrace(frame, entry); } diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index 22395c7d7030..d64056e0bb56 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry_ctx *entry, addr = *sp++; if (__kernel_text_address(addr)) { perf_callchain_store(entry, addr); - if (entry->entry->nr >= entry->max_stack) + if (entry->nr >= entry->max_stack) break; } } @@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } do { perf_callchain_store(entry, pc); - if (entry->entry->nr >= entry->max_stack) + if (entry->nr >= entry->max_stack) break; pc = unwind_stack(current, &sp, pc, &ra); } while (pc); diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 
c9260c1dfdbc..f68f213dc36c 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->entry->nr < entry->max_stack) { + while (entry->nr < entry->max_stack) { fp = (unsigned long __user *) sp; if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) return; @@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->entry->nr < entry->max_stack) { + while (entry->nr < entry->max_stack) { fp = (unsigned int __user *) (unsigned long) sp; if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) return; diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index bcc5376db74b..710f3278d448 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } } #endif - } while (entry->entry->nr < entry->max_stack); + } while (entry->nr < entry->max_stack); } static inline int @@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, pc = sf.callers_pc; ufp = (unsigned long)sf.fp + STACK_BIAS; perf_callchain_store(entry, pc); - } while (entry->entry->nr < entry->max_stack); + } while (entry->nr < entry->max_stack); } static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, @@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, ufp = (unsigned long)sf.fp; } perf_callchain_store(entry, pc); - } while (entry->entry->nr < entry->max_stack); + } while (entry->nr < entry->max_stack); } void diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 07f2b01cfb72..5de96a18cd9c 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2283,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); - while (entry->entry->nr < entry->max_stack) { + while (entry->nr < entry->max_stack) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; @@ -2343,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs return; pagefault_disable(); - while (entry->entry->nr < entry->max_stack) { + while (entry->nr < entry->max_stack) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dbd18246b36e..3803bb1a862b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -64,6 +64,7 @@ struct perf_callchain_entry { struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; + u32 nr; }; struct perf_raw_record { @@ -1080,9 +1081,10 @@ extern int sysctl_perf_event_max_stack; static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { - struct perf_callchain_entry *entry = ctx->entry; - if (entry->nr < ctx->max_stack) { + if (ctx->nr < ctx->max_stack) { + struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; + ++ctx->nr; return 0; } else { return -1; /* no more room, stop walking the stack */ diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index af95ad92893a..8774ff86debb 100644 --- a/kernel/events/callchain.c +++ 
b/kernel/events/callchain.c @@ -196,8 +196,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, ctx.entry = entry; ctx.max_stack = max_stack; - - entry->nr = init_nr; + ctx.nr = entry->nr = init_nr; if (kernel && !user_mode(regs)) { if (add_mark) -- cgit v1.2.3 From 3e4de4ec4cfea40994b47a79767610153edbf45b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 May 2016 13:01:50 -0300 Subject: perf core: Add perf_callchain_store_context() helper We need to have different helpers to account for how many contexts we have in the sample and for real addresses, so do it now as a prep patch, to ease review. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-q964tnyuqrxw5gld18vizs3c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- arch/powerpc/perf/callchain.c | 6 +++--- include/linux/perf_event.h | 2 ++ kernel/events/callchain.c | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index f68f213dc36c..f62597dbd757 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -76,7 +76,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re next_ip = regs->nip; lr = regs->link; level = 0; - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_store_context(entry, PERF_CONTEXT_KERNEL); } else { if (level == 0) @@ -274,7 +274,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, read_user_stack_64(&uregs[PT_R1], &sp)) return; level = 0; - perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store_context(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, next_ip); continue; } @@ -473,7 +473,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, read_user_stack_32(&uregs[PT_R1], &sp)) return; level = 0; - perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store_context(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, next_ip); continue; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3803bb1a862b..2024b14cc2b1 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1079,6 +1079,8 @@ extern void put_callchain_buffers(void); extern int sysctl_perf_event_max_stack; +#define perf_callchain_store_context(ctx, context) perf_callchain_store(ctx, context) + static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->nr < ctx->max_stack) { diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 8774ff86debb..ca645736a983 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -200,7 +200,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (kernel && !user_mode(regs)) { if (add_mark) - perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL); + perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); perf_callchain_kernel(&ctx, regs); } @@ -217,7 +217,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, goto exit_put; if (add_mark) - perf_callchain_store(&ctx, PERF_CONTEXT_USER); + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); perf_callchain_user(&ctx, regs); } } -- cgit v1.2.3 From c85b03349640b34f3545503c8429fc43005e9a92 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 May 2016 13:06:21 -0300 Subject: perf core: Separate accounting
of contexts and real addresses in a stack trace The perf_sample->ip_callchain->nr value includes all the entries in the ip_callchain->ip[] array, real addresses and PERF_CONTEXT_{KERNEL,USER,etc}, while what the user expects is that what is in the kernel.perf_event_max_stack sysctl or in the upcoming per event perf_event_attr.sample_max_stack knob be honoured in terms of IP addresses in the stack trace. So allocate a bunch of extra entries for contexts, and do the accounting via perf_callchain_entry_ctx struct members. A new sysctl, kernel.perf_event_max_contexts_per_stack, is also introduced for investigating possible bugs in the callchain implementation by some arches. Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: David Ahern Cc: Frederic Weisbecker Cc: He Kuang Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: Zefan Li Link: http://lkml.kernel.org/n/tip-3b4wnqk340c4sg4gwkfdi9yk@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- Documentation/sysctl/kernel.txt | 14 ++++++++++++++ include/linux/perf_event.h | 18 ++++++++++++++++-- include/uapi/linux/perf_event.h | 1 + kernel/events/callchain.c | 10 +++++++++- kernel/sysctl.c | 9 +++++++++ 5 files changed, 49 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index daabdd7ee543..a3683ce2a2f3 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -61,6 +61,7 @@ show up in /proc/sys/kernel: - perf_cpu_time_max_percent - perf_event_paranoid - perf_event_max_stack +- perf_event_max_contexts_per_stack - pid_max - powersave-nap [ PPC only ] - printk @@ -668,6 +669,19 @@ The default value is 127. ============================================================== +perf_event_max_contexts_per_stack: + +Controls maximum number of stack frame context entries for +(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for +instance, when using 'perf record -g' or 'perf trace --call-graph fp'. + +This can only be done when no events are in use that have callchains +enabled, otherwise writing to this file will return -EBUSY. + +The default value is 8. + +============================================================== + pid_max: PID allocation wrap value.
When the kernel's next PID value diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2024b14cc2b1..6b87be908790 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -65,6 +65,8 @@ struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; u32 nr; + short contexts; + bool contexts_maxed; }; struct perf_raw_record { @@ -1078,12 +1080,24 @@ extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); extern int sysctl_perf_event_max_stack; +extern int sysctl_perf_event_max_contexts_per_stack; -#define perf_callchain_store_context(ctx, context) perf_callchain_store(ctx, context) +static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) +{ + if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { + struct perf_callchain_entry *entry = ctx->entry; + entry->ip[entry->nr++] = ip; + ++ctx->contexts; + return 0; + } else { + ctx->contexts_maxed = true; + return -1; /* no more room, stop walking the stack */ + } +} static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { - if (ctx->nr < ctx->max_stack) { + if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->nr; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 43fc8d213472..36ce552cf6a9 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -862,6 +862,7 @@ enum perf_event_type { }; #define PERF_MAX_STACK_DEPTH 127 +#define PERF_MAX_CONTEXTS_PER_STACK 8 enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index ca645736a983..179ef4640964 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -19,11 +19,13 @@ struct callchain_cpus_entries { }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; +int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; static inline size_t perf_callchain_entry__sizeof(void) { return (sizeof(struct perf_callchain_entry) + - sizeof(__u64) * sysctl_perf_event_max_stack); + sizeof(__u64) * (sysctl_perf_event_max_stack + + sysctl_perf_event_max_contexts_per_stack)); } static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); @@ -197,6 +199,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, ctx.entry = entry; ctx.max_stack = max_stack; ctx.nr = entry->nr = init_nr; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) @@ -228,6 +232,10 @@ exit_put: return entry; } +/* + * Used for sysctl_perf_event_max_stack and + * sysctl_perf_event_max_contexts_per_stack. 
+ */ int perf_event_max_stack_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0ec6907a16b3..bec4c11c47d6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1156,6 +1156,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &six_hundred_forty_kb, }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = &zero, + .extra2 = &one_thousand, + }, #endif #ifdef CONFIG_KMEMCHECK { -- cgit v1.2.3 From 2af3a8159cd204fc8437ed2f75863f0fb930f0d0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 10 May 2016 10:23:52 -0600 Subject: block: Add vfs_msg() interface In preparation for moving DAX capability checks to the block layer from filesystem code, add a VFS message interface that aligns with the filesystems' message format. For instance, a vfs_msg() message followed by XFS messages in case of a dax mount error may look like: VFS (pmem0p1): error: unaligned partition for dax XFS (pmem0p1): DAX unsupported by block device. Turning off DAX. XFS (pmem0p1): Mounting V5 Filesystem : vfs_msg() is largely based on ext4_msg(). Signed-off-by: Toshi Kani Reviewed-by: Christoph Hellwig Cc: Alexander Viro Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Jan Kara Cc: Dave Chinner Cc: Dan Williams Cc: Ross Zwisler Cc: Christoph Hellwig Cc: Boaz Harrosh Signed-off-by: Vishal Verma --- fs/block_dev.c | 12 ++++++++++++ include/linux/blkdev.h | 11 +++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index b25bb230b28a..91e0ec0233c0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -50,6 +50,18 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); +void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); +} + static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 669e419d6234..78c48ab22f46 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -767,6 +767,17 @@ static inline void rq_flush_dcache_pages(struct request *rq) } #endif +#ifdef CONFIG_PRINTK +#define vfs_msg(sb, level, fmt, ...) \ + __vfs_msg(sb, level, fmt, ##__VA_ARGS__) +#else +#define vfs_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __vfs_msg(sb, "", " "); \ +} while (0) +#endif + extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); -- cgit v1.2.3 From 2d96afc8f70ef86c66a0b5d80c24a27d6dd13df3 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 10 May 2016 10:23:53 -0600 Subject: block: Add bdev_dax_supported() for dax mount checks DAX imposes additional requirements on a device. Add bdev_dax_supported(), which performs all the precondition checks necessary for a filesystem to mount the device with the dax option. Also add a new check to verify that a partition is aligned on a 4KB boundary. When a partition is unaligned, any dax read/write access fails, except for metadata updates.
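As a usage illustration, a filesystem's mount path would gate its dax option on the new helper roughly as follows. This is a minimal sketch with made-up names (examplefs, EXAMPLEFS_MOUNT_DAX), not one of the real filesystem conversions:

#include <linux/blkdev.h>
#include <linux/fs.h>

struct examplefs_sb_info {
	unsigned long mount_opt;
};
#define EXAMPLEFS_MOUNT_DAX	(1UL << 0)

/* Sketch: reject a dax mount early when the device cannot support it. */
static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct examplefs_sb_info *sbi = sb->s_fs_info;

	if (sbi->mount_opt & EXAMPLEFS_MOUNT_DAX) {
		int err = bdev_dax_supported(sb, sb->s_blocksize);

		if (err)
			return err;	/* vfs_msg() has already explained why */
	}

	/* ... normal superblock setup continues here ... */
	return 0;
}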
Signed-off-by: Toshi Kani Reviewed-by: Christoph Hellwig Cc: Alexander Viro Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Jan Kara Cc: Dave Chinner Cc: Dan Williams Cc: Ross Zwisler Cc: Christoph Hellwig Cc: Boaz Harrosh Signed-off-by: Vishal Verma --- fs/block_dev.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 91e0ec0233c0..518cde62c01c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -509,6 +509,51 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) } EXPORT_SYMBOL_GPL(bdev_direct_access); +/** + * bdev_dax_supported() - Check if the device supports dax for filesystem + * @sb: The superblock of the device + * @blocksize: The block size of the device + * + * This is a library function for filesystems to check if the block device + * can be mounted with dax option. + * + * Return: negative errno if unsupported, 0 if supported. + */ +int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + struct blk_dax_ctl dax = { + .sector = 0, + .size = PAGE_SIZE, + }; + int err; + + if (blocksize != PAGE_SIZE) { + vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); + return -EINVAL; + } + + err = bdev_direct_access(sb->s_bdev, &dax); + if (err < 0) { + switch (err) { + case -EOPNOTSUPP: + vfs_msg(sb, KERN_ERR, + "error: device does not support dax"); + break; + case -EINVAL: + vfs_msg(sb, KERN_ERR, + "error: unaligned partition for dax"); + break; + default: + vfs_msg(sb, KERN_ERR, + "error: dax access failed (%d)", err); + } + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(bdev_dax_supported); + /* * pseudo-fs */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 78c48ab22f46..71231a55debd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1688,6 +1688,7 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); +extern int bdev_dax_supported(struct super_block *, int); #else /* CONFIG_BLOCK */ struct block_device; -- cgit v1.2.3 From a8078b1fc616da6112eb95f0063cd34531d4ccf0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 10 May 2016 10:23:57 -0600 Subject: block: Update blkdev_dax_capable() for consistency blkdev_dax_capable() is similar to bdev_dax_supported(), but needs to remain as a separate interface for checking dax capability of a raw block device. Rename and relocate blkdev_dax_capable() to keep them maintained consistently, and call bdev_direct_access() for the dax capability check. There is no change in the behavior. 
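Note how the capability check stays cheap: it probes only the first and the last page of the partition through bdev_direct_access(). With 4KB pages and 512-byte sectors a page spans PAGE_SIZE / 512 = 8 sectors, so the second probe starts eight sectors before the end, and a misaligned partition start or length makes one of the probes fail (the "unaligned partition for dax" case shown in the vfs_msg() example above). A caller on the raw block device open path then simply drops S_DAX, mirroring the __blkdev_get() hunks below; example_setup_dax() here is a made-up wrapper for illustration:

#include <linux/blkdev.h>
#include <linux/fs.h>

/* Sketch only: raw block device dax is all-or-nothing at open time. */
static void example_setup_dax(struct block_device *bdev)
{
	if (!bdev_dax_capable(bdev))
		bdev->bd_inode->i_flags &= ~S_DAX;
}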
Link: https://lkml.org/lkml/2016/5/9/950 Signed-off-by: Toshi Kani Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Jens Axboe Cc: Andreas Dilger Cc: Jan Kara Cc: Dave Chinner Cc: Dan Williams Cc: Ross Zwisler Cc: Christoph Hellwig Cc: Boaz Harrosh Signed-off-by: Vishal Verma --- block/ioctl.c | 30 ------------------------------ fs/block_dev.c | 39 +++++++++++++++++++++++++++++++++++++-- include/linux/blkdev.h | 1 + include/linux/fs.h | 8 -------- 4 files changed, 38 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/block/ioctl.c b/block/ioctl.c index 4ff1f92f89ca..7eeda072dc70 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -407,35 +406,6 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } -#ifdef CONFIG_FS_DAX -bool blkdev_dax_capable(struct block_device *bdev) -{ - struct gendisk *disk = bdev->bd_disk; - - if (!disk->fops->direct_access) - return false; - - /* - * If the partition is not aligned on a page boundary, we can't - * do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) - || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) - return false; - - /* - * If the device has known bad blocks, force all I/O through the - * driver / page cache. - * - * TODO: support finer grained dax error handling - */ - if (disk->bb && disk->bb->count) - return false; - - return true; -} -#endif - static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 518cde62c01c..8477d4501b1e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "internal.h" @@ -554,6 +555,40 @@ int bdev_dax_supported(struct super_block *sb, int blocksize) } EXPORT_SYMBOL_GPL(bdev_dax_supported); +/** + * bdev_dax_capable() - Return if the raw device is capable for dax + * @bdev: The device for raw block device access + */ +bool bdev_dax_capable(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + struct blk_dax_ctl dax = { + .size = PAGE_SIZE, + }; + + if (!IS_ENABLED(CONFIG_FS_DAX)) + return false; + + dax.sector = 0; + if (bdev_direct_access(bdev, &dax) < 0) + return false; + + dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512); + if (bdev_direct_access(bdev, &dax) < 0) + return false; + + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. 
+ * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + + return true; +} + /* * pseudo-fs */ @@ -1295,7 +1330,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - if (!blkdev_dax_capable(bdev)) + if (!bdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } @@ -1332,7 +1367,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - if (!blkdev_dax_capable(bdev)) + if (!bdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } } else { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 71231a55debd..27cbefe8c985 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1689,6 +1689,7 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); extern int bdev_dax_supported(struct super_block *, int); +extern bool bdev_dax_capable(struct block_device *); #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/fs.h b/include/linux/fs.h index 9f2813090d1b..17f934fcf564 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2319,14 +2319,6 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); -#ifdef CONFIG_FS_DAX -extern bool blkdev_dax_capable(struct block_device *bdev); -#else -static inline bool blkdev_dax_capable(struct block_device *bdev) -{ - return false; -} -#endif extern struct super_block *blockdev_superblock; -- cgit v1.2.3 From 459a25afe97cb3d7f978b90c881f4d7aac8fb755 Mon Sep 17 00:00:00 2001 From: Boris BREZILLON Date: Wed, 30 Mar 2016 22:03:27 +0200 Subject: pwm: Get rid of pwm->lock PWM devices are not protected against concurrent accesses. The lock in struct pwm_device might let PWM users think it is, but it's actually only protecting the enabled state. Removing this lock should be fine as long as all PWM users are aware that accesses to the PWM device have to be serialized, which seems to be the case for all of them except the sysfs interface. Patch the sysfs code by adding a lock to the pwm_export struct and making sure it's taken for all relevant accesses to the exported PWM device. 
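For PWM users other than sysfs the contract is now explicit: callers serialize their own accesses to the device. A minimal sketch of that contract in a hypothetical consumer driver (all example_fan names are made up):

#include <linux/mutex.h>
#include <linux/pwm.h>

struct example_fan {
	struct pwm_device *pwm;
	struct mutex lock;	/* serializes every access to ->pwm */
};

static int example_fan_set(struct example_fan *fan, int duty_ns, int period_ns)
{
	int ret;

	mutex_lock(&fan->lock);
	ret = pwm_config(fan->pwm, duty_ns, period_ns);
	if (!ret)
		ret = pwm_enable(fan->pwm);
	mutex_unlock(&fan->lock);

	return ret;
}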
Signed-off-by: Boris Brezillon Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 19 ++++--------------- drivers/pwm/sysfs.c | 24 ++++++++++++++++++++---- include/linux/pwm.h | 2 -- 3 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 22cf3959041c..cb762cf51332 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -269,7 +269,6 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip, pwm->pwm = chip->base + i; pwm->hwpwm = i; pwm->polarity = polarity; - mutex_init(&pwm->lock); radix_tree_insert(&pwm_tree, pwm->pwm, pwm); } @@ -474,22 +473,16 @@ int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity) if (!pwm->chip->ops->set_polarity) return -ENOSYS; - mutex_lock(&pwm->lock); - - if (pwm_is_enabled(pwm)) { - err = -EBUSY; - goto unlock; - } + if (pwm_is_enabled(pwm)) + return -EBUSY; err = pwm->chip->ops->set_polarity(pwm->chip, pwm, polarity); if (err) - goto unlock; + return err; pwm->polarity = polarity; -unlock: - mutex_unlock(&pwm->lock); - return err; + return 0; } EXPORT_SYMBOL_GPL(pwm_set_polarity); @@ -506,16 +499,12 @@ int pwm_enable(struct pwm_device *pwm) if (!pwm) return -EINVAL; - mutex_lock(&pwm->lock); - if (!test_and_set_bit(PWMF_ENABLED, &pwm->flags)) { err = pwm->chip->ops->enable(pwm->chip, pwm); if (err) clear_bit(PWMF_ENABLED, &pwm->flags); } - mutex_unlock(&pwm->lock); - return err; } EXPORT_SYMBOL_GPL(pwm_enable); diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c index 9c90886f4123..187ca0875cf6 100644 --- a/drivers/pwm/sysfs.c +++ b/drivers/pwm/sysfs.c @@ -26,6 +26,7 @@ struct pwm_export { struct device child; struct pwm_device *pwm; + struct mutex lock; }; static struct pwm_export *child_to_pwm_export(struct device *child) @@ -53,7 +54,8 @@ static ssize_t period_store(struct device *child, struct device_attribute *attr, const char *buf, size_t size) { - struct pwm_device *pwm = child_to_pwm_device(child); + struct pwm_export *export = child_to_pwm_export(child); + struct pwm_device *pwm = export->pwm; unsigned int val; int ret; @@ -61,7 +63,9 @@ static ssize_t period_store(struct device *child, if (ret) return ret; + mutex_lock(&export->lock); ret = pwm_config(pwm, pwm_get_duty_cycle(pwm), val); + mutex_unlock(&export->lock); return ret ? : size; } @@ -79,7 +83,8 @@ static ssize_t duty_cycle_store(struct device *child, struct device_attribute *attr, const char *buf, size_t size) { - struct pwm_device *pwm = child_to_pwm_device(child); + struct pwm_export *export = child_to_pwm_export(child); + struct pwm_device *pwm = export->pwm; unsigned int val; int ret; @@ -87,7 +92,9 @@ static ssize_t duty_cycle_store(struct device *child, if (ret) return ret; + mutex_lock(&export->lock); ret = pwm_config(pwm, val, pwm_get_period(pwm)); + mutex_unlock(&export->lock); return ret ? : size; } @@ -105,13 +112,16 @@ static ssize_t enable_store(struct device *child, struct device_attribute *attr, const char *buf, size_t size) { - struct pwm_device *pwm = child_to_pwm_device(child); + struct pwm_export *export = child_to_pwm_export(child); + struct pwm_device *pwm = export->pwm; int val, ret; ret = kstrtoint(buf, 0, &val); if (ret) return ret; + mutex_lock(&export->lock); + switch (val) { case 0: pwm_disable(pwm); @@ -124,6 +134,8 @@ static ssize_t enable_store(struct device *child, break; } + mutex_unlock(&export->lock); + return ret ? 
: size; } @@ -151,7 +163,8 @@ static ssize_t polarity_store(struct device *child, struct device_attribute *attr, const char *buf, size_t size) { - struct pwm_device *pwm = child_to_pwm_device(child); + struct pwm_export *export = child_to_pwm_export(child); + struct pwm_device *pwm = export->pwm; enum pwm_polarity polarity; int ret; @@ -162,7 +175,9 @@ static ssize_t polarity_store(struct device *child, else return -EINVAL; + mutex_lock(&export->lock); ret = pwm_set_polarity(pwm, polarity); + mutex_unlock(&export->lock); return ret ? : size; } @@ -203,6 +218,7 @@ static int pwm_export_child(struct device *parent, struct pwm_device *pwm) } export->pwm = pwm; + mutex_init(&export->lock); export->child.release = pwm_export_release; export->child.parent = parent; diff --git a/include/linux/pwm.h b/include/linux/pwm.h index b78d27c42629..d2e7430ccedb 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -106,7 +106,6 @@ enum { * @pwm: global index of the PWM device * @chip: PWM chip providing this PWM device * @chip_data: chip-private data associated with the PWM device - * @lock: used to serialize accesses to the PWM device where necessary * @period: period of the PWM signal (in nanoseconds) * @duty_cycle: duty cycle of the PWM signal (in nanoseconds) * @polarity: polarity of the PWM signal @@ -119,7 +118,6 @@ struct pwm_device { unsigned int pwm; struct pwm_chip *chip; void *chip_data; - struct mutex lock; unsigned int period; unsigned int duty_cycle; -- cgit v1.2.3 From a8c3862551e063344f80c3e05d595f9d8836f355 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 14 Apr 2016 21:17:37 +0200 Subject: pwm: Keep PWM state in sync with hardware state Before the introduction of pwm_args, the core was resetting the PWM period and polarity states to the reference values (those provided through the DT, a PWM lookup table or hardcoded in the driver). Now that all PWM users are correctly using pwm_args to configure their PWM device, we can safely remove the pwm_apply_args() call in pwm_get() and of_pwm_get(). We can also get rid of the pwm_set_period() call in pwm_apply_args(), because PWM users are now directly using pargs->period instead of pwm_get_period(). By doing that we avoid messing with the current PWM period. The only remaining bit in pwm_apply_args() is the initial polarity setting, and it should go away when all PWM users have been patched to use the atomic API (with this API the polarity will be set along with other PWM arguments when configuring the PWM). Signed-off-by: Boris Brezillon Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 14 -------------- include/linux/pwm.h | 1 - 2 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index cb762cf51332..64330595e17b 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -609,13 +609,6 @@ struct pwm_device *of_pwm_get(struct device_node *np, const char *con_id) pwm->label = con_id; - /* - * FIXME: This should be removed once all PWM users properly make use - * of struct pwm_args to initialize the PWM device. As long as this is - * here, the PWM state and hardware state can get out of sync. - */ - pwm_apply_args(pwm); - put: of_node_put(args.np); @@ -750,13 +743,6 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id) pwm->args.period = chosen->period; pwm->args.polarity = chosen->polarity; - /* - * FIXME: This should be removed once all PWM users properly make use - * of struct pwm_args to initialize the PWM device. 
As long as this is - * here, the PWM state and hardware state can get out of sync. - */ - pwm_apply_args(pwm); - out: mutex_unlock(&pwm_lookup_lock); return pwm; diff --git a/include/linux/pwm.h b/include/linux/pwm.h index d2e7430ccedb..7caf549f720e 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -171,7 +171,6 @@ static inline void pwm_get_args(const struct pwm_device *pwm, static inline void pwm_apply_args(struct pwm_device *pwm) { - pwm_set_period(pwm, pwm->args.period); pwm_set_polarity(pwm, pwm->args.polarity); } -- cgit v1.2.3 From 43a276b003ed2e03de9d94b02a1ba49c1849b931 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 14 Apr 2016 21:17:38 +0200 Subject: pwm: Introduce the pwm_state concept The PWM state, represented by its period, duty_cycle and polarity, is currently stored directly in the PWM device. Declare a pwm_state structure embedding those fields so that we can later use this struct to atomically update all the PWM parameters at once. All pwm_get_xxx() helpers are now implemented as wrappers around pwm_get_state(). Signed-off-by: Boris Brezillon Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 8 ++++---- include/linux/pwm.h | 54 +++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 46 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 64330595e17b..f3f91e716a42 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -268,7 +268,7 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip, pwm->chip = chip; pwm->pwm = chip->base + i; pwm->hwpwm = i; - pwm->polarity = polarity; + pwm->state.polarity = polarity; radix_tree_insert(&pwm_tree, pwm->pwm, pwm); } @@ -446,8 +446,8 @@ int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns) if (err) return err; - pwm->duty_cycle = duty_ns; - pwm->period = period_ns; + pwm->state.duty_cycle = duty_ns; + pwm->state.period = period_ns; return 0; } @@ -480,7 +480,7 @@ int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity) if (err) return err; - pwm->polarity = polarity; + pwm->state.polarity = polarity; return 0; } diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 7caf549f720e..51d4005418f9 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -98,6 +98,18 @@ enum { PWMF_EXPORTED = 1 << 2, }; +/* + * struct pwm_state - state of a PWM channel + * @period: PWM period (in nanoseconds) + * @duty_cycle: PWM duty cycle (in nanoseconds) + * @polarity: PWM polarity + */ +struct pwm_state { + unsigned int period; + unsigned int duty_cycle; + enum pwm_polarity polarity; +}; + /** * struct pwm_device - PWM channel object * @label: name of the PWM device @@ -106,10 +118,8 @@ enum { * @pwm: global index of the PWM device * @chip: PWM chip providing this PWM device * @chip_data: chip-private data associated with the PWM device - * @period: period of the PWM signal (in nanoseconds) - * @duty_cycle: duty cycle of the PWM signal (in nanoseconds) - * @polarity: polarity of the PWM signal * @args: PWM arguments + * @state: current PWM channel state */ struct pwm_device { const char *label; @@ -119,13 +129,21 @@ struct pwm_device { struct pwm_chip *chip; void *chip_data; - unsigned int period; - unsigned int duty_cycle; - enum pwm_polarity polarity; - struct pwm_args args; + struct pwm_state state; }; +/** + * pwm_get_state() - retrieve the current PWM state + * @pwm: PWM device + * @state: state to fill with the current PWM state + */ +static inline void pwm_get_state(const struct pwm_device *pwm,
struct pwm_state *state) +{ + *state = pwm->state; +} + static inline bool pwm_is_enabled(const struct pwm_device *pwm) { return test_bit(PWMF_ENABLED, &pwm->flags); @@ -134,23 +152,31 @@ static inline bool pwm_is_enabled(const struct pwm_device *pwm) static inline void pwm_set_period(struct pwm_device *pwm, unsigned int period) { if (pwm) - pwm->period = period; + pwm->state.period = period; } static inline unsigned int pwm_get_period(const struct pwm_device *pwm) { - return pwm ? pwm->period : 0; + struct pwm_state state; + + pwm_get_state(pwm, &state); + + return state.period; } static inline void pwm_set_duty_cycle(struct pwm_device *pwm, unsigned int duty) { if (pwm) - pwm->duty_cycle = duty; + pwm->state.duty_cycle = duty; } static inline unsigned int pwm_get_duty_cycle(const struct pwm_device *pwm) { - return pwm ? pwm->duty_cycle : 0; + struct pwm_state state; + + pwm_get_state(pwm, &state); + + return state.duty_cycle; } /* @@ -160,7 +186,11 @@ int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity); static inline enum pwm_polarity pwm_get_polarity(const struct pwm_device *pwm) { - return pwm ? pwm->polarity : PWM_POLARITY_NORMAL; + struct pwm_state state; + + pwm_get_state(pwm, &state); + + return state.polarity; } static inline void pwm_get_args(const struct pwm_device *pwm, -- cgit v1.2.3 From 09a7e4a3d9fcb95ade2cb02167e85fbeb8315ce0 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 14 Apr 2016 21:17:39 +0200 Subject: pwm: Move the enabled/disabled info into pwm_state Prepare the transition to PWM atomic update by moving the enabled and disabled state into the pwm_state struct. This way we can easily update the whole PWM state by copying the new state in the ->state field. Signed-off-by: Boris Brezillon Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 13 +++++++++---- include/linux/pwm.h | 11 ++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index f3f91e716a42..c240b5437145 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -499,10 +499,10 @@ int pwm_enable(struct pwm_device *pwm) if (!pwm) return -EINVAL; - if (!test_and_set_bit(PWMF_ENABLED, &pwm->flags)) { + if (!pwm_is_enabled(pwm)) { err = pwm->chip->ops->enable(pwm->chip, pwm); - if (err) - clear_bit(PWMF_ENABLED, &pwm->flags); + if (!err) + pwm->state.enabled = true; } return err; @@ -515,8 +515,13 @@ EXPORT_SYMBOL_GPL(pwm_enable); */ void pwm_disable(struct pwm_device *pwm) { - if (pwm && test_and_clear_bit(PWMF_ENABLED, &pwm->flags)) + if (!pwm) + return; + + if (pwm_is_enabled(pwm)) { pwm->chip->ops->disable(pwm->chip, pwm); + pwm->state.enabled = false; + } } EXPORT_SYMBOL_GPL(pwm_disable); diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 51d4005418f9..150563a806d6 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -94,8 +94,7 @@ struct pwm_args { enum { PWMF_REQUESTED = 1 << 0, - PWMF_ENABLED = 1 << 1, - PWMF_EXPORTED = 1 << 2, + PWMF_EXPORTED = 1 << 1, }; /* @@ -103,11 +102,13 @@ enum { * @period: PWM period (in nanoseconds) * @duty_cycle: PWM duty cycle (in nanoseconds) * @polarity: PWM polarity + * @enabled: PWM enabled status */ struct pwm_state { unsigned int period; unsigned int duty_cycle; enum pwm_polarity polarity; + bool enabled; }; /** @@ -146,7 +147,11 @@ static inline void pwm_get_state(const struct pwm_device *pwm, static inline bool pwm_is_enabled(const struct pwm_device *pwm) { - return test_bit(PWMF_ENABLED, &pwm->flags); + struct pwm_state 
state; + + pwm_get_state(pwm, &state); + + return state.enabled; } static inline void pwm_set_period(struct pwm_device *pwm, unsigned int period) -- cgit v1.2.3 From 15fa8a43c147213a9563903c87b29671035eb6e8 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 14 Apr 2016 21:17:40 +0200 Subject: pwm: Add hardware readout infrastructure Add a ->get_state() function to the pwm_ops struct to let PWM drivers initialize the PWM state attached to a PWM device. Signed-off-by: Boris Brezillon Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 3 +++ include/linux/pwm.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index c240b5437145..a909c64ee863 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -270,6 +270,9 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip, pwm->hwpwm = i; pwm->state.polarity = polarity; + if (chip->ops->get_state) + chip->ops->get_state(chip, pwm, &pwm->state); + radix_tree_insert(&pwm_tree, pwm->pwm, pwm); } diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 150563a806d6..33f8decd9f38 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -206,6 +206,29 @@ static inline void pwm_get_args(const struct pwm_device *pwm, static inline void pwm_apply_args(struct pwm_device *pwm) { + /* + * PWM users calling pwm_apply_args() expect to have a fresh config + * where the polarity and period are set according to pwm_args info. + * The problem is, polarity can only be changed when the PWM is + * disabled. + * + * PWM drivers supporting hardware readout may declare the PWM device + * as enabled, and prevent polarity setting, which changes from the + * existing behavior, where all PWM devices are declared as disabled + * at startup (even if they are actually enabled), thus authorizing + * polarity setting. + * + * Instead of setting ->enabled to false, we call pwm_disable() + * before pwm_set_polarity() to ensure that everything is configured + * as expected, and the PWM is really disabled when the user request + * it. + * + * Note that PWM users requiring a smooth handover between the + * bootloader and the kernel (like critical regulators controlled by + * PWM devices) will have to switch to the atomic API and avoid calling + * pwm_apply_args(). + */ + pwm_disable(pwm); pwm_set_polarity(pwm, pwm->args.polarity); } @@ -217,6 +240,9 @@ static inline void pwm_apply_args(struct pwm_device *pwm) * @set_polarity: configure the polarity of this PWM * @enable: enable PWM output toggling * @disable: disable PWM output toggling + * @get_state: get the current PWM state. This function is only + * called once per PWM device when the PWM chip is + * registered. * @dbg_show: optional routine to show contents in debugfs * @owner: helps prevent removal of modules exporting active PWMs */ @@ -229,6 +255,8 @@ struct pwm_ops { enum pwm_polarity polarity); int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm); void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm); + void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state); #ifdef CONFIG_DEBUG_FS void (*dbg_show)(struct pwm_chip *chip, struct seq_file *s); #endif -- cgit v1.2.3 From 5ec803edcb703fe379836f13560b79dfac79b01d Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 14 Apr 2016 21:17:41 +0200 Subject: pwm: Add core infrastructure to allow atomic updates Add an ->apply() method to the pwm_ops struct to allow PWM drivers to implement atomic updates. 
This method is preferred over the ->enable(), ->disable() and ->config() methods if available. Add the pwm_apply_state() function to the PWM user API. Note that pwm_apply_state() does not guarantee the atomicity of the update operation; it all depends on the availability and implementation of the ->apply() method. pwm_enable/disable/set_polarity/config() are now implemented as wrappers around the pwm_apply_state() function. pwm_adjust_config() allows a smooth handover between the bootloader and the kernel. This function tries to adapt the current PWM state to the PWM arguments coming from a PWM lookup table or a DT definition without changing the duty_cycle/period proportion. Signed-off-by: Boris Brezillon [thierry.reding@gmail.com: fix a couple of typos] Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 187 +++++++++++++++++++++++------------- include/linux/pwm.h | 269 +++++++++++++++++++++++++++++++++++----------------- 2 files changed, 303 insertions(+), 153 deletions(-) (limited to 'include/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index a909c64ee863..729d457861fd 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -226,6 +226,19 @@ void *pwm_get_chip_data(struct pwm_device *pwm) } EXPORT_SYMBOL_GPL(pwm_get_chip_data); +static bool pwm_ops_check(const struct pwm_ops *ops) +{ + /* driver supports legacy, non-atomic operation */ + if (ops->config && ops->enable && ops->disable) + return true; + + /* driver supports atomic operation */ + if (ops->apply) + return true; + + return false; +} + /** * pwmchip_add_with_polarity() - register a new PWM chip * @chip: the PWM chip to add @@ -244,8 +257,10 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip, unsigned int i; int ret; - if (!chip || !chip->dev || !chip->ops || !chip->ops->config || - !chip->ops->enable || !chip->ops->disable || !chip->npwm) + if (!chip || !chip->dev || !chip->ops || !chip->npwm) + return -EINVAL; + + if (!pwm_ops_check(chip->ops)) return -EINVAL; mutex_lock(&pwm_lock); @@ -431,102 +446,138 @@ void pwm_free(struct pwm_device *pwm) EXPORT_SYMBOL_GPL(pwm_free); /** - * pwm_config() - change a PWM device configuration + * pwm_apply_state() - atomically apply a new state to a PWM device * @pwm: PWM device - * @duty_ns: "on" time (in nanoseconds) - * @period_ns: duration (in nanoseconds) of one cycle - * - * Returns: 0 on success or a negative error code on failure. + * @state: new state to apply. This can be adjusted by the PWM driver + * if the requested config is not achievable, for example, + * ->duty_cycle and ->period might be approximated. */ -int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns) +int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state) { int err; - if (!pwm || duty_ns < 0 || period_ns <= 0 || duty_ns > period_ns) + if (!pwm) return -EINVAL; - err = pwm->chip->ops->config(pwm->chip, pwm, duty_ns, period_ns); - if (err) - return err; - - pwm->state.duty_cycle = duty_ns; - pwm->state.period = period_ns; - - return 0; -} -EXPORT_SYMBOL_GPL(pwm_config); + if (!memcmp(state, &pwm->state, sizeof(*state))) + return 0; -/** - * pwm_set_polarity() - configure the polarity of a PWM signal - * @pwm: PWM device - * @polarity: new polarity of the PWM signal - * - * Note that the polarity cannot be configured while the PWM device is - * enabled. - * - * Returns: 0 on success or a negative error code on failure. 
- */ -int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity) -{ - int err; + if (pwm->chip->ops->apply) { + err = pwm->chip->ops->apply(pwm->chip, pwm, state); + if (err) + return err; - if (!pwm || !pwm->chip->ops) - return -EINVAL; + pwm->state = *state; + } else { + /* + * FIXME: restore the initial state in case of error. + */ + if (state->polarity != pwm->state.polarity) { + if (!pwm->chip->ops->set_polarity) + return -ENOTSUPP; + + /* + * Changing the polarity of a running PWM is + * only allowed when the PWM driver implements + * ->apply(). + */ + if (pwm->state.enabled) { + pwm->chip->ops->disable(pwm->chip, pwm); + pwm->state.enabled = false; + } + + err = pwm->chip->ops->set_polarity(pwm->chip, pwm, + state->polarity); + if (err) + return err; + + pwm->state.polarity = state->polarity; + } - if (!pwm->chip->ops->set_polarity) - return -ENOSYS; + if (state->period != pwm->state.period || + state->duty_cycle != pwm->state.duty_cycle) { + err = pwm->chip->ops->config(pwm->chip, pwm, + state->duty_cycle, + state->period); + if (err) + return err; - if (pwm_is_enabled(pwm)) - return -EBUSY; + pwm->state.duty_cycle = state->duty_cycle; + pwm->state.period = state->period; + } - err = pwm->chip->ops->set_polarity(pwm->chip, pwm, polarity); - if (err) - return err; + if (state->enabled != pwm->state.enabled) { + if (state->enabled) { + err = pwm->chip->ops->enable(pwm->chip, pwm); + if (err) + return err; + } else { + pwm->chip->ops->disable(pwm->chip, pwm); + } - pwm->state.polarity = polarity; + pwm->state.enabled = state->enabled; + } + } return 0; } -EXPORT_SYMBOL_GPL(pwm_set_polarity); +EXPORT_SYMBOL_GPL(pwm_apply_state); /** - * pwm_enable() - start a PWM output toggling + * pwm_adjust_config() - adjust the current PWM config to the PWM arguments * @pwm: PWM device * - * Returns: 0 on success or a negative error code on failure. + * This function will adjust the PWM config to the PWM arguments provided + * by the DT or PWM lookup table. This is particularly useful to adapt + * the bootloader config to the Linux one. */ -int pwm_enable(struct pwm_device *pwm) +int pwm_adjust_config(struct pwm_device *pwm) { - int err = 0; + struct pwm_state state; + struct pwm_args pargs; - if (!pwm) - return -EINVAL; + pwm_get_args(pwm, &pargs); + pwm_get_state(pwm, &state); + + /* + * If the current period is zero it means that either the PWM driver + * does not support initial state retrieval or the PWM has not yet + * been configured. + * + * In either case, we setup the new period and polarity, and assign a + * duty cycle of 0. + */ + if (!state.period) { + state.duty_cycle = 0; + state.period = pargs.period; + state.polarity = pargs.polarity; - if (!pwm_is_enabled(pwm)) { - err = pwm->chip->ops->enable(pwm->chip, pwm); - if (!err) - pwm->state.enabled = true; + return pwm_apply_state(pwm, &state); } - return err; -} -EXPORT_SYMBOL_GPL(pwm_enable); + /* + * Adjust the PWM duty cycle/period based on the period value provided + * in PWM args. + */ + if (pargs.period != state.period) { + u64 dutycycle = (u64)state.duty_cycle * pargs.period; -/** - * pwm_disable() - stop a PWM output toggling - * @pwm: PWM device - */ -void pwm_disable(struct pwm_device *pwm) -{ - if (!pwm) - return; + do_div(dutycycle, state.period); + state.duty_cycle = dutycycle; + state.period = pargs.period; + } - if (pwm_is_enabled(pwm)) { - pwm->chip->ops->disable(pwm->chip, pwm); - pwm->state.enabled = false; + /* + * If the polarity changed, we should also change the duty cycle. 
+ */ + if (pargs.polarity != state.polarity) { + state.polarity = pargs.polarity; + state.duty_cycle = state.period - state.duty_cycle; } + + return pwm_apply_state(pwm, &state); } -EXPORT_SYMBOL_GPL(pwm_disable); +EXPORT_SYMBOL_GPL(pwm_adjust_config); static struct pwm_chip *of_node_to_pwmchip(struct device_node *np) { diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 33f8decd9f38..17018f3c066e 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -5,59 +5,7 @@ #include #include -struct pwm_device; struct seq_file; - -#if IS_ENABLED(CONFIG_PWM) -/* - * pwm_request - request a PWM device - */ -struct pwm_device *pwm_request(int pwm_id, const char *label); - -/* - * pwm_free - free a PWM device - */ -void pwm_free(struct pwm_device *pwm); - -/* - * pwm_config - change a PWM device configuration - */ -int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns); - -/* - * pwm_enable - start a PWM output toggling - */ -int pwm_enable(struct pwm_device *pwm); - -/* - * pwm_disable - stop a PWM output toggling - */ -void pwm_disable(struct pwm_device *pwm); -#else -static inline struct pwm_device *pwm_request(int pwm_id, const char *label) -{ - return ERR_PTR(-ENODEV); -} - -static inline void pwm_free(struct pwm_device *pwm) -{ -} - -static inline int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns) -{ - return -EINVAL; -} - -static inline int pwm_enable(struct pwm_device *pwm) -{ - return -EINVAL; -} - -static inline void pwm_disable(struct pwm_device *pwm) -{ -} -#endif - struct pwm_chip; /** @@ -184,11 +132,6 @@ static inline unsigned int pwm_get_duty_cycle(const struct pwm_device *pwm) return state.duty_cycle; } -/* - * pwm_set_polarity - configure the polarity of a PWM signal - */ -int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity); - static inline enum pwm_polarity pwm_get_polarity(const struct pwm_device *pwm) { struct pwm_state state; @@ -204,34 +147,6 @@ static inline void pwm_get_args(const struct pwm_device *pwm, *args = pwm->args; } -static inline void pwm_apply_args(struct pwm_device *pwm) -{ - /* - * PWM users calling pwm_apply_args() expect to have a fresh config - * where the polarity and period are set according to pwm_args info. - * The problem is, polarity can only be changed when the PWM is - * disabled. - * - * PWM drivers supporting hardware readout may declare the PWM device - * as enabled, and prevent polarity setting, which changes from the - * existing behavior, where all PWM devices are declared as disabled - * at startup (even if they are actually enabled), thus authorizing - * polarity setting. - * - * Instead of setting ->enabled to false, we call pwm_disable() - * before pwm_set_polarity() to ensure that everything is configured - * as expected, and the PWM is really disabled when the user request - * it. - * - * Note that PWM users requiring a smooth handover between the - * bootloader and the kernel (like critical regulators controlled by - * PWM devices) will have to switch to the atomic API and avoid calling - * pwm_apply_args(). - */ - pwm_disable(pwm); - pwm_set_polarity(pwm, pwm->args.polarity); -} - /** * struct pwm_ops - PWM controller operations * @request: optional hook for requesting a PWM @@ -240,6 +155,10 @@ static inline void pwm_apply_args(struct pwm_device *pwm) * @set_polarity: configure the polarity of this PWM * @enable: enable PWM output toggling * @disable: disable PWM output toggling + * @apply: atomically apply a new PWM config. 
The state argument + * should be adjusted with the real hardware config (if the + * approximate the period or duty_cycle value, state should + * reflect it) * @get_state: get the current PWM state. This function is only * called once per PWM device when the PWM chip is * registered. @@ -255,6 +174,8 @@ struct pwm_ops { enum pwm_polarity polarity); int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm); void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm); + int (*apply)(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state); void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm, struct pwm_state *state); #ifdef CONFIG_DEBUG_FS @@ -292,6 +213,115 @@ struct pwm_chip { }; #if IS_ENABLED(CONFIG_PWM) +/* PWM user APIs */ +struct pwm_device *pwm_request(int pwm_id, const char *label); +void pwm_free(struct pwm_device *pwm); +int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state); +int pwm_adjust_config(struct pwm_device *pwm); + +/** + * pwm_config() - change a PWM device configuration + * @pwm: PWM device + * @duty_ns: "on" time (in nanoseconds) + * @period_ns: duration (in nanoseconds) of one cycle + * + * Returns: 0 on success or a negative error code on failure. + */ +static inline int pwm_config(struct pwm_device *pwm, int duty_ns, + int period_ns) +{ + struct pwm_state state; + + if (!pwm) + return -EINVAL; + + pwm_get_state(pwm, &state); + if (state.duty_cycle == duty_ns && state.period == period_ns) + return 0; + + state.duty_cycle = duty_ns; + state.period = period_ns; + return pwm_apply_state(pwm, &state); +} + +/** + * pwm_set_polarity() - configure the polarity of a PWM signal + * @pwm: PWM device + * @polarity: new polarity of the PWM signal + * + * Note that the polarity cannot be configured while the PWM device is + * enabled. + * + * Returns: 0 on success or a negative error code on failure. + */ +static inline int pwm_set_polarity(struct pwm_device *pwm, + enum pwm_polarity polarity) +{ + struct pwm_state state; + + if (!pwm) + return -EINVAL; + + pwm_get_state(pwm, &state); + if (state.polarity == polarity) + return 0; + + /* + * Changing the polarity of a running PWM without adjusting the + * dutycycle/period value is a bit risky (can introduce glitches). + * Return -EBUSY in this case. + * Note that this is allowed when using pwm_apply_state() because + * the user specifies all the parameters. + */ + if (state.enabled) + return -EBUSY; + + state.polarity = polarity; + return pwm_apply_state(pwm, &state); +} + +/** + * pwm_enable() - start a PWM output toggling + * @pwm: PWM device + * + * Returns: 0 on success or a negative error code on failure. 
+ */ +static inline int pwm_enable(struct pwm_device *pwm) +{ + struct pwm_state state; + + if (!pwm) + return -EINVAL; + + pwm_get_state(pwm, &state); + if (state.enabled) + return 0; + + state.enabled = true; + return pwm_apply_state(pwm, &state); +} + +/** + * pwm_disable() - stop a PWM output toggling + * @pwm: PWM device + */ +static inline void pwm_disable(struct pwm_device *pwm) +{ + struct pwm_state state; + + if (!pwm) + return; + + pwm_get_state(pwm, &state); + if (!state.enabled) + return; + + state.enabled = false; + pwm_apply_state(pwm, &state); +} + + +/* PWM provider APIs */ int pwm_set_chip_data(struct pwm_device *pwm, void *data); void *pwm_get_chip_data(struct pwm_device *pwm); @@ -317,6 +347,47 @@ void devm_pwm_put(struct device *dev, struct pwm_device *pwm); bool pwm_can_sleep(struct pwm_device *pwm); #else +static inline struct pwm_device *pwm_request(int pwm_id, const char *label) +{ + return ERR_PTR(-ENODEV); +} + +static inline void pwm_free(struct pwm_device *pwm) +{ +} + +static inline int pwm_apply_state(struct pwm_device *pwm, + const struct pwm_state *state) +{ + return -ENOTSUPP; +} + +static inline int pwm_adjust_config(struct pwm_device *pwm) +{ + return -ENOTSUPP; +} + +static inline int pwm_config(struct pwm_device *pwm, int duty_ns, + int period_ns) +{ + return -EINVAL; +} + +static inline int pwm_set_polarity(struct pwm_device *pwm, + enum pwm_polarity polarity) +{ + return -ENOTSUPP; +} + +static inline int pwm_enable(struct pwm_device *pwm) +{ + return -EINVAL; +} + +static inline void pwm_disable(struct pwm_device *pwm) +{ +} + static inline int pwm_set_chip_data(struct pwm_device *pwm, void *data) { return -EINVAL; @@ -388,6 +459,34 @@ static inline bool pwm_can_sleep(struct pwm_device *pwm) } #endif +static inline void pwm_apply_args(struct pwm_device *pwm) +{ + /* + * PWM users calling pwm_apply_args() expect to have a fresh config + * where the polarity and period are set according to pwm_args info. + * The problem is, polarity can only be changed when the PWM is + * disabled. + * + * PWM drivers supporting hardware readout may declare the PWM device + * as enabled, and prevent polarity setting, which changes from the + * existing behavior, where all PWM devices are declared as disabled + * at startup (even if they are actually enabled), thus authorizing + * polarity setting. + * + * Instead of setting ->enabled to false, we call pwm_disable() + * before pwm_set_polarity() to ensure that everything is configured + * as expected, and the PWM is really disabled when the user request + * it. + * + * Note that PWM users requiring a smooth handover between the + * bootloader and the kernel (like critical regulators controlled by + * PWM devices) will have to switch to the atomic API and avoid calling + * pwm_apply_args(). + */ + pwm_disable(pwm); + pwm_set_polarity(pwm, pwm->args.polarity); +} + struct pwm_lookup { struct list_head list; const char *provider; -- cgit v1.2.3 From c35095215a80b13af2f521e1ab1c727a5224e53b Mon Sep 17 00:00:00 2001 From: Wei Ni Date: Tue, 29 Mar 2016 18:29:17 +0800 Subject: thermal: of-thermal: allow setting trip_temp on hardware In the current of-thermal code, .set_trip_temp only supports setting trip_temp in software. But some sensors support setting trip_temp on the hardware, so that it can trigger an interrupt, a shutdown, or any other event. This patch adds a .set_trip_temp() callback to thermal_zone_of_device_ops so that the sensor device can use it to set trip_temp on the hardware. 
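A sensor driver can then implement the callback and pass it in its ops when registering the sensor. A minimal sketch follows, where my_sensor, my_sensor_read_mc() and my_sensor_write_trip() are hypothetical driver internals, not part of this patch:

#include <linux/thermal.h>

static int my_sensor_get_temp(void *data, int *temp)
{
	struct my_sensor *sensor = data;

	*temp = my_sensor_read_mc(sensor);	/* millicelsius */
	return 0;
}

static int my_sensor_set_trip_temp(void *data, int trip, int temp)
{
	struct my_sensor *sensor = data;

	/* program trip point @trip into the hardware comparator */
	return my_sensor_write_trip(sensor, trip, temp);
}

static const struct thermal_zone_of_device_ops my_sensor_ops = {
	.get_temp = my_sensor_get_temp,
	.set_trip_temp = my_sensor_set_trip_temp,
};

/* at probe time: thermal_zone_of_sensor_register(dev, 0, sensor, &my_sensor_ops); */

With this in place, of_thermal_set_trip_temp() forwards trip updates to the hardware before caching them in data->trips[], as the diff below shows.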
Signed-off-by: Wei Ni Signed-off-by: Eduardo Valentin --- drivers/thermal/of-thermal.c | 8 ++++++++ include/linux/thermal.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c index 82dd82afa555..b8e509c60848 100644 --- a/drivers/thermal/of-thermal.c +++ b/drivers/thermal/of-thermal.c @@ -331,6 +331,14 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip, if (trip >= data->ntrips || trip < 0) return -EDOM; + if (data->ops->set_trip_temp) { + int ret; + + ret = data->ops->set_trip_temp(data->sensor_data, trip, temp); + if (ret) + return ret; + } + /* thermal framework should take care of data->mask & (1 << trip) */ data->trips[trip].temperature = temp; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 1b8a5a7876ce..e45abe7db9a6 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -340,6 +340,7 @@ struct thermal_zone_of_device_ops { int (*get_temp)(void *, int *); int (*get_trend)(void *, long *); int (*set_emul_temp)(void *, int); + int (*set_trip_temp)(void *, int, int); }; /** -- cgit v1.2.3 From 7ff2760999a86e4d2b1af93dcf0f0d336c309571 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 16 May 2016 15:35:24 +0300 Subject: mmc: core: Add a facility to "pause" re-tuning Re-tuning is not possible when switched to the RPMB partition. However re-tuning should not be needed if re-tuning is done immediately before switching, a small set of operations is done, and then we immediately switch back to the main partition. To ensure that re-tuning can't be done for a short while, add a facility to "pause" re-tuning. The existing facility to hold / release re-tuning is used but it also flags re-tuning as needed to cause re-tuning before the next command (which will be the switch to RPMB). We also need to "unpause" in the recovery path, which is catered for by adding it to mmc_retune_disable(). Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/core/host.c | 24 ++++++++++++++++++++++++ include/linux/mmc/host.h | 4 ++++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index e0a3ee16c0d3..1be42fab1a30 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -68,8 +68,32 @@ void mmc_retune_enable(struct mmc_host *host) jiffies + host->retune_period * HZ); } +/* + * Pause re-tuning for a small set of operations. The pause begins after the + * next command and after first doing re-tuning. 
+ */ +void mmc_retune_pause(struct mmc_host *host) +{ + if (!host->retune_paused) { + host->retune_paused = 1; + mmc_retune_needed(host); + mmc_retune_hold(host); + } +} +EXPORT_SYMBOL(mmc_retune_pause); + +void mmc_retune_unpause(struct mmc_host *host) +{ + if (host->retune_paused) { + host->retune_paused = 0; + mmc_retune_release(host); + } +} +EXPORT_SYMBOL(mmc_retune_unpause); + void mmc_retune_disable(struct mmc_host *host) { + mmc_retune_unpause(host); host->can_retune = 0; del_timer_sync(&host->retune_timer); host->retune_now = 0; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 85800b48241f..45cde8cd39f2 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -329,6 +329,7 @@ struct mmc_host { unsigned int can_retune:1; /* re-tuning can be used */ unsigned int doing_retune:1; /* re-tuning in progress */ unsigned int retune_now:1; /* do re-tuning at next req */ + unsigned int retune_paused:1; /* re-tuning is temporarily disabled */ int rescan_disable; /* disable card detection */ int rescan_entered; /* used with nonremovable devices */ @@ -526,4 +527,7 @@ static inline void mmc_retune_recheck(struct mmc_host *host) host->retune_now = 1; } +void mmc_retune_pause(struct mmc_host *host); +void mmc_retune_unpause(struct mmc_host *host); + #endif /* LINUX_MMC_HOST_H */ -- cgit v1.2.3 From 2e72448b07dc3ff1b7593e9bfff91db182262857 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 21 May 2013 16:53:03 -0400 Subject: NFS: Add COPY nfs operation This adds the copy_range file_ops function pointer used by the sys_copy_range() function call. This patch only implements sync copies, so if an async copy happens we decode the stateid and ignore it. Signed-off-by: Anna Schumaker --- fs/nfs/nfs42.h | 1 + fs/nfs/nfs42proc.c | 105 +++++++++++++++++++++++++++++++++ fs/nfs/nfs42xdr.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4file.c | 23 ++++++++ fs/nfs/nfs4proc.c | 1 + fs/nfs/nfs4xdr.c | 1 + include/linux/nfs4.h | 1 + include/linux/nfs_fs_sb.h | 1 + include/linux/nfs_xdr.h | 26 +++++++++ 9 files changed, 305 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index b587ccd31083..b6cd15314bab 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -13,6 +13,7 @@ /* nfs4.2proc.c */ int nfs42_proc_allocate(struct file *, loff_t, loff_t); +ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index dff83460e5a6..579ee20e4120 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -126,6 +126,111 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return err; } +static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, + struct nfs_lock_context *src_lock, + struct file *dst, loff_t pos_dst, + struct nfs_lock_context *dst_lock, + size_t count) +{ + struct nfs42_copy_args args = { + .src_fh = NFS_FH(file_inode(src)), + .src_pos = pos_src, + .dst_fh = NFS_FH(file_inode(dst)), + .dst_pos = pos_dst, + .count = count, + }; + struct nfs42_copy_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct inode *dst_inode = file_inode(dst); + struct nfs_server *server = NFS_SERVER(dst_inode); + int status; + + status = nfs4_set_rw_stateid(&args.src_stateid, 
src_lock->open_context, + src_lock, FMODE_READ); + if (status) + return status; + + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + dst_lock, FMODE_WRITE); + if (status) + return status; + + status = nfs4_call_sync(server->client, server, &msg, + &args.seq_args, &res.seq_res, 0); + if (status == -ENOTSUPP) + server->caps &= ~NFS_CAP_COPY; + if (status) + return status; + + if (res.write_res.verifier.committed != NFS_FILE_SYNC) { + status = nfs_commit_file(dst, &res.write_res.verifier.verifier); + if (status) + return status; + } + + truncate_pagecache_range(dst_inode, pos_dst, + pos_dst + res.write_res.count); + + return res.write_res.count; +} + +ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, + struct file *dst, loff_t pos_dst, + size_t count) +{ + struct nfs_server *server = NFS_SERVER(file_inode(dst)); + struct nfs_lock_context *src_lock; + struct nfs_lock_context *dst_lock; + struct nfs4_exception src_exception = { }; + struct nfs4_exception dst_exception = { }; + ssize_t err, err2; + + if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) + return -EOPNOTSUPP; + + src_lock = nfs_get_lock_context(nfs_file_open_context(src)); + if (IS_ERR(src_lock)) + return PTR_ERR(src_lock); + + src_exception.inode = file_inode(src); + src_exception.state = src_lock->open_context->state; + + dst_lock = nfs_get_lock_context(nfs_file_open_context(dst)); + if (IS_ERR(dst_lock)) { + err = PTR_ERR(dst_lock); + goto out_put_src_lock; + } + + dst_exception.inode = file_inode(dst); + dst_exception.state = dst_lock->open_context->state; + + do { + mutex_lock(&file_inode(dst)->i_mutex); + err = _nfs42_proc_copy(src, pos_src, src_lock, + dst, pos_dst, dst_lock, count); + mutex_unlock(&file_inode(dst)->i_mutex); + + if (err == -ENOTSUPP) { + err = -EOPNOTSUPP; + break; + } + + err2 = nfs4_handle_exception(server, err, &src_exception); + err = nfs4_handle_exception(server, err, &dst_exception); + if (!err) + err = err2; + } while (src_exception.retry || dst_exception.retry); + + nfs_put_lock_context(dst_lock); +out_put_src_lock: + nfs_put_lock_context(src_lock); + return err; +} + static loff_t _nfs42_proc_llseek(struct file *filep, struct nfs_lock_context *lock, loff_t offset, int whence) { diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 0ca482a51e53..6dc6f2aea0d6 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -9,9 +9,22 @@ #define encode_fallocate_maxsz (encode_stateid_maxsz + \ 2 /* offset */ + \ 2 /* length */) +#define NFS42_WRITE_RES_SIZE (1 /* wr_callback_id size */ +\ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 2 /* wr_count */ + \ + 1 /* wr_committed */ + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) #define encode_allocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_allocate_maxsz (op_decode_hdr_maxsz) +#define encode_copy_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 2 + 2 + 2 + 1 + 1 + 1) +#define decode_copy_maxsz (op_decode_hdr_maxsz + \ + NFS42_WRITE_RES_SIZE + \ + 1 /* cr_consecutive */ + \ + 1 /* cr_synchronous */) #define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_deallocate_maxsz (op_decode_hdr_maxsz) @@ -49,6 +62,16 @@ decode_putfh_maxsz + \ decode_allocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_copy_maxsz) +#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ 
+ decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_copy_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ @@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr, encode_fallocate(xdr, args); } +static void encode_copy(struct xdr_stream *xdr, + struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->src_stateid); + encode_nfs4_stateid(xdr, &args->dst_stateid); + + encode_uint64(xdr, args->src_pos); + encode_uint64(xdr, args->dst_pos); + encode_uint64(xdr, args->count); + + encode_uint32(xdr, 1); /* consecutive = true */ + encode_uint32(xdr, 1); /* synchronous = true */ + encode_uint32(xdr, 0); /* src server list */ +} + static void encode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_args *args, struct compound_hdr *hdr) @@ -181,6 +221,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode COPY request + */ +static void nfs4_xdr_enc_copy(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_copy_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->src_fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dst_fh, &hdr); + encode_copy(xdr, args, &hdr); + encode_nops(&hdr); +} + /* * Encode DEALLOCATE request */ @@ -266,6 +326,62 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) return decode_op_hdr(xdr, OP_ALLOCATE); } +static int decode_write_response(struct xdr_stream *xdr, + struct nfs42_write_res *res) +{ + __be32 *p; + int stateids; + + p = xdr_inline_decode(xdr, 4 + 8 + 4); + if (unlikely(!p)) + goto out_overflow; + + stateids = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &res->count); + res->verifier.committed = be32_to_cpup(p); + return decode_verifier(xdr, &res->verifier.verifier); + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_copy_requirements(struct xdr_stream *xdr, + struct nfs42_copy_res *res) { + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(!p)) + goto out_overflow; + + res->consecutive = be32_to_cpup(p++); + res->synchronous = be32_to_cpup(p++); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_COPY); + if (status == NFS4ERR_OFFLOAD_NO_REQS) { + status = decode_copy_requirements(xdr, res); + if (status) + return status; + return NFS4ERR_OFFLOAD_NO_REQS; + } else if (status) + return status; + + status = decode_write_response(xdr, &res->write_res); + if (status) + return status; + + return decode_copy_requirements(xdr, res); +} + static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_DEALLOCATE); @@ -330,6 +446,36 @@ out: return status; } +/* + * Decode COPY response + */ +static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_copy_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = 
decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_copy(xdr, res); +out: + return status; +} + /* * Decode DEALLOCATE request */ diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index d0390516467c..014b0e41ace5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -129,6 +129,28 @@ nfs4_file_flush(struct file *file, fl_owner_t id) } #ifdef CONFIG_NFS_V4_2 +static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t count, unsigned int flags) +{ + struct inode *in_inode = file_inode(file_in); + struct inode *out_inode = file_inode(file_out); + int ret; + + if (in_inode == out_inode) + return -EINVAL; + + /* flush any pending writes */ + ret = nfs_sync_inode(in_inode); + if (ret) + return ret; + ret = nfs_sync_inode(out_inode); + if (ret) + return ret; + + return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); +} + static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) { loff_t ret; @@ -243,6 +265,7 @@ const struct file_operations nfs4_file_operations = { .check_flags = nfs_check_flags, .setlease = simple_nosetlease, #ifdef CONFIG_NFS_V4_2 + .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, .fallocate = nfs42_fallocate, .clone_file_range = nfs42_clone_file_range, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bc2676c95e1b..4e83385ea6a9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8797,6 +8797,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 | NFS_CAP_ALLOCATE + | NFS_CAP_COPY | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 88474a4fc669..d1c96fc62c51 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7515,6 +7515,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(DEALLOCATE, enc_deallocate, dec_deallocate), PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), PROC(CLONE, enc_clone, dec_clone), + PROC(COPY, enc_copy, dec_copy), #endif /* CONFIG_NFS_V4_2 */ }; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 011433478a14..722509482e1a 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -504,6 +504,7 @@ enum { NFSPROC4_CLNT_DEALLOCATE, NFSPROC4_CLNT_LAYOUTSTATS, NFSPROC4_CLNT_CLONE, + NFSPROC4_CLNT_COPY, }; /* nfs41 types */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 7fcc13c8cf1f..14a762d2734d 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -246,5 +246,6 @@ struct nfs_server { #define NFS_CAP_DEALLOCATE (1U << 21) #define NFS_CAP_LAYOUTSTATS (1U << 22) #define NFS_CAP_CLONE (1U << 23) +#define NFS_CAP_COPY (1U << 24) #endif diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index cb9982d8f38f..e70ed54dad94 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1343,6 +1343,32 @@ struct nfs42_falloc_res { const struct nfs_server *falloc_server; }; +struct nfs42_copy_args { + struct nfs4_sequence_args seq_args; + + struct nfs_fh *src_fh; + nfs4_stateid src_stateid; + u64 src_pos; + + struct nfs_fh *dst_fh; + nfs4_stateid dst_stateid; + u64 dst_pos; + + u64 count; +}; + +struct nfs42_write_res { + u64 count; + struct nfs_writeverf verifier; +}; + +struct nfs42_copy_res { + struct nfs4_sequence_res seq_res; + struct nfs42_write_res write_res; + bool consecutive; + bool synchronous; +}; + struct nfs42_seek_args { struct nfs4_sequence_args 
seq_args; -- cgit v1.2.3 From 4b9c7f9db9a003f5c342184dc4401c1b7f2efb39 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:40:31 -0400 Subject: sunrpc: Update RPCBIND_MAXNETIDLEN Commit 176e21ee2ec8 ("SUNRPC: Support for RPC over AF_LOCAL transports") added a 5-character netid, but did not bump RPCBIND_MAXNETIDLEN from 4 to 5. Fixes: 176e21ee2ec8 ("SUNRPC: Support for RPC over AF_LOCAL ...") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/msg_prot.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index 807371357160..59cbf16eaeb5 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -158,9 +158,9 @@ typedef __be32 rpc_fraghdr; /* * Note that RFC 1833 does not put any size restrictions on the - * netid string, but all currently defined netid's fit in 4 bytes. + * netid string, but all currently defined netid's fit in 5 bytes. */ -#define RPCBIND_MAXNETIDLEN (4u) +#define RPCBIND_MAXNETIDLEN (5u) /* * Universal addresses are introduced in RFC 1833 and further spelled -- cgit v1.2.3 From 6b26cc8c8ead3636a18bfd9489984983f4ddd6f4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:40:40 -0400 Subject: sunrpc: Advertise maximum backchannel payload size RPC-over-RDMA transports have a limit on how large a backward direction (backchannel) RPC message can be. Ensure that the NFSv4.x CREATE_SESSION operation advertises this limit to servers. Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 10 ++++++---- include/linux/sunrpc/clnt.h | 1 + include/linux/sunrpc/xprt.h | 1 + net/sunrpc/clnt.c | 17 +++++++++++++++++ net/sunrpc/xprtrdma/backchannel.c | 16 ++++++++++++++++ net/sunrpc/xprtrdma/transport.c | 1 + net/sunrpc/xprtrdma/xprt_rdma.h | 1 + net/sunrpc/xprtsock.c | 6 ++++++ 8 files changed, 49 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9795725d708c..196e41e12621 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7371,9 +7371,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) * always set csa_cachethis to FALSE because the current implementation * of the back channel DRC only supports caching the CB_SEQUENCE operation. 
*/ -static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) +static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, + struct rpc_clnt *clnt) { unsigned int max_rqst_sz, max_resp_sz; + unsigned int max_bc_payload = rpc_max_bc_payload(clnt); max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; @@ -7391,8 +7393,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.max_ops, args->fc_attrs.max_reqs); /* Back channel attributes */ - args->bc_attrs.max_rqst_sz = PAGE_SIZE; - args->bc_attrs.max_resp_sz = PAGE_SIZE; + args->bc_attrs.max_rqst_sz = max_bc_payload; + args->bc_attrs.max_resp_sz = max_bc_payload; args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS; @@ -7496,7 +7498,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, }; int status; - nfs4_init_channel_attrs(&args); + nfs4_init_channel_attrs(&args, clp->cl_rpcclient); args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 9a7ddbaf116e..19c659d1c0f8 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -176,6 +176,7 @@ void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); int rpc_protocol(struct rpc_clnt *); struct net * rpc_net_ns(struct rpc_clnt *); size_t rpc_max_payload(struct rpc_clnt *); +size_t rpc_max_bc_payload(struct rpc_clnt *); unsigned long rpc_get_timeout(struct rpc_clnt *clnt); void rpc_force_rebind(struct rpc_clnt *); size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index fb0d212e0d3a..5aa3834619a8 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -142,6 +142,7 @@ struct rpc_xprt_ops { int (*bc_setup)(struct rpc_xprt *xprt, unsigned int min_reqs); int (*bc_up)(struct svc_serv *serv, struct net *net); + size_t (*bc_maxpayload)(struct rpc_xprt *xprt); void (*bc_free_rqst)(struct rpc_rqst *rqst); void (*bc_destroy)(struct rpc_xprt *xprt, unsigned int max_reqs); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7e0c9bf22df8..06b4df9faaa1 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1413,6 +1413,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_max_payload); +/** + * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes + * @clnt: RPC client to query + */ +size_t rpc_max_bc_payload(struct rpc_clnt *clnt) +{ + struct rpc_xprt *xprt; + size_t ret; + + rcu_read_lock(); + xprt = rcu_dereference(clnt->cl_xprt); + ret = xprt->ops->bc_maxpayload(xprt); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_max_bc_payload); + /** * rpc_get_timeout - Get timeout for transport in units of HZ * @clnt: RPC client to query diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 2dcd7640eeb5..87762d976b63 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -191,6 +191,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net) return 0; } +/** + * xprt_rdma_bc_maxpayload - Return maximum backchannel message size + * @xprt: transport + * + * Returns maximum size, in bytes, of a backchannel message + */ +size_t xprt_rdma_bc_maxpayload(struct rpc_xprt 
*xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + size_t maxmsg; + + maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize); + return maxmsg - RPCRDMA_HDRLEN_MIN; +} + /** * rpcrdma_bc_marshal_reply - Send backwards direction reply * @rqst: buffer containing RPC reply data diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index b1b009f10ea3..9954342924df 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -707,6 +707,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = { #if defined(CONFIG_SUNRPC_BACKCHANNEL) .bc_setup = xprt_rdma_bc_setup, .bc_up = xprt_rdma_bc_up, + .bc_maxpayload = xprt_rdma_bc_maxpayload, .bc_free_rqst = xprt_rdma_bc_free_rqst, .bc_destroy = xprt_rdma_bc_destroy, #endif diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2ebc743cb96f..7723e5faff4d 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -534,6 +534,7 @@ void xprt_rdma_cleanup(void); #if defined(CONFIG_SUNRPC_BACKCHANNEL) int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); int xprt_rdma_bc_up(struct svc_serv *, struct net *); +size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); int rpcrdma_bc_marshal_reply(struct rpc_rqst *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 65e759569e48..f1faf6b9aaff 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1365,6 +1365,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) return ret; return 0; } + +static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) +{ + return PAGE_SIZE; +} #else static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) @@ -2660,6 +2665,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { #ifdef CONFIG_SUNRPC_BACKCHANNEL .bc_setup = xprt_setup_bc, .bc_up = xs_tcp_bc_up, + .bc_maxpayload = xs_tcp_bc_maxpayload, .bc_free_rqst = xprt_free_bc_rqst, .bc_destroy = xprt_destroy_bc, #endif -- cgit v1.2.3 From 29c554227aeec48cde5c22f911e51763f096e125 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:40:48 -0400 Subject: xprtrdma: Bound the inline threshold values Currently the sysctls that allow setting the inline threshold allow any value to be set. Small values only make the transport run slower. The default 1KB setting is as low as is reasonable. And the logic that decides how to divide a Send buffer between RPC-over-RDMA header and RPC message assumes (but does not check) that the lower bound is not crazy (say, 57 bytes). Send and receive buffers share a page with some control information. Values larger than about 3KB can't be supported, currently. 
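As a point of reference, the extra1/extra2 bounds in a struct ctl_table are enforced only by the proc_dointvec_minmax() family of handlers; plain proc_dointvec() ignores them. A fully clamped tunable therefore looks roughly like the following sketch, which reuses the names from this patch but should be read as an assumption about the intended behavior rather than a quote of the diff:

#include <linux/sysctl.h>

static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;

static struct ctl_table xr_tunables_sketch[] = {
	{
		.procname	= "rdma_max_inline_read",
		.data		= &xprt_rdma_max_inline_read,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		/* only the _minmax handler rejects writes outside
		 * the [extra1, extra2] range */
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_inline_size,
		.extra2		= &max_inline_size,
	},
	{ },
};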
Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtrdma.h | 4 +++- net/sunrpc/xprtrdma/transport.c | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 767190b01363..39267dc3486a 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -52,7 +52,9 @@ #define RPCRDMA_DEF_SLOT_TABLE (128U) #define RPCRDMA_MAX_SLOT_TABLE (256U) -#define RPCRDMA_DEF_INLINE (1024) /* default inline max */ +#define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */ +#define RPCRDMA_DEF_INLINE (1024) /* default inline thresh */ +#define RPCRDMA_MAX_INLINE (3068) /* max inline thresh */ /* Memory registration strategies, by number. * This is part of a kernel / user space API. Do not remove. */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 9954342924df..16595ff91994 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; +static unsigned int min_inline_size = RPCRDMA_MIN_INLINE; +static unsigned int max_inline_size = RPCRDMA_MAX_INLINE; static unsigned int zero; static unsigned int max_padding = PAGE_SIZE; static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; @@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, + .extra1 = &min_inline_size, + .extra2 = &max_inline_size, }, { .procname = "rdma_max_inline_write", @@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, + .extra1 = &min_inline_size, + .extra2 = &max_inline_size, }, { .procname = "rdma_inline_write_padding", -- cgit v1.2.3 From 9a8f6b5ea275ff01fc8ef3b8630a3d4ed6b0a362 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 16 May 2016 17:42:42 -0400 Subject: SUNRPC: Ensure get_rpccred() and put_rpccred() can take NULL arguments Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 3 ++- net/sunrpc/auth.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 6f36b2bf3e05..899791573a40 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -202,7 +202,8 @@ char * rpcauth_stringify_acceptor(struct rpc_cred *); static inline struct rpc_cred * get_rpccred(struct rpc_cred *cred) { - atomic_inc(&cred->cr_count); + if (cred != NULL) + atomic_inc(&cred->cr_count); return cred; } diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index e0bb30fd2ed3..040ff627c18a 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -703,8 +703,7 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) new = rpcauth_bind_new_cred(task, lookupflags); if (IS_ERR(new)) return PTR_ERR(new); - if (req->rq_cred != NULL) - put_rpccred(req->rq_cred); + put_rpccred(req->rq_cred); req->rq_cred = new; return 0; } @@ -712,6 +711,8 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) void put_rpccred(struct rpc_cred *cred) { + if (cred == NULL) + return; /* Fast path for unhashed credentials */ if 
(test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) { if (atomic_dec_and_test(&cred->cr_count)) -- cgit v1.2.3 From 93b717fd81bf6b9a73c3702e9b079b4de8148b34 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 16 May 2016 17:42:43 -0400 Subject: NFSv4: Label stateids with the type In order to more easily distinguish what kind of stateid we are dealing with, introduce a type that can be used to label the stateid structure. The label will be useful both for debugging, but also when dealing with operations like SETATTR, READ and WRITE that can take several different types of stateid as arguments. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/callback_xdr.c | 17 ++++++++++--- fs/nfs/flexfilelayout/flexfilelayout.c | 7 +++--- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 3 ++- fs/nfs/nfs4_fs.h | 7 ++++-- fs/nfs/nfs4proc.c | 3 +++ fs/nfs/nfs4state.c | 5 +++- fs/nfs/nfs4xdr.c | 42 ++++++++++++++++++++++++------- include/linux/nfs4.h | 25 ++++++++++++++---- 8 files changed, 85 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 976c90608e56..d81f96aacd51 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -146,10 +146,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) p = read_buf(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - memcpy(stateid, p, NFS4_STATEID_SIZE); + memcpy(stateid->data, p, NFS4_STATEID_SIZE); return 0; } +static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_DELEGATION_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) { __be32 *p; @@ -211,7 +217,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, __be32 *p; __be32 status; - status = decode_stateid(xdr, &args->stateid); + status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) goto out; p = read_buf(xdr, 4); @@ -227,6 +233,11 @@ out: } #if defined(CONFIG_NFS_V4_1) +static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LAYOUT_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, @@ -263,7 +274,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, } p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); - status = decode_stateid(xdr, &args->cbl_stateid); + status = decode_layout_stateid(xdr, &args->cbl_stateid); if (unlikely(status != 0)) goto out; } else if (args->cbl_recall_type == RETURN_FSID) { diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 60d690dbc947..53b6391e0eba 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -55,14 +55,15 @@ ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) kfree(FF_LAYOUT_FROM_HDR(lo)); } -static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { __be32 *p; p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return -ENOBUFS; - memcpy(stateid, p, NFS4_STATEID_SIZE); + stateid->type = NFS4_PNFS_DS_STATEID_TYPE; + memcpy(stateid->data, p, NFS4_STATEID_SIZE); dprintk("%s: stateid id= 
[%x%x%x%x]\n", __func__, p[0], p[1], p[2], p[3]); return 0; @@ -465,7 +466,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->efficiency = be32_to_cpup(p); /* stateid */ - rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid); + rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid); if (rc) goto out_err_free; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 56296f3df19c..eeef89359ad2 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -228,7 +228,8 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, return e1->opnum < e2->opnum ? -1 : 1; if (e1->status != e2->status) return e1->status < e2->status ? -1 : 1; - ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid)); + ret = memcmp(e1->stateid.data, e2->stateid.data, + sizeof(e1->stateid.data)); if (ret != 0) return ret; ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4afdee420d25..b5d9f345c9f2 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -496,12 +496,15 @@ extern struct svc_version nfs4_callback_version4; static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src) { - memcpy(dst, src, sizeof(*dst)); + memcpy(dst->data, src->data, sizeof(dst->data)); + dst->type = src->type; } static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src) { - return memcmp(dst, src, sizeof(*dst)) == 0; + if (dst->type != src->type) + return false; + return memcmp(dst->data, src->data, sizeof(dst->data)) == 0; } static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 196e41e12621..2516467ff17f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8675,6 +8675,9 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) static bool nfs41_match_stateid(const nfs4_stateid *s1, const nfs4_stateid *s2) { + if (s1->type != s2->type) + return false; + if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0) return false; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index d854693a15b0..d630f9cca0f1 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -65,7 +65,10 @@ #define OPENOWNER_POOL_SIZE 8 -const nfs4_stateid zero_stateid; +const nfs4_stateid zero_stateid = { + .data = { 0 }, + .type = NFS4_SPECIAL_STATEID_TYPE, +}; static DEFINE_MUTEX(nfs_clid_init_mutex); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d1c96fc62c51..661e753fe1c9 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4270,6 +4270,24 @@ static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); } +static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_OPEN_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + +static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LOCK_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + +static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_DELEGATION_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) { int status; @@ -4278,7 +4296,7 @@ 
static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -4937,7 +4955,7 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) if (status == -EIO) goto out; if (status == 0) { - status = decode_stateid(xdr, &res->stateid); + status = decode_lock_stateid(xdr, &res->stateid); if (unlikely(status)) goto out; } else if (status == -NFS4ERR_DENIED) @@ -4966,7 +4984,7 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) if (status != -EIO) nfs_increment_lock_seqid(status, res->seqid); if (status == 0) - status = decode_stateid(xdr, &res->stateid); + status = decode_lock_stateid(xdr, &res->stateid); return status; } @@ -5016,7 +5034,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, __be32 *p; int status; - status = decode_stateid(xdr, &res->delegation); + status = decode_delegation_stateid(xdr, &res->delegation); if (unlikely(status)) return status; p = xdr_inline_decode(xdr, 4); @@ -5096,7 +5114,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) nfs_increment_open_seqid(status, res->seqid); if (status) return status; - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); if (unlikely(status)) return status; @@ -5136,7 +5154,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -5148,7 +5166,7 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -5838,6 +5856,12 @@ out_overflow: } #if defined(CONFIG_NFS_V4_1) +static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LAYOUT_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static int decode_getdeviceinfo(struct xdr_stream *xdr, struct nfs4_getdeviceinfo_res *res) { @@ -5919,7 +5943,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, if (unlikely(!p)) goto out_overflow; res->return_on_close = be32_to_cpup(p); - decode_stateid(xdr, &res->stateid); + decode_layout_stateid(xdr, &res->stateid); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; @@ -5985,7 +6009,7 @@ static int decode_layoutreturn(struct xdr_stream *xdr, goto out_overflow; res->lrs_present = be32_to_cpup(p); if (res->lrs_present) - status = decode_stateid(xdr, &res->stateid); + status = decode_layout_stateid(xdr, &res->stateid); return status; out_overflow: print_overflow_msg(__func__, xdr); diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 722509482e1a..e1692c96cbc8 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -50,12 +50,27 @@ struct nfs4_label { typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; -struct nfs_stateid4 { - __be32 seqid; - char other[NFS4_STATEID_OTHER_SIZE]; -} __attribute__ ((packed)); +struct nfs4_stateid_struct { + union { + char data[NFS4_STATEID_SIZE]; + struct { + __be32 seqid; + char other[NFS4_STATEID_OTHER_SIZE]; + } __attribute__ 
((packed)); + }; + + enum { + NFS4_INVALID_STATEID_TYPE = 0, + NFS4_SPECIAL_STATEID_TYPE, + NFS4_OPEN_STATEID_TYPE, + NFS4_LOCK_STATEID_TYPE, + NFS4_DELEGATION_STATEID_TYPE, + NFS4_LAYOUT_STATEID_TYPE, + NFS4_PNFS_DS_STATEID_TYPE, + } type; +}; -typedef struct nfs_stateid4 nfs4_stateid; +typedef struct nfs4_stateid_struct nfs4_stateid; enum nfs_opnum4 { OP_ACCESS = 3, -- cgit v1.2.3 From 183d9e7b112aaed0d19c16ffcf0f8c3a86dc71e0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 17 May 2016 12:28:47 -0400 Subject: pnfs: rework LAYOUTGET retry handling There are several problems in the way a stateid is selected for a LAYOUTGET operation: We pick a stateid to use in the RPC prepare op, but that makes it difficult to serialize LAYOUTGETs that use the open stateid. That serialization is done in pnfs_update_layout, which occurs well before the rpc_prepare operation. Between those two events, the i_lock is dropped and reacquired. pnfs_update_layout can find that the list has lsegs in it and not do any serialization, but then later pnfs_choose_layoutget_stateid ends up choosing the open stateid. This patch changes the client to select the stateid to use in the LAYOUTGET earlier, when we're searching for a usable layout segment. This way we can do it all while holding the i_lock the first time, and ensure that we serialize any LAYOUTGET call that uses a non-layout stateid. This also means a rework of how LAYOUTGET replies are handled, as we must now get the latest stateid if we want to retransmit in response to a retryable error. Most of those errors boil down to the fact that the layout state has changed in some fashion. Thus, what we really want to do is to re-search for a layout when it fails with a retryable error, so that we can avoid reissuing the RPC at all if possible. While the LAYOUTGET RPC is async, the initiating thread always waits for it to complete, so it's effectively synchronous anyway. Currently, when we need to retry a LAYOUTGET because of an error, we drive that retry via the rpc state machine. This means that once the call has been submitted, it runs until it completes. So, we must move the error handling for this RPC out of the rpc_call_done operation and into the caller. In order to handle errors like NFS4ERR_DELAY properly, we must also pass a pointer to the sliding timeout, which is now moved to the stack in pnfs_update_layout. The complicating errors are -NFS4ERR_RECALLCONFLICT and -NFS4ERR_LAYOUTTRYLATER, as those involve a timeout after which we give up and return NULL back to the caller. So, there is some special handling for those errors to ensure that the layers driving the retries can handle that appropriately. 
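With the retries driven by the caller rather than the RPC state machine, the loop in pnfs_update_layout() reduces to roughly this shape (a simplified sketch of the pattern introduced below, with locking, serialization and tracepoints elided):

	long timeout = 0;
	unsigned long giveup = jiffies + rpc_get_timeout(server->client);
	struct pnfs_layout_segment *lseg;

lookup_again:
	/* find or allocate the layout header and pick a stateid ... */
	lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
	if (IS_ERR(lseg)) {
		switch (PTR_ERR(lseg)) {
		case -ERECALLCONFLICT:
			if (time_after(jiffies, giveup))
				lseg = NULL;		/* retry window exhausted */
			/* Fallthrough */
		case -EAGAIN:
			if (lseg)
				goto lookup_again;	/* redo the lseg search */
			/* Fallthrough */
		default:
			if (!nfs_error_is_fatal(PTR_ERR(lseg)))
				lseg = NULL;	/* non-fatal: fall back to MDS I/O */
		}
	}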
Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 115 ++++++++++++++++---------------------- fs/nfs/nfs4trace.h | 10 +++- fs/nfs/pnfs.c | 144 +++++++++++++++++++++++++----------------------- fs/nfs/pnfs.h | 6 +- include/linux/errno.h | 1 + include/linux/nfs4.h | 2 + include/linux/nfs_xdr.h | 2 - 7 files changed, 136 insertions(+), 144 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2a29f5d12aeb..62d67f040906 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -427,6 +427,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: + case -NFS4ERR_RECALLCONFLICT: exception->delay = 1; return 0; @@ -7847,40 +7848,34 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); struct nfs4_session *session = nfs4_get_session(server); - int ret; dprintk("--> %s\n", __func__); - /* Note the is a race here, where a CB_LAYOUTRECALL can come in - * right now covering the LAYOUTGET we are about to send. - * However, that is not so catastrophic, and there seems - * to be no way to prevent it completely. - */ - if (nfs41_setup_sequence(session, &lgp->args.seq_args, - &lgp->res.seq_res, task)) - return; - ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid, - NFS_I(lgp->args.inode)->layout, - &lgp->args.range, - lgp->args.ctx->state); - if (ret < 0) - rpc_exit(task, ret); + nfs41_setup_sequence(session, &lgp->args.seq_args, + &lgp->res.seq_res, task); + dprintk("<-- %s\n", __func__); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; + + dprintk("--> %s\n", __func__); + nfs41_sequence_done(task, &lgp->res.seq_res); + dprintk("<-- %s\n", __func__); +} + +static int +nfs4_layoutget_handle_exception(struct rpc_task *task, + struct nfs4_layoutget *lgp, struct nfs4_exception *exception) +{ struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; - struct nfs4_state *state = NULL; - unsigned long timeo, now, giveup; + int status = task->tk_status; dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); - if (!nfs41_sequence_done(task, &lgp->res.seq_res)) - goto out; - - switch (task->tk_status) { + switch (status) { case 0: goto out; @@ -7890,57 +7885,43 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) * retry go inband. */ case -NFS4ERR_LAYOUTUNAVAILABLE: - task->tk_status = -ENODATA; + status = -ENODATA; goto out; /* * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). */ case -NFS4ERR_BADLAYOUT: - goto out_overflow; + status = -EOVERFLOW; + goto out; /* * NFS4ERR_LAYOUTTRYLATER is a conflict with another client * (or clients) writing to the same RAID stripe except when * the minlength argument is 0 (see RFC5661 section 18.43.3). + * + * Treat it like we would RECALLCONFLICT -- we retry for a little + * while, and then eventually give up. */ case -NFS4ERR_LAYOUTTRYLATER: - if (lgp->args.minlength == 0) - goto out_overflow; - /* - * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall - * existing layout before getting a new one). 
- */ - case -NFS4ERR_RECALLCONFLICT: - timeo = rpc_get_timeout(task->tk_client); - giveup = lgp->args.timestamp + timeo; - now = jiffies; - if (time_after(giveup, now)) { - unsigned long delay; - - /* Delay for: - * - Not less then NFS4_POLL_RETRY_MIN. - * - One last time a jiffie before we give up - * - exponential backoff (time_now minus start_attempt) - */ - delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, - min((giveup - now - 1), - now - lgp->args.timestamp)); - - dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", - __func__, delay); - rpc_delay(task, delay); - /* Do not call nfs4_async_handle_error() */ - goto out_restart; + if (lgp->args.minlength == 0) { + status = -EOVERFLOW; + goto out; } - break; + /* Fallthrough */ + case -NFS4ERR_RECALLCONFLICT: + nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT, + exception); + status = -ERECALLCONFLICT; + goto out; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + exception->timeout = 0; spin_lock(&inode->i_lock); if (nfs4_stateid_match(&lgp->args.stateid, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); /* If the open stateid was bad, then recover it. */ - state = lgp->args.ctx->state; + exception->state = lgp->args.ctx->state; break; } lo = NFS_I(inode)->layout; @@ -7958,20 +7939,16 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) pnfs_free_lseg_list(&head); } else spin_unlock(&inode->i_lock); - goto out_restart; + status = -EAGAIN; + goto out; } - if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN) - goto out_restart; + + status = nfs4_handle_exception(server, status, exception); + if (exception->retry) + status = -EAGAIN; out: dprintk("<-- %s\n", __func__); - return; -out_restart: - task->tk_status = 0; - rpc_restart_call_prepare(task); - return; -out_overflow: - task->tk_status = -EOVERFLOW; - goto out; + return status; } static size_t max_response_pages(struct nfs_server *server) @@ -8040,7 +8017,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { }; struct pnfs_layout_segment * -nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); @@ -8060,6 +8037,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) .flags = RPC_TASK_ASYNC, }; struct pnfs_layout_segment *lseg = NULL; + struct nfs4_exception exception = { .timeout = *timeout }; int status = 0; dprintk("--> %s\n", __func__); @@ -8073,7 +8051,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) return ERR_PTR(-ENOMEM); } lgp->args.layout.pglen = max_pages * PAGE_SIZE; - lgp->args.timestamp = jiffies; lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; @@ -8083,13 +8060,17 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) if (IS_ERR(task)) return ERR_CAST(task); status = nfs4_wait_for_completion_rpc_task(task); - if (status == 0) - status = task->tk_status; + if (status == 0) { + status = nfs4_layoutget_handle_exception(task, lgp, &exception); + *timeout = exception.timeout; + } + trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, &lgp->res.stateid, status); + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 2c8d05dae5b1..9c150b153782 100644 --- a/fs/nfs/nfs4trace.h +++ 
b/fs/nfs/nfs4trace.h @@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \ { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \ { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ + { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \ + { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \ { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) TRACE_EVENT(pnfs_update_layout, @@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout, u64 count, enum pnfs_iomode iomode, struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, enum pnfs_update_layout_reason reason ), - TP_ARGS(inode, pos, count, iomode, lo, reason), + TP_ARGS(inode, pos, count, iomode, lo, lseg, reason), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, fileid) @@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout, __field(enum pnfs_iomode, iomode) __field(int, layoutstateid_seq) __field(u32, layoutstateid_hash) + __field(long, lseg) __field(enum pnfs_update_layout_reason, reason) ), TP_fast_assign( @@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout, __entry->layoutstateid_seq = 0; __entry->layoutstateid_hash = 0; } + __entry->lseg = (long)lseg; ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " "iomode=%s pos=%llu count=%llu " - "layoutstateid=%d:0x%08x (%s)", + "layoutstateid=%d:0x%08x lseg=0x%lx (%s)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout, (unsigned long long)__entry->pos, (unsigned long long)__entry->count, __entry->layoutstateid_seq, __entry->layoutstateid_hash, + __entry->lseg, show_pnfs_update_layout_reason(__entry->reason) ) ); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2a609af845fe..46339a7fb191 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -796,45 +796,18 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } -int -pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - const struct pnfs_layout_range *range, - struct nfs4_state *open_state) -{ - int status = 0; - - dprintk("--> %s\n", __func__); - spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo)) { - status = -EAGAIN; - } else if (!nfs4_valid_open_stateid(open_state)) { - status = -EBADF; - } else if (list_empty(&lo->plh_segs) || - test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { - int seq; - - do { - seq = read_seqbegin(&open_state->seqlock); - nfs4_stateid_copy(dst, &open_state->stateid); - } while (read_seqretry(&open_state->seqlock, seq)); - } else - nfs4_stateid_copy(dst, &lo->plh_stateid); - spin_unlock(&lo->plh_inode->i_lock); - dprintk("<-- %s\n", __func__); - return status; -} - /* -* Get layout from server. -* for now, assume that whole file layouts are requested. -* arg->offset: 0 -* arg->length: all ones -*/ + * Get layout from server. + * for now, assume that whole file layouts are requested. 
+ * arg->offset: 0 + * arg->length: all ones + */ static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, + nfs4_stateid *stateid, const struct pnfs_layout_range *range, - gfp_t gfp_flags) + long *timeout, gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; struct nfs_server *server = NFS_SERVER(ino); @@ -868,10 +841,11 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.type = server->pnfs_curr_ld->id; lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); + nfs4_stateid_copy(&lgp->args.stateid, stateid); lgp->gfp_flags = gfp_flags; lgp->cred = lo->plh_lc_cred; - return nfs4_proc_layoutget(lgp, gfp_flags); + return nfs4_proc_layoutget(lgp, timeout, gfp_flags); } static void pnfs_clear_layoutcommit(struct inode *inode, @@ -1511,27 +1485,30 @@ pnfs_update_layout(struct inode *ino, .offset = pos, .length = count, }; - unsigned pg_offset; + unsigned pg_offset, seq; struct nfs_server *server = NFS_SERVER(ino); struct nfs_client *clp = server->nfs_client; - struct pnfs_layout_hdr *lo; + struct pnfs_layout_hdr *lo = NULL; struct pnfs_layout_segment *lseg = NULL; + nfs4_stateid stateid; + long timeout = 0; + unsigned long giveup = jiffies + rpc_get_timeout(server->client); bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NO_PNFS); goto out; } if (iomode == IOMODE_READ && i_size_read(ino) == 0) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RD_ZEROLEN); goto out; } if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_MDSTHRESH); goto out; } @@ -1542,14 +1519,14 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NOMEM); goto out; } /* Do we even need to bother with this? */ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_BULK_RECALL); dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; @@ -1557,14 +1534,34 @@ lookup_again: /* if LAYOUTGET already failed once we don't try again */ if (pnfs_layout_io_test_failed(lo, iomode)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); goto out_unlock; } - first = list_empty(&lo->plh_segs); - if (first) { - /* The first layoutget for the file. Need to serialize per + lseg = pnfs_find_lseg(lo, &arg); + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_FOUND_CACHED); + goto out_unlock; + } + + if (!nfs4_valid_open_stateid(ctx->state)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_INVALID_OPEN); + goto out_unlock; + } + + /* + * Choose a stateid for the LAYOUTGET. If we don't have a layout + * stateid, or it has been invalidated, then we must use the open + * stateid. 
+ */ + if (lo->plh_stateid.seqid == 0 || + test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { + + /* + * The first layoutget for the file. Need to serialize per * RFC 5661 Errata 3208. */ if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, @@ -1573,18 +1570,17 @@ lookup_again: wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, TASK_UNINTERRUPTIBLE); pnfs_put_layout_hdr(lo); + dprintk("%s retrying\n", __func__); goto lookup_again; } + + first = true; + do { + seq = read_seqbegin(&ctx->state->seqlock); + nfs4_stateid_copy(&stateid, &ctx->state->stateid); + } while (read_seqretry(&ctx->state->seqlock, seq)); } else { - /* Check to see if the layout for the given range - * already exists - */ - lseg = pnfs_find_lseg(lo, &arg); - if (lseg) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, - PNFS_UPDATE_LAYOUT_FOUND_CACHED); - goto out_unlock; - } + nfs4_stateid_copy(&stateid, &lo->plh_stateid); } /* @@ -1599,15 +1595,17 @@ lookup_again: pnfs_clear_first_layoutget(lo); pnfs_put_layout_hdr(lo); dprintk("%s retrying\n", __func__); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + lseg, PNFS_UPDATE_LAYOUT_RETRY); goto lookup_again; } - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETURN); goto out_put_layout_hdr; } if (pnfs_layoutgets_blocked(lo)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_BLOCKED); goto out_unlock; } @@ -1632,26 +1630,36 @@ lookup_again: if (arg.length != NFS4_MAX_UINT64) arg.length = PAGE_ALIGN(arg.length); - lseg = send_layoutget(lo, ctx, &arg, gfp_flags); + lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); if (IS_ERR(lseg)) { - if (lseg == ERR_PTR(-EAGAIN)) { + switch(PTR_ERR(lseg)) { + case -ERECALLCONFLICT: + if (time_after(jiffies, giveup)) + lseg = NULL; + /* Fallthrough */ + case -EAGAIN: + pnfs_put_layout_hdr(lo); if (first) pnfs_clear_first_layoutget(lo); - pnfs_put_layout_hdr(lo); - goto lookup_again; - } - - if (!nfs_error_is_fatal(PTR_ERR(lseg))) { - pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); - lseg = NULL; + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); + goto lookup_again; + } + /* Fallthrough */ + default: + if (!nfs_error_is_fatal(PTR_ERR(lseg))) { + pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + lseg = NULL; + } } } else { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); } atomic_dec(&lo->plh_outstanding); - trace_pnfs_update_layout(ino, pos, count, iomode, lo, - PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 971068b58647..f9f3331bef49 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -228,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, struct rpc_cred *cred); -extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); +extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags); extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); /* pnfs.c */ @@ -260,10 +260,6 @@ void 
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier); -int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, - struct pnfs_layout_hdr *lo, - const struct pnfs_layout_range *range, - struct nfs4_state *open_state); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, const struct pnfs_layout_range *recall_range, diff --git a/include/linux/errno.h b/include/linux/errno.h index 89627b9187f9..7ce9fb1b7d28 100644 --- a/include/linux/errno.h +++ b/include/linux/errno.h @@ -28,5 +28,6 @@ #define EBADTYPE 527 /* Type not supported by server */ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ +#define ERECALLCONFLICT 530 /* conflict with recalled state */ #endif diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index e1692c96cbc8..bfed6b367350 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -637,7 +637,9 @@ enum pnfs_update_layout_reason { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, PNFS_UPDATE_LAYOUT_FOUND_CACHED, PNFS_UPDATE_LAYOUT_RETURN, + PNFS_UPDATE_LAYOUT_RETRY, PNFS_UPDATE_LAYOUT_BLOCKED, + PNFS_UPDATE_LAYOUT_INVALID_OPEN, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index e70ed54dad94..ccb2928a0e64 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -233,7 +233,6 @@ struct nfs4_layoutget_args { struct inode *inode; struct nfs_open_context *ctx; nfs4_stateid stateid; - unsigned long timestamp; struct nfs4_layoutdriver_data layout; }; @@ -251,7 +250,6 @@ struct nfs4_layoutget { struct nfs4_layoutget_res res; struct rpc_cred *cred; gfp_t gfp_flags; - long timeout; }; struct nfs4_getdeviceinfo_args { -- cgit v1.2.3 From b52207ef4ea56f8c22288ec3387399aac72c26cf Mon Sep 17 00:00:00 2001 From: Chen Feng Date: Sun, 14 Feb 2016 14:29:21 +0800 Subject: mfd: hi655x: Add MFD driver for hi655x Add PMIC MFD driver to support hisilicon hi665x. Signed-off-by: Chen Feng Signed-off-by: Fei Wang Signed-off-by: Xinwei Kong Reviewed-by: Haojian Zhuang Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 10 +++ drivers/mfd/Makefile | 1 + drivers/mfd/hi655x-pmic.c | 162 ++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/hi655x-pmic.h | 55 ++++++++++++++ 4 files changed, 228 insertions(+) create mode 100644 drivers/mfd/hi655x-pmic.c create mode 100644 include/linux/mfd/hi655x-pmic.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index a49565cc1a6b..1bcf601de5bc 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -319,6 +319,16 @@ config MFD_HI6421_PMIC menus in order to enable them. We communicate with the Hi6421 via memory-mapped I/O. +config MFD_HI655X_PMIC + tristate "HiSilicon Hi655X series PMU/Codec IC" + depends on ARCH_HISI || COMPILE_TEST + depends on OF + select MFD_CORE + select REGMAP_MMIO + select REGMAP_IRQ + help + Select this option to enable Hisilicon hi655x series pmic driver. 
+ config HTC_EGPIO bool "HTC EGPIO support" depends on GPIOLIB && ARM diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 921a08dad9b3..42a66e19e191 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -196,6 +196,7 @@ obj-$(CONFIG_MFD_STW481X) += stw481x.o obj-$(CONFIG_MFD_IPAQ_MICRO) += ipaq-micro.o obj-$(CONFIG_MFD_MENF21BMC) += menf21bmc.o obj-$(CONFIG_MFD_HI6421_PMIC) += hi6421-pmic-core.o +obj-$(CONFIG_MFD_HI655X_PMIC) += hi655x-pmic.o obj-$(CONFIG_MFD_DLN2) += dln2.o obj-$(CONFIG_MFD_RT5033) += rt5033.o obj-$(CONFIG_MFD_SKY81452) += sky81452.o diff --git a/drivers/mfd/hi655x-pmic.c b/drivers/mfd/hi655x-pmic.c new file mode 100644 index 000000000000..05ddc7882362 --- /dev/null +++ b/drivers/mfd/hi655x-pmic.c @@ -0,0 +1,162 @@ +/* + * Device driver for MFD hi655x PMIC + * + * Copyright (c) 2016 Hisilicon. + * + * Authors: + * Chen Feng + * Fei Wang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell hi655x_pmic_devs[] = { + { .name = "hi655x-regulator", }, +}; + +static const struct regmap_irq hi655x_irqs[] = { + { .reg_offset = 0, .mask = OTMP_D1R_INT }, + { .reg_offset = 0, .mask = VSYS_2P5_R_INT }, + { .reg_offset = 0, .mask = VSYS_UV_D3R_INT }, + { .reg_offset = 0, .mask = VSYS_6P0_D200UR_INT }, + { .reg_offset = 0, .mask = PWRON_D4SR_INT }, + { .reg_offset = 0, .mask = PWRON_D20F_INT }, + { .reg_offset = 0, .mask = PWRON_D20R_INT }, + { .reg_offset = 0, .mask = RESERVE_INT }, +}; + +static const struct regmap_irq_chip hi655x_irq_chip = { + .name = "hi655x-pmic", + .irqs = hi655x_irqs, + .num_regs = 1, + .num_irqs = ARRAY_SIZE(hi655x_irqs), + .status_base = HI655X_IRQ_STAT_BASE, + .mask_base = HI655X_IRQ_MASK_BASE, +}; + +static struct regmap_config hi655x_regmap_config = { + .reg_bits = 32, + .reg_stride = HI655X_STRIDE, + .val_bits = 8, + .max_register = HI655X_BUS_ADDR(0xFFF), +}; + +static void hi655x_local_irq_clear(struct regmap *map) +{ + int i; + + regmap_write(map, HI655X_ANA_IRQM_BASE, HI655X_IRQ_CLR); + for (i = 0; i < HI655X_IRQ_ARRAY; i++) { + regmap_write(map, HI655X_IRQ_STAT_BASE + i * HI655X_STRIDE, + HI655X_IRQ_CLR); + } +} + +static int hi655x_pmic_probe(struct platform_device *pdev) +{ + int ret; + struct hi655x_pmic *pmic; + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + void __iomem *base; + + pmic = devm_kzalloc(dev, sizeof(*pmic), GFP_KERNEL); + if (!pmic) + return -ENOMEM; + pmic->dev = dev; + + pmic->res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!pmic->res) + return -ENOENT; + + base = devm_ioremap_resource(dev, pmic->res); + if (!base) + return -ENOMEM; + + pmic->regmap = devm_regmap_init_mmio_clk(dev, NULL, base, + &hi655x_regmap_config); + + regmap_read(pmic->regmap, HI655X_BUS_ADDR(HI655X_VER_REG), &pmic->ver); + if ((pmic->ver < PMU_VER_START) || (pmic->ver > PMU_VER_END)) { + dev_warn(dev, "PMU version %d unsupported\n", pmic->ver); + return -EINVAL; + } + + hi655x_local_irq_clear(pmic->regmap); + + pmic->gpio = of_get_named_gpio(np, "pmic-gpios", 0); + if (!gpio_is_valid(pmic->gpio)) { + dev_err(dev, "Failed to get the pmic-gpios\n"); + return -ENODEV; + } + + ret = devm_gpio_request_one(dev, pmic->gpio, GPIOF_IN, + "hi655x_pmic_irq"); + if (ret < 0) { + dev_err(dev, "Failed to request gpio %d ret = 
%d\n", + pmic->gpio, ret); + return ret; + } + + ret = regmap_add_irq_chip(pmic->regmap, gpio_to_irq(pmic->gpio), + IRQF_TRIGGER_LOW | IRQF_NO_SUSPEND, 0, + &hi655x_irq_chip, &pmic->irq_data); + if (ret) { + dev_err(dev, "Failed to obtain 'hi655x_pmic_irq' %d\n", ret); + return ret; + } + + platform_set_drvdata(pdev, pmic); + + ret = mfd_add_devices(dev, PLATFORM_DEVID_AUTO, hi655x_pmic_devs, + ARRAY_SIZE(hi655x_pmic_devs), NULL, 0, NULL); + if (ret) { + dev_err(dev, "Failed to register device %d\n", ret); + regmap_del_irq_chip(gpio_to_irq(pmic->gpio), pmic->irq_data); + return ret; + } + + return 0; +} + +static int hi655x_pmic_remove(struct platform_device *pdev) +{ + struct hi655x_pmic *pmic = platform_get_drvdata(pdev); + + regmap_del_irq_chip(gpio_to_irq(pmic->gpio), pmic->irq_data); + mfd_remove_devices(&pdev->dev); + return 0; +} + +static const struct of_device_id hi655x_pmic_match[] = { + { .compatible = "hisilicon,hi655x-pmic", }, + {}, +}; + +static struct platform_driver hi655x_pmic_driver = { + .driver = { + .name = "hi655x-pmic", + .of_match_table = of_match_ptr(hi655x_pmic_match), + }, + .probe = hi655x_pmic_probe, + .remove = hi655x_pmic_remove, +}; +module_platform_driver(hi655x_pmic_driver); + +MODULE_AUTHOR("Chen Feng "); +MODULE_DESCRIPTION("Hisilicon hi655x PMIC driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mfd/hi655x-pmic.h b/include/linux/mfd/hi655x-pmic.h new file mode 100644 index 000000000000..dbbe9a644622 --- /dev/null +++ b/include/linux/mfd/hi655x-pmic.h @@ -0,0 +1,55 @@ +/* + * Device driver for regulators in hi655x IC + * + * Copyright (c) 2016 Hisilicon. + * + * Authors: + * Chen Feng + * Fei Wang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __HI655X_PMIC_H +#define __HI655X_PMIC_H + +/* Hi655x registers are mapped to memory bus in 4 bytes stride */ +#define HI655X_STRIDE 4 +#define HI655X_BUS_ADDR(x) ((x) << 2) + +#define HI655X_BITS 8 + +#define HI655X_NR_IRQ 32 + +#define HI655X_IRQ_STAT_BASE (0x003 << 2) +#define HI655X_IRQ_MASK_BASE (0x007 << 2) +#define HI655X_ANA_IRQM_BASE (0x1b5 << 2) +#define HI655X_IRQ_ARRAY 4 +#define HI655X_IRQ_MASK 0xFF +#define HI655X_IRQ_CLR 0xFF +#define HI655X_VER_REG 0x00 + +#define PMU_VER_START 0x10 +#define PMU_VER_END 0x38 + +#define RESERVE_INT BIT(7) +#define PWRON_D20R_INT BIT(6) +#define PWRON_D20F_INT BIT(5) +#define PWRON_D4SR_INT BIT(4) +#define VSYS_6P0_D200UR_INT BIT(3) +#define VSYS_UV_D3R_INT BIT(2) +#define VSYS_2P5_R_INT BIT(1) +#define OTMP_D1R_INT BIT(0) + +struct hi655x_pmic { + struct resource *res; + struct device *dev; + struct regmap *regmap; + int gpio; + unsigned int ver; + struct regmap_irq_chip_data *irq_data; +}; + +#endif -- cgit v1.2.3 From 94c6825e0ff75829207af6246782811b7c7af2c0 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Sun, 17 Apr 2016 17:08:40 +0300 Subject: net/mlx5_core: Use tasklet for user-space CQ completion events Previously, we've fired all our completion callbacks straight from our ISR. Some of those callbacks were lightweight (for example, mlx5 Ethernet napi callbacks), but some of them did more work (for example, the user-space RDMA stack uverbs' completion handler). Besides that, doing more than the minimal work in ISR is generally considered wrong, it could even lead to a hard lockup of the system. 
Since when a lot of completion events are generated by the hardware, the loop over those events could be so long, that we'll get into a hard lockup by the system watchdog. In order to avoid that, add a new way of invoking completion events callbacks. In the interrupt itself, we add the CQs which receive completion event to a per-EQ list and schedule a tasklet. In the tasklet context we loop over all the CQs in the list and invoke the user callback. Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 59 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/eq.c | 12 ++++- drivers/net/ethernet/mellanox/mlx5/core/main.c | 17 +++++++ .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 + include/linux/mlx5/cq.h | 5 ++ include/linux/mlx5/driver.h | 10 ++++ 6 files changed, 104 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index b51e42d6fbec..873a631ad155 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -39,6 +39,53 @@ #include #include "mlx5_core.h" +#define TASKLET_MAX_TIME 2 +#define TASKLET_MAX_TIME_JIFFIES msecs_to_jiffies(TASKLET_MAX_TIME) + +void mlx5_cq_tasklet_cb(unsigned long data) +{ + unsigned long flags; + unsigned long end = jiffies + TASKLET_MAX_TIME_JIFFIES; + struct mlx5_eq_tasklet *ctx = (struct mlx5_eq_tasklet *)data; + struct mlx5_core_cq *mcq; + struct mlx5_core_cq *temp; + + spin_lock_irqsave(&ctx->lock, flags); + list_splice_tail_init(&ctx->list, &ctx->process_list); + spin_unlock_irqrestore(&ctx->lock, flags); + + list_for_each_entry_safe(mcq, temp, &ctx->process_list, + tasklet_ctx.list) { + list_del_init(&mcq->tasklet_ctx.list); + mcq->tasklet_ctx.comp(mcq); + if (atomic_dec_and_test(&mcq->refcount)) + complete(&mcq->free); + if (time_after(jiffies, end)) + break; + } + + if (!list_empty(&ctx->process_list)) + tasklet_schedule(&ctx->task); +} + +static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq) +{ + unsigned long flags; + struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv; + + spin_lock_irqsave(&tasklet_ctx->lock, flags); + /* When migrating CQs between EQs will be implemented, please note + * that you need to sync this point. It is possible that + * while migrating a CQ, completions on the old EQs could + * still arrive. 
+ */ + if (list_empty_careful(&cq->tasklet_ctx.list)) { + atomic_inc(&cq->refcount); + list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list); + } + spin_unlock_irqrestore(&tasklet_ctx->lock, flags); +} + void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn) { struct mlx5_core_cq *cq; @@ -96,6 +143,13 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, struct mlx5_create_cq_mbox_out out; struct mlx5_destroy_cq_mbox_in din; struct mlx5_destroy_cq_mbox_out dout; + int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), + c_eqn); + struct mlx5_eq *eq; + + eq = mlx5_eqn2eq(dev, eqn); + if (IS_ERR(eq)) + return PTR_ERR(eq); in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_CQ); memset(&out, 0, sizeof(out)); @@ -111,6 +165,11 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, cq->arm_sn = 0; atomic_set(&cq->refcount, 1); init_completion(&cq->free); + if (!cq->comp) + cq->comp = mlx5_add_cq_to_tasklet; + /* assuming CQ will be deleted before the EQ */ + cq->tasklet_ctx.priv = &eq->tasklet_ctx; + INIT_LIST_HEAD(&cq->tasklet_ctx.list); spin_lock_irq(&table->lock); err = radix_tree_insert(&table->tree, cq->cqn, cq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 18fccec72c5d..0e30602ef76d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -202,7 +202,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) struct mlx5_eqe *eqe; int eqes_found = 0; int set_ci = 0; - u32 cqn; + u32 cqn = -1; u32 rsn; u8 port; @@ -320,6 +320,9 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) eq_update_ci(eq, 1); + if (cqn != -1) + tasklet_schedule(&eq->tasklet_ctx.task); + return eqes_found; } @@ -403,6 +406,12 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, if (err) goto err_irq; + INIT_LIST_HEAD(&eq->tasklet_ctx.list); + INIT_LIST_HEAD(&eq->tasklet_ctx.process_list); + spin_lock_init(&eq->tasklet_ctx.lock); + tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb, + (unsigned long)&eq->tasklet_ctx); + /* EQs are created in ARMED state */ eq_update_ci(eq, 1); @@ -436,6 +445,7 @@ int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq) mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n", eq->eqn); synchronize_irq(eq->irqn); + tasklet_disable(&eq->tasklet_ctx.task); mlx5_buf_free(dev, &eq->buf); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 6892746fd10d..aa98d0234bd1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -660,6 +660,23 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, } EXPORT_SYMBOL(mlx5_vector2eqn); +struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + struct mlx5_eq *eq; + + spin_lock(&table->lock); + list_for_each_entry(eq, &table->comp_eqs_list, list) + if (eq->eqn == eqn) { + spin_unlock(&table->lock); + return eq; + } + + spin_unlock(&table->lock); + + return ERR_PTR(-ENOENT); +} + static void free_comp_eqs(struct mlx5_core_dev *dev) { struct mlx5_eq_table *table = &dev->priv.eq_table; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 0b0b226c789e..f0d87046af8e 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -100,6 +100,8 @@ int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id); int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev); cycle_t mlx5_read_internal_timer(struct mlx5_core_dev *dev); u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx); +struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn); +void mlx5_cq_tasklet_cb(unsigned long data); void mlx5e_init(void); void mlx5e_cleanup(void); diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index b2c9fada8eac..2be976dd4966 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -53,6 +53,11 @@ struct mlx5_core_cq { unsigned arm_sn; struct mlx5_rsc_debug *dbg; int pid; + struct { + struct list_head list; + void (*comp)(struct mlx5_core_cq *); + void *priv; + } tasklet_ctx; }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 369c837d40f5..5a41f9003941 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -304,6 +305,14 @@ struct mlx5_buf { u8 page_shift; }; +struct mlx5_eq_tasklet { + struct list_head list; + struct list_head process_list; + struct tasklet_struct task; + /* lock on completion tasklet list */ + spinlock_t lock; +}; + struct mlx5_eq { struct mlx5_core_dev *dev; __be32 __iomem *doorbell; @@ -317,6 +326,7 @@ struct mlx5_eq { struct list_head list; int index; struct mlx5_rsc_debug *dbg; + struct mlx5_eq_tasklet tasklet_ctx; }; struct mlx5_core_psv { -- cgit v1.2.3 From dd1a4cc1fbdf516bb38ca31b65c76e720d414d0d Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 4 May 2016 14:09:44 -0500 Subject: KVM: split kvm_vcpu_wake_up from kvm_vcpu_kick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AVIC has a use for kvm_vcpu_wake_up. Signed-off-by: Radim Krčmář Tested-by: Suravee Suthikulpanit Reviewed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bbcd921d7cb0..b1fa8f11c95b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -657,6 +657,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); +void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7e22998a1ceb..dd4ac9d9e8f5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2076,13 +2076,8 @@ out: EXPORT_SYMBOL_GPL(kvm_vcpu_block); #ifndef CONFIG_S390 -/* - * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. - */ -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { - int me; - int cpu = vcpu->cpu; struct swait_queue_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); @@ -2091,6 +2086,18 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) ++vcpu->stat.halt_wakeup; } +} +EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); + +/* + * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 
+ */ +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int me; + int cpu = vcpu->cpu; + + kvm_vcpu_wake_up(vcpu); me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) if (kvm_arch_vcpu_should_kick(vcpu)) -- cgit v1.2.3 From 45c04704e467fffe3525205454d9627325dae308 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 18 May 2016 16:19:01 +0300 Subject: ASoC: twl6040: Disconnect AUX output pads on digital mute Disconnect also the path to AUXL from the HF path during digital_mute to avoid pop noise leakage to Line-out pads. Signed-off-by: Peter Ujfalusi Signed-off-by: Mark Brown --- include/linux/mfd/twl6040.h | 1 + sound/soc/codecs/twl6040.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/twl6040.h b/include/linux/mfd/twl6040.h index 8f9fc3d26e6d..8e95cd87cd74 100644 --- a/include/linux/mfd/twl6040.h +++ b/include/linux/mfd/twl6040.h @@ -134,6 +134,7 @@ #define TWL6040_HFDACENA (1 << 0) #define TWL6040_HFPGAENA (1 << 1) #define TWL6040_HFDRVENA (1 << 4) +#define TWL6040_HFSWENA (1 << 6) /* VIBCTLL/R (0x18/0x1A) fields */ diff --git a/sound/soc/codecs/twl6040.c b/sound/soc/codecs/twl6040.c index bc3de2e844e6..d1e3a932cbf3 100644 --- a/sound/soc/codecs/twl6040.c +++ b/sound/soc/codecs/twl6040.c @@ -983,9 +983,9 @@ static void twl6040_mute_path(struct snd_soc_codec *codec, enum twl6040_dai_id i if (mute) { /* Power down drivers and DACs */ hflctl &= ~(TWL6040_HFDACENA | TWL6040_HFPGAENA | - TWL6040_HFDRVENA); + TWL6040_HFDRVENA | TWL6040_HFSWENA); hfrctl &= ~(TWL6040_HFDACENA | TWL6040_HFPGAENA | - TWL6040_HFDRVENA); + TWL6040_HFDRVENA | TWL6040_HFSWENA); } twl6040_reg_write(twl6040, TWL6040_REG_HFLCTL, hflctl); -- cgit v1.2.3 From 0a70bd43053331d99881211e1d09f32de531432f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 24 Feb 2016 14:02:11 -0800 Subject: dax: enable dax in the presence of known media errors (badblocks) 1/ If a mapping overlaps a bad sector fail the request. 2/ Do not opportunistically report more dax-capable capacity than is requested when errors present. 
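In the pmem driver both rules land in ->direct_access(), which now receives the requested size (a condensed rendering of the drivers/nvdimm/pmem.c hunk below):

static long pmem_direct_access(struct block_device *bdev, sector_t sector,
			void __pmem **kaddr, pfn_t *pfn, long size)
{
	struct pmem_device *pmem = bdev->bd_disk->private_data;
	resource_size_t offset = sector * 512 + pmem->data_offset;

	/* 1/ a request that overlaps a known bad sector fails outright */
	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
		return -EIO;
	*kaddr = pmem->virt_addr + offset;
	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

	/* 2/ with badblocks on record, vouch only for the requested range */
	if (unlikely(pmem->bb.count))
		return size;
	return pmem->size - pmem->pfn_pad - offset;
}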
Reviewed-by: Jeff Moyer Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams [vishal: fix a conflict with system RAM collision patches] [vishal: add a 'size' parameter to ->direct_access] [vishal: fix a conflict with DAX alignment check patches] Signed-off-by: Vishal Verma --- arch/powerpc/sysdev/axonram.c | 2 +- drivers/block/brd.c | 2 +- drivers/nvdimm/pmem.c | 10 +++++++++- drivers/s390/block/dcssblk.c | 4 ++-- fs/block_dev.c | 13 +------------ include/linux/blkdev.h | 2 +- 6 files changed, 15 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 0d112b94d91d..ff75d70f7285 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -143,7 +143,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) */ static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void __pmem **kaddr, pfn_t *pfn) + void __pmem **kaddr, pfn_t *pfn, long size) { struct axon_ram_bank *bank = device->bd_disk->private_data; loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 51a071e32221..c04bd9bc39fd 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -381,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, #ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, pfn_t *pfn) + void __pmem **kaddr, pfn_t *pfn, long size) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index f798899338ed..c447579bd853 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -182,14 +182,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, } static long pmem_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, pfn_t *pfn) + void __pmem **kaddr, pfn_t *pfn, long size) { struct pmem_device *pmem = bdev->bd_disk->private_data; resource_size_t offset = sector * 512 + pmem->data_offset; + if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) + return -EIO; *kaddr = pmem->virt_addr + offset; *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); + /* + * If badblocks are present, limit known good range to the + * requested range. 
+ */ + if (unlikely(pmem->bb.count)) + return size; return pmem->size - pmem->pfn_pad - offset; } diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 1bce9cf51b1e..6ac33984bc0f 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -31,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode); static blk_qc_t dcssblk_make_request(struct request_queue *q, struct bio *bio); static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, - void __pmem **kaddr, pfn_t *pfn); + void __pmem **kaddr, pfn_t *pfn, long size); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -883,7 +883,7 @@ fail: static long dcssblk_direct_access (struct block_device *bdev, sector_t secnum, - void __pmem **kaddr, pfn_t *pfn) + void __pmem **kaddr, pfn_t *pfn, long size) { struct dcssblk_dev_info *dev_info; unsigned long offset, dev_sz; diff --git a/fs/block_dev.c b/fs/block_dev.c index 8477d4501b1e..45839b27972c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include "internal.h" @@ -501,7 +500,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn); + avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size); if (!avail) return -ERANGE; if (avail > 0 && avail & ~PAGE_MASK) @@ -561,7 +560,6 @@ EXPORT_SYMBOL_GPL(bdev_dax_supported); */ bool bdev_dax_capable(struct block_device *bdev) { - struct gendisk *disk = bdev->bd_disk; struct blk_dax_ctl dax = { .size = PAGE_SIZE, }; @@ -577,15 +575,6 @@ bool bdev_dax_capable(struct block_device *bdev) if (bdev_direct_access(bdev, &dax) < 0) return false; - /* - * If the device has known bad blocks, force all I/O through the - * driver / page cache. - * - * TODO: support finer grained dax error handling - */ - if (disk->bb && disk->bb->count) - return false; - return true; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 27cbefe8c985..cf7c13c2c38d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1668,7 +1668,7 @@ struct block_device_operations { int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); long (*direct_access)(struct block_device *, sector_t, void __pmem **, - pfn_t *); + pfn_t *, long); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ -- cgit v1.2.3 From 3dc29161070ab14d065554c0ad58988ab77a7bfd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 15 Mar 2016 11:20:41 -0600 Subject: dax: use sb_issue_zeroout instead of calling dax_clear_sectors dax_clear_sectors() cannot handle poisoned blocks. These must be zeroed using the BIO interface instead. Convert ext2 and XFS to use only sb_issue_zeroout().
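Going through the block layer lets the driver clear or remap poisoned sectors instead of faulting on a direct store to bad media. The helper works in filesystem-block units; a minimal sketch of its use (assuming the sb_issue_zeroout() signature in include/linux/blkdev.h):

static int zero_fs_blocks(struct super_block *sb, sector_t block,
			  sector_t nr_blocks)
{
	/* sb_issue_zeroout() converts filesystem blocks to 512-byte
	 * sectors and submits zeroing bios through the driver; it can
	 * sleep, hence GFP_NOFS from filesystem/transaction context */
	return sb_issue_zeroout(sb, block, nr_blocks, GFP_NOFS);
}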
Reviewed-by: Jeff Moyer Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Matthew Wilcox [vishal: Also remove the dax_clear_sectors function entirely] Signed-off-by: Vishal Verma --- fs/dax.c | 32 -------------------------------- fs/ext2/inode.c | 8 ++++---- fs/xfs/xfs_bmap_util.c | 15 ++++----------- include/linux/dax.h | 1 - 4 files changed, 8 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index d602410d8e52..0abbbb62981e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -87,38 +87,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n) return page; } -/* - * dax_clear_sectors() is called from within transaction context from XFS, - * and hence this means the stack from this point must follow GFP_NOFS - * semantics for all operations. - */ -int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size) -{ - struct blk_dax_ctl dax = { - .sector = _sector, - .size = _size, - }; - - might_sleep(); - do { - long count, sz; - - count = dax_map_atomic(bdev, &dax); - if (count < 0) - return count; - sz = min_t(long, count, SZ_128K); - clear_pmem(dax.addr, sz); - dax.size -= sz; - dax.sector += sz / 512; - dax_unmap_atomic(bdev, &dax); - cond_resched(); - } while (dax.size); - - wmb_pmem(); - return 0; -} -EXPORT_SYMBOL_GPL(dax_clear_sectors); - static bool buffer_written(struct buffer_head *bh) { return buffer_mapped(bh) && !buffer_unwritten(bh); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 9a14af3b1a69..17cbd6b696f2 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -737,10 +738,9 @@ static int ext2_get_blocks(struct inode *inode, * so that it's not found by another thread before it's * initialised */ - err = dax_clear_sectors(inode->i_sb->s_bdev, - le32_to_cpu(chain[depth-1].key) << - (inode->i_blkbits - 9), - count << inode->i_blkbits); + err = sb_issue_zeroout(inode->i_sb, + le32_to_cpu(chain[depth-1].key), count, + GFP_NOFS); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 3b6309865c65..930ac6a17ce3 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -72,18 +72,11 @@ xfs_zero_extent( struct xfs_mount *mp = ip->i_mount; xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); sector_t block = XFS_BB_TO_FSBT(mp, sector); - ssize_t size = XFS_FSB_TO_B(mp, count_fsb); - - if (IS_DAX(VFS_I(ip))) - return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)), - sector, size); - - /* - * let the block layer decide on the fastest method of - * implementing the zeroing. 
- */ - return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS); + return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), + block << (mp->m_super->s_blocksize_bits - 9), + count_fsb << (mp->m_super->s_blocksize_bits - 9), + GFP_NOFS, true); } /* diff --git a/include/linux/dax.h b/include/linux/dax.h index 7c45ac7ea1d1..7f853ffaa987 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -7,7 +7,6 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); -int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); -- cgit v1.2.3 From 679c8bd3b29428e736eabb7fc66a978f312f0c86 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 May 2016 10:47:04 +0200 Subject: dax: export a low-level __dax_zero_page_range helper This allows XFS to perform zeroing using the iomap infrastructure and avoid buffer heads. Reviewed-by: Jan Kara Signed-off-by: Christoph Hellwig [vishal: fix conflicts with dax-error-handling] Signed-off-by: Vishal Verma --- fs/dax.c | 35 ++++++++++++++++++++--------------- include/linux/dax.h | 7 +++++++ 2 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 0abbbb62981e..651d4b18ac29 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -947,6 +947,23 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); +int __dax_zero_page_range(struct block_device *bdev, sector_t sector, + unsigned int offset, unsigned int length) +{ + struct blk_dax_ctl dax = { + .sector = sector, + .size = PAGE_SIZE, + }; + + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); + clear_pmem(dax.addr + offset, length); + wmb_pmem(); + dax_unmap_atomic(bdev, &dax); + return 0; +} +EXPORT_SYMBOL_GPL(__dax_zero_page_range); + /** * dax_zero_page_range - zero a range within a page of a DAX file * @inode: The file being truncated @@ -982,23 +999,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; err = get_block(inode, index, &bh, 0); - if (err < 0) + if (err < 0 || !buffer_written(&bh)) return err; - if (buffer_written(&bh)) { - struct block_device *bdev = bh.b_bdev; - struct blk_dax_ctl dax = { - .sector = to_sector(&bh, inode), - .size = PAGE_SIZE, - }; - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - clear_pmem(dax.addr + offset, length); - wmb_pmem(); - dax_unmap_atomic(bdev, &dax); - } - - return 0; + return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), + offset, length); } EXPORT_SYMBOL_GPL(dax_zero_page_range); diff --git a/include/linux/dax.h b/include/linux/dax.h index 7f853ffaa987..90fbc99e5313 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,12 +14,19 @@ int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); +int __dax_zero_page_range(struct block_device *bdev, sector_t sector, + unsigned int offset, unsigned int length); #else static inline struct page *read_dax_sector(struct block_device *bdev, sector_t n) { return ERR_PTR(-ENXIO); } +static inline int __dax_zero_page_range(struct block_device *bdev, + sector_t sector, unsigned 
int offset, unsigned int length) +{ + return -ENXIO; +} #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v1.2.3 From 348e967ab07c96a9e7a6a194812254a8df2045c0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:15 +0200 Subject: dax: Make huge page handling depend of CONFIG_BROKEN Currently the handling of huge pages for DAX is racy. For example the following can happen: CPU0 (THP write fault) CPU1 (normal read fault) __dax_pmd_fault() __dax_fault() get_block(inode, block, &bh, 0) -> not mapped get_block(inode, block, &bh, 0) -> not mapped if (!buffer_mapped(&bh) && write) get_block(inode, block, &bh, 1) -> allocates blocks truncate_pagecache_range(inode, lstart, lend); dax_load_hole(); This results in data corruption since process on CPU1 won't see changes into the file done by CPU0. The race can happen even if two normal faults race however with THP the situation is even worse because the two faults don't operate on the same entries in the radix tree and we want to use these entries for serialization. So make THP support in DAX code depend on CONFIG_BROKEN for now. Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- fs/Kconfig | 1 + fs/dax.c | 2 +- include/linux/dax.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/Kconfig b/fs/Kconfig index 6725f59c18e6..b8fcb416be72 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -52,6 +52,7 @@ config FS_DAX_PMD depends on FS_DAX depends on ZONE_DEVICE depends on TRANSPARENT_HUGEPAGE + depends on BROKEN endif # BLOCK diff --git a/fs/dax.c b/fs/dax.c index bdad05213e4b..0433a2b5e484 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -675,7 +675,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } EXPORT_SYMBOL_GPL(dax_fault); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * The 'colour' (ie low bits) within a PMD of a page offset. This comes up * more often than one might expect in the below function. diff --git a/include/linux/dax.h b/include/linux/dax.h index 90fbc99e5313..72dc81de3ddb 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -29,7 +29,7 @@ static inline int __dax_zero_page_range(struct block_device *bdev, } #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, unsigned int flags, get_block_t); int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, -- cgit v1.2.3 From e804315dd0f574b56155c5a2406ab5e0318104f7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:16 +0200 Subject: dax: Define DAX lock bit for radix tree exceptional entry We will use lowest available bit in the radix tree exceptional entry for locking of the entry. Define it. Also clean up definitions of DAX entry type bits in DAX exceptional entries to use defined constants instead of hardcoding numbers and cleanup checking of these bits to not rely on how other bits in the entry are set. 
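A quick sketch of what the new bit layout guarantees (only meaningful inside fs/dax.c, where the RADIX_DAX_* macros below are defined; example_entry_bits() is an invented name):

/*
 * The lock bit lives below RADIX_DAX_SHIFT and outside
 * RADIX_DAX_TYPE_MASK, so locking an entry can never disturb its
 * type or sector decoding.
 */
static void example_entry_bits(sector_t sector)
{
        void *entry = RADIX_DAX_ENTRY(sector, false);   /* unlocked PTE entry */

        /* Take the entry lock by setting only the lock bit... */
        entry = (void *)((unsigned long)entry | RADIX_DAX_ENTRY_LOCK);

        /* ...type and sector are decoded from independent bits. */
        WARN_ON(RADIX_DAX_TYPE(entry) != RADIX_DAX_PTE);
        WARN_ON(RADIX_DAX_SECTOR(entry) != sector);
}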
Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- fs/dax.c | 17 +++++++++++------ include/linux/dax.h | 3 +++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 0433a2b5e484..351afd3cf8be 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -32,14 +32,19 @@ #include #include -#define RADIX_DAX_MASK 0xf -#define RADIX_DAX_SHIFT 4 -#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +/* + * We use lowest available bit in exceptional entry for locking, other two + * bits to determine entry type. In total 3 special bits. + */ +#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3) +#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK) #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \ + RADIX_TREE_EXCEPTIONAL_ENTRY)) static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { diff --git a/include/linux/dax.h b/include/linux/dax.h index 72dc81de3ddb..70600b63083f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -5,6 +5,9 @@ #include #include +/* We use lowest available exceptional entry bit for locking */ +#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) + ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); -- cgit v1.2.3 From 4f622938a5e2b7f1374ffb1e5fc212744898f513 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:17 +0200 Subject: dax: Allow DAX code to replace exceptional entries Currently we forbid page_cache_tree_insert() to replace exceptional radix tree entries for DAX inodes. However to make DAX faults race free we will lock radix tree entries and when hole is created, we need to replace such locked radix tree entry with a hole page. So modify page_cache_tree_insert() to allow that. 
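Condensed, the new branch in page_cache_tree_insert() looks like this (a sketch of the hunk below, not additional code; the only exceptional entry a hole page may replace in a DAX mapping is the empty locked entry):

        if (!radix_tree_exceptional_entry(p))
                return -EEXIST;
        mapping->nrexceptional--;
        if (!dax_mapping(mapping)) {
                /* shadow entry: report it and drop the node shadow count */
        } else {
                /* DAX: may only replace an empty locked entry with a hole */
                WARN_ON_ONCE(p != (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
                                           RADIX_DAX_ENTRY_LOCK));
        }
        radix_tree_replace_slot(slot, page);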
Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- include/linux/dax.h | 1 + mm/filemap.c | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 70600b63083f..aa148937bb3f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -3,6 +3,7 @@ #include #include +#include #include /* We use lowest available exceptional entry bit for locking */ diff --git a/mm/filemap.c b/mm/filemap.c index f2479af09da9..dfe55c2cfb34 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -597,14 +597,21 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!radix_tree_exceptional_entry(p)) return -EEXIST; - if (WARN_ON(dax_mapping(mapping))) - return -EINVAL; - - if (shadowp) - *shadowp = p; mapping->nrexceptional--; - if (node) - workingset_node_shadows_dec(node); + if (!dax_mapping(mapping)) { + if (shadowp) + *shadowp = p; + if (node) + workingset_node_shadows_dec(node); + } else { + /* DAX can replace empty locked entry with a hole */ + WARN_ON_ONCE(p != + (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK)); + /* DAX accounts exceptional entries as normal pages */ + if (node) + workingset_node_pages_dec(node); + } } radix_tree_replace_slot(slot, page); mapping->nrpages++; -- cgit v1.2.3 From ac401cc782429cc8560ce4840b1405d603740917 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:18 +0200 Subject: dax: New fault locking Currently DAX page fault locking is racy. CPU0 (write fault) CPU1 (read fault) __dax_fault() __dax_fault() get_block(inode, block, &bh, 0) -> not mapped get_block(inode, block, &bh, 0) -> not mapped if (!buffer_mapped(&bh)) if (vmf->flags & FAULT_FLAG_WRITE) get_block(inode, block, &bh, 1) -> allocates blocks if (page) -> no if (!buffer_mapped(&bh)) if (vmf->flags & FAULT_FLAG_WRITE) { } else { dax_load_hole(); } dax_insert_mapping() And we are in a situation where we fail in dax_radix_entry() with -EIO. Another problem with the current DAX page fault locking is that there is no race-free way to clear dirty tag in the radix tree. We can always end up with clean radix tree and dirty data in CPU cache. We fix the first problem by introducing locking of exceptional radix tree entries in DAX mappings acting very similarly to page lock and thus synchronizing properly faults against the same mapping index. The same lock can later be used to avoid races when clearing radix tree dirty tag. Reviewed-by: NeilBrown Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- fs/dax.c | 553 ++++++++++++++++++++++++++++++++++++++-------------- include/linux/dax.h | 3 + mm/filemap.c | 9 +- mm/truncate.c | 62 +++--- 4 files changed, 447 insertions(+), 180 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 351afd3cf8be..f43c3d806fb6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -46,6 +46,30 @@ RADIX_DAX_SHIFT | (pmd ? 
RADIX_DAX_PMD : RADIX_DAX_PTE) | \ RADIX_TREE_EXCEPTIONAL_ENTRY)) +/* We choose 4096 entries - same as per-zone page wait tables */ +#define DAX_WAIT_TABLE_BITS 12 +#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) + +wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; + +static int __init init_dax_wait_table(void) +{ + int i; + + for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) + init_waitqueue_head(wait_table + i); + return 0; +} +fs_initcall(init_dax_wait_table); + +static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, + pgoff_t index) +{ + unsigned long hash = hash_long((unsigned long)mapping ^ index, + DAX_WAIT_TABLE_BITS); + return wait_table + hash; +} + static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; @@ -267,6 +291,263 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, } EXPORT_SYMBOL_GPL(dax_do_io); +/* + * DAX radix tree locking + */ +struct exceptional_entry_key { + struct address_space *mapping; + unsigned long index; +}; + +struct wait_exceptional_entry_queue { + wait_queue_t wait; + struct exceptional_entry_key key; +}; + +static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, + int sync, void *keyp) +{ + struct exceptional_entry_key *key = keyp; + struct wait_exceptional_entry_queue *ewait = + container_of(wait, struct wait_exceptional_entry_queue, wait); + + if (key->mapping != ewait->key.mapping || + key->index != ewait->key.index) + return 0; + return autoremove_wake_function(wait, mode, sync, NULL); +} + +/* + * Check whether the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline int slot_locked(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + return entry & RADIX_DAX_ENTRY_LOCK; +} + +/* + * Mark the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline void *lock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry |= RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Mark the given slot is unlocked. The function must be called with + * mapping->tree_lock held + */ +static inline void *unlock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Lookup entry in radix tree, wait for it to become unlocked if it is + * exceptional entry and return it. The caller must call + * put_unlocked_mapping_entry() when he decided not to lock the entry or + * put_locked_mapping_entry() when he locked the entry and now wants to + * unlock it. + * + * The function must be called with mapping->tree_lock held. 
+ */ +static void *get_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void ***slotp) +{ + void *ret, **slot; + struct wait_exceptional_entry_queue ewait; + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + init_wait(&ewait.wait); + ewait.wait.func = wake_exceptional_entry_func; + ewait.key.mapping = mapping; + ewait.key.index = index; + + for (;;) { + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, + &slot); + if (!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot)) { + if (slotp) + *slotp = slot; + return ret; + } + prepare_to_wait_exclusive(wq, &ewait.wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mapping->tree_lock); + schedule(); + finish_wait(wq, &ewait.wait); + spin_lock_irq(&mapping->tree_lock); + } +} + +/* + * Find radix tree entry at given index. If it points to a page, return with + * the page locked. If it points to the exceptional entry, return with the + * radix tree entry locked. If the radix tree doesn't contain given index, + * create empty exceptional entry for the index and return with it locked. + * + * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For + * persistent memory the benefit is doubtful. We can add that later if we can + * show it helps. + */ +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + +restart: + spin_lock_irq(&mapping->tree_lock); + ret = get_unlocked_mapping_entry(mapping, index, &slot); + /* No entry for given index? Make sure radix tree is big enough. */ + if (!ret) { + int err; + + spin_unlock_irq(&mapping->tree_lock); + err = radix_tree_preload( + mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); + if (err) + return ERR_PTR(err); + ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK); + spin_lock_irq(&mapping->tree_lock); + err = radix_tree_insert(&mapping->page_tree, index, ret); + radix_tree_preload_end(); + if (err) { + spin_unlock_irq(&mapping->tree_lock); + /* Someone already created the entry? */ + if (err == -EEXIST) + goto restart; + return ERR_PTR(err); + } + /* Good, we have inserted empty locked entry into the tree. */ + mapping->nrexceptional++; + spin_unlock_irq(&mapping->tree_lock); + return ret; + } + /* Normal page in radix tree? */ + if (!radix_tree_exceptional_entry(ret)) { + struct page *page = ret; + + get_page(page); + spin_unlock_irq(&mapping->tree_lock); + lock_page(page); + /* Page got truncated? Retry... */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto restart; + } + return page; + } + ret = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, bool wake_all) +{ + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + /* + * Checking for locked entry and prepare_to_wait_exclusive() happens + * under mapping->tree_lock, ditto for entry handling in our callers. + * So at this point all tasks that could have seen our entry locked + * must be in the waitqueue and the following check will see them. + */ + if (waitqueue_active(wq)) { + struct exceptional_entry_key key; + + key.mapping = mapping; + key.index = index; + __wake_up(wq, TASK_NORMAL, wake_all ? 
0 : 1, &key); + } +} + +static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + + spin_lock_irq(&mapping->tree_lock); + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +static void put_locked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) { + unlock_page(entry); + put_page(entry); + } else { + unlock_mapping_entry(mapping, index); + } +} + +/* + * Called when we are done with radix tree entry we looked up via + * get_unlocked_mapping_entry() and which we didn't lock in the end. + */ +static void put_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) + return; + + /* We have to wake up next waiter for the radix tree entry lock */ + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +/* + * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree + * entry to get unlocked before deleting it. + */ +int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *entry; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + /* + * This gets called from truncate / punch_hole path. As such, the caller + * must hold locks protecting against concurrent modifications of the + * radix tree (usually fs-private i_mmap_sem for writing). Since the + * caller has seen exceptional entry for this index, we better find it + * at that index as well... + */ + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) { + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, true); + + return 1; +} + /* * The user has performed a load from a hole in the file. Allocating * a new page in the file would cause excessive storage usage for @@ -275,15 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io); * otherwise it will simply fall out of the page cache under memory * pressure without ever having been dirtied. */ -static int dax_load_hole(struct address_space *mapping, struct page *page, - struct vm_fault *vmf) +static int dax_load_hole(struct address_space *mapping, void *entry, + struct vm_fault *vmf) { - if (!page) - page = find_or_create_page(mapping, vmf->pgoff, - GFP_KERNEL | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; + struct page *page; + + /* Hole page already exists? Return it... 
*/ + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; + } + /* This will replace locked radix tree entry with a hole page */ + page = find_or_create_page(mapping, vmf->pgoff, + vmf->gfp_mask | __GFP_ZERO); + if (!page) { + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + return VM_FAULT_OOM; + } vmf->page = page; return VM_FAULT_LOCKED; } @@ -307,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } -#define NO_SECTOR -1 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) -static int dax_radix_entry(struct address_space *mapping, pgoff_t index, - sector_t sector, bool pmd_entry, bool dirty) +static void *dax_insert_mapping_entry(struct address_space *mapping, + struct vm_fault *vmf, + void *entry, sector_t sector) { struct radix_tree_root *page_tree = &mapping->page_tree; - pgoff_t pmd_index = DAX_PMD_INDEX(index); - int type, error = 0; - void *entry; + int error = 0; + bool hole_fill = false; + void *new_entry; + pgoff_t index = vmf->pgoff; - WARN_ON_ONCE(pmd_entry && !dirty); - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - spin_lock_irq(&mapping->tree_lock); - - entry = radix_tree_lookup(page_tree, pmd_index); - if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { - index = pmd_index; - goto dirty; + /* Replacing hole page with block mapping? */ + if (!radix_tree_exceptional_entry(entry)) { + hole_fill = true; + /* + * Unmap the page now before we remove it from page cache below. + * The page is locked so it cannot be faulted in again. + */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); + error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); + if (error) + return ERR_PTR(error); } - entry = radix_tree_lookup(page_tree, index); - if (entry) { - type = RADIX_DAX_TYPE(entry); - if (WARN_ON_ONCE(type != RADIX_DAX_PTE && - type != RADIX_DAX_PMD)) { - error = -EIO; + spin_lock_irq(&mapping->tree_lock); + new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | + RADIX_DAX_ENTRY_LOCK); + if (hole_fill) { + __delete_from_page_cache(entry, NULL); + /* Drop pagecache reference */ + put_page(entry); + error = radix_tree_insert(page_tree, index, new_entry); + if (error) { + new_entry = ERR_PTR(error); goto unlock; } + mapping->nrexceptional++; + } else { + void **slot; + void *ret; - if (!pmd_entry || type == RADIX_DAX_PMD) - goto dirty; - - /* - * We only insert dirty PMD entries into the radix tree. This - * means we don't need to worry about removing a dirty PTE - * entry and inserting a clean PMD entry, thus reducing the - * range we would flush with a follow-up fsync/msync call. - */ - radix_tree_delete(&mapping->page_tree, index); - mapping->nrexceptional--; - } - - if (sector == NO_SECTOR) { - /* - * This can happen during correct operation if our pfn_mkwrite - * fault raced against a hole punch operation. If this - * happens the pte that was hole punched will have been - * unmapped and the radix tree entry will have been removed by - * the time we are called, but the call will still happen. We - * will return all the way up to wp_pfn_shared(), where the - * pte_same() check will fail, eventually causing page fault - * to be retried by the CPU. 
- */ - goto unlock; + ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + WARN_ON_ONCE(ret != entry); + radix_tree_replace_slot(slot, new_entry); } - - error = radix_tree_insert(page_tree, index, - RADIX_DAX_ENTRY(sector, pmd_entry)); - if (error) - goto unlock; - - mapping->nrexceptional++; - dirty: - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); unlock: spin_unlock_irq(&mapping->tree_lock); - return error; + if (hole_fill) { + radix_tree_preload_end(); + /* + * We don't need hole page anymore, it has been replaced with + * locked radix tree entry now. + */ + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(entry); + unlock_page(entry); + put_page(entry); + } + return new_entry; } static int dax_writeback_one(struct block_device *bdev, @@ -503,17 +788,19 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, +static int dax_insert_mapping(struct address_space *mapping, + struct buffer_head *bh, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; - struct address_space *mapping = inode->i_mapping; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { - .sector = to_sector(bh, inode), + .sector = to_sector(bh, mapping->host), .size = bh->b_size, }; int error; + void *ret; + void *entry = *entryp; i_mmap_lock_read(mapping); @@ -523,16 +810,16 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, } dax_unmap_atomic(bdev, &dax); - error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, - vmf->flags & FAULT_FLAG_WRITE); - if (error) + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); + if (IS_ERR(ret)) { + error = PTR_ERR(ret); goto out; + } + *entryp = ret; error = vm_insert_mixed(vma, vaddr, dax.pfn); - out: i_mmap_unlock_read(mapping); - return error; } @@ -552,7 +839,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - struct page *page; + void *entry; struct buffer_head bh; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; @@ -561,6 +848,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, int error; int major = 0; + /* + * Check whether offset isn't beyond end of file now. Caller is supposed + * to hold locks serializing us with truncate / punch hole so this is + * a reliable test. + */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; @@ -570,40 +862,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; - repeat: - page = find_get_page(mapping, vmf->pgoff); - if (page) { - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { - put_page(page); - return VM_FAULT_RETRY; - } - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto repeat; - } + entry = grab_mapping_entry(mapping, vmf->pgoff); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto out; } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? 
*/ if (error) - goto unlock_page; - - if (!buffer_mapped(&bh) && !vmf->cow_page) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; - if (error) - goto unlock_page; - } else { - return dax_load_hole(mapping, page, vmf); - } - } + goto unlock_entry; if (vmf->cow_page) { struct page *new_page = vmf->cow_page; @@ -612,30 +881,37 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, else clear_user_highpage(new_page, vaddr); if (error) - goto unlock_page; - vmf->page = page; - if (!page) + goto unlock_entry; + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + } else { + unlock_mapping_entry(mapping, vmf->pgoff); i_mmap_lock_read(mapping); + vmf->page = NULL; + } return VM_FAULT_LOCKED; } - /* Check we didn't race with a read fault installing a new page */ - if (!page && major) - page = find_lock_page(mapping, vmf->pgoff); - - if (page) { - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); - delete_from_page_cache(page); - unlock_page(page); - put_page(page); - page = NULL; + if (!buffer_mapped(&bh)) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_entry; + } else { + return dax_load_hole(mapping, entry, vmf); + } } /* Filesystem should not return unwritten buffers to us! */ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - error = dax_insert_mapping(inode, &bh, vma, vmf); - + error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; @@ -643,13 +919,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if ((error < 0) && (error != -EBUSY)) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; - - unlock_page: - if (page) { - unlock_page(page); - put_page(page); - } - goto out; } EXPORT_SYMBOL(__dax_fault); @@ -718,7 +987,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, struct block_device *bdev; pgoff_t size, pgoff; sector_t block; - int error, result = 0; + int result = 0; bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ @@ -865,13 +1134,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, * the write to insert a dirty entry. */ if (write) { - error = dax_radix_entry(mapping, pgoff, dax.sector, - true, true); - if (error) { - dax_pmd_dbg(&bh, address, - "PMD radix insertion failed"); - goto fallback; - } + /* + * We should insert radix-tree entry and dirty it here. + * For now this is broken... + */ } dev_dbg(part_to_dev(bdev->bd_part), @@ -931,23 +1197,18 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; - int error; - - /* - * We pass NO_SECTOR to dax_radix_entry() because we expect that a - * RADIX_DAX_PTE entry already exists in the radix tree from a - * previous call to __dax_fault(). We just want to look up that PTE - * entry using vmf->pgoff and make sure the dirty tag is set. This - * saves us from having to make a call to get_block() here to look - * up the sector. 
- */ - error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, - true); + struct address_space *mapping = file->f_mapping; + void *entry; + pgoff_t index = vmf->pgoff; - if (error == -ENOMEM) - return VM_FAULT_OOM; - if (error) - return VM_FAULT_SIGBUS; + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (!entry || !radix_tree_exceptional_entry(entry)) + goto out; + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + put_unlocked_mapping_entry(mapping, index, entry); +out: + spin_unlock_irq(&mapping->tree_lock); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); diff --git a/include/linux/dax.h b/include/linux/dax.h index aa148937bb3f..756625c6d0dd 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -15,6 +15,9 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); +void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, bool wake_all); #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); diff --git a/mm/filemap.c b/mm/filemap.c index dfe55c2cfb34..7b9a4b180cae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -160,13 +160,15 @@ static void page_cache_tree_delete(struct address_space *mapping, return; /* - * Track node that only contains shadow entries. + * Track node that only contains shadow entries. DAX mappings contain + * no shadow entries and may contain other exceptional entries so skip + * those. * * Avoid acquiring the list_lru lock if already tracked. The * list_empty() test is safe as node->private_list is * protected by mapping->tree_lock. */ - if (!workingset_node_pages(node) && + if (!dax_mapping(mapping) && !workingset_node_pages(node) && list_empty(&node->private_list)) { node->private_data = mapping; list_lru_add(&workingset_shadow_nodes, &node->private_list); @@ -611,6 +613,9 @@ static int page_cache_tree_insert(struct address_space *mapping, /* DAX accounts exceptional entries as normal pages */ if (node) workingset_node_pages_dec(node); + /* Wakeup waiters for exceptional entry lock */ + dax_wake_mapping_entry_waiter(mapping, page->index, + false); } } radix_tree_replace_slot(slot, page); diff --git a/mm/truncate.c b/mm/truncate.c index b00272810871..4064f8f53daa 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping, if (shmem_mapping(mapping)) return; - spin_lock_irq(&mapping->tree_lock); - if (dax_mapping(mapping)) { - if (radix_tree_delete_item(&mapping->page_tree, index, entry)) - mapping->nrexceptional--; - } else { - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, - &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - radix_tree_replace_slot(slot, NULL); - mapping->nrexceptional--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. 
- * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); + dax_delete_mapping_entry(mapping, index); + return; } + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, + &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrexceptional--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); } -- cgit v1.2.3 From bc2466e4257369d0ebee2b6265070d323343fa72 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:19 +0200 Subject: dax: Use radix tree entry lock to protect cow faults When doing cow faults, we cannot directly fill in PTE as we do for other faults as we rely on generic code to do proper accounting of the cowed page. We also have no page to lock to protect against races with truncate as other faults have and we need the protection to extend until the moment generic code inserts cowed page into PTE thus at that point we have no protection of fs-specific i_mmap_sem. So far we relied on using i_mmap_lock for the protection however that is completely special to cow faults. To make fault locking more uniform use DAX entry lock instead. 
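The generic fault path change can be sketched as follows (example_finish_cow() is an invented wrapper; the branch mirrors what do_cow_fault() does after this patch):

/*
 * Release whatever protection the ->fault handler returned with:
 * a locked page in the common case, or a locked DAX radix tree
 * entry when the handler returned VM_FAULT_DAX_LOCKED.
 */
static void example_finish_cow(struct vm_area_struct *vma, pgoff_t pgoff,
                               struct page *fault_page, int ret)
{
        if (!(ret & VM_FAULT_DAX_LOCKED)) {
                unlock_page(fault_page);
                put_page(fault_page);
        } else {
                dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
        }
}

Either way the protection against truncate holds until the cowed page has been inserted into the page tables.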
Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- fs/dax.c | 12 +++++------- include/linux/dax.h | 7 +++++++ include/linux/mm.h | 7 +++++++ mm/memory.c | 38 ++++++++++++++++++-------------------- 4 files changed, 37 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index f43c3d806fb6..be74635e05a6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -478,7 +478,7 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, } } -static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) { void *ret, **slot; @@ -501,7 +501,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, unlock_page(entry); put_page(entry); } else { - unlock_mapping_entry(mapping, index); + dax_unlock_mapping_entry(mapping, index); } } @@ -884,12 +884,10 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, goto unlock_entry; if (!radix_tree_exceptional_entry(entry)) { vmf->page = entry; - } else { - unlock_mapping_entry(mapping, vmf->pgoff); - i_mmap_lock_read(mapping); - vmf->page = NULL; + return VM_FAULT_LOCKED; } - return VM_FAULT_LOCKED; + vmf->entry = entry; + return VM_FAULT_DAX_LOCKED; } if (!buffer_mapped(&bh)) { diff --git a/include/linux/dax.h b/include/linux/dax.h index 756625c6d0dd..7bf12277c006 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -21,6 +21,7 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index); int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length); #else @@ -29,6 +30,12 @@ static inline struct page *read_dax_sector(struct block_device *bdev, { return ERR_PTR(-ENXIO); } +/* Shouldn't ever be called when dax is disabled. */ +static inline void dax_unlock_mapping_entry(struct address_space *mapping, + pgoff_t index) +{ + BUG(); +} static inline int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length) { diff --git a/include/linux/mm.h b/include/linux/mm.h index a55e5be0894f..0ef9dc720ec3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -299,6 +299,12 @@ struct vm_fault { * is set (which is also implied by * VM_FAULT_ERROR). */ + void *entry; /* ->fault handler can alternatively + * return locked DAX entry. In that + * case handler should return + * VM_FAULT_DAX_LOCKED and fill in + * entry here. 
+ */ /* for ->map_pages() only */ pgoff_t max_pgoff; /* map pages for offset from pgoff till * max_pgoff inclusive */ @@ -1084,6 +1090,7 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ +#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ diff --git a/mm/memory.c b/mm/memory.c index 93897f23cc11..f09cdb8d48fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -2785,7 +2786,8 @@ oom: */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, pgoff_t pgoff, unsigned int flags, - struct page *cow_page, struct page **page) + struct page *cow_page, struct page **page, + void **entry) { struct vm_fault vmf; int ret; @@ -2800,8 +2802,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - if (!vmf.page) - goto out; + if (ret & VM_FAULT_DAX_LOCKED) { + *entry = vmf.entry; + return ret; + } if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) @@ -2815,7 +2819,6 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); - out: *page = vmf.page; return ret; } @@ -2987,7 +2990,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3010,6 +3013,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { struct page *fault_page, *new_page; + void *fault_entry; struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; @@ -3027,26 +3031,24 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } - ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page, + &fault_entry); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; - if (fault_page) + if (!(ret & VM_FAULT_DAX_LOCKED)) copy_user_highpage(new_page, fault_page, address, vma); __SetPageUptodate(new_page); pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); - if (fault_page) { + if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); } else { - /* - * The fault handler has no page to lock, so it holds - * i_mmap_lock for read to protect against truncate. 
- */ - i_mmap_unlock_read(vma->vm_file->f_mapping); + dax_unlock_mapping_entry(vma->vm_file->f_mapping, + pgoff); } goto uncharge_out; } @@ -3054,15 +3056,11 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); - if (fault_page) { + if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); } else { - /* - * The fault handler has no page to lock, so it holds - * i_mmap_lock for read to protect against truncate. - */ - i_mmap_unlock_read(vma->vm_file->f_mapping); + dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); } return ret; uncharge_out: @@ -3082,7 +3080,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, int dirtied = 0; int ret, tmp; - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; -- cgit v1.2.3 From 35e481761cdc688dbee0ef552a13f49af8eba6cc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 19 May 2016 17:08:59 -0700 Subject: fsnotify: avoid spurious EMFILE errors from inotify_init() Inotify instance is destroyed when all references to it are dropped. That not only means that the corresponding file descriptor needs to be closed but also that all corresponding instance marks are freed (as each mark holds a reference to the inotify instance). However marks are freed only after SRCU period ends which can take some time and thus if user rapidly creates and frees inotify instances, number of existing inotify instances can exceed max_user_instances limit although from user point of view there is always at most one existing instance. Thus inotify_init() returns EMFILE error which is hard to justify from user point of view. This problem is exposed by LTP inotify06 testcase on some machines. We fix the problem by making sure all group marks are properly freed while destroying inotify instance. We wait for SRCU period to end in that path anyway since we have to make sure there is no event being added to the instance while we are tearing down the instance. So it takes only some plumbing to allow for marks to be destroyed in that path as well and not from a dedicated work item. 
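The resulting teardown order, condensed from the fsnotify_destroy_group() hunk below (a summary of the patch, not new code):

        /* detach all marks for this group onto destroy_list */
        fsnotify_detach_group_marks(group);

        /* wait out the fsnotify_mark_srcu period and free those marks */
        fsnotify_mark_destroy_list();

        /* no event can still be in flight, so flushing is reliable now */
        fsnotify_flush_notify(group);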
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jan Kara Reported-by: Xiaoguang Wang Tested-by: Xiaoguang Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fsnotify.h | 7 ++++ fs/notify/group.c | 17 ++++++--- fs/notify/mark.c | 78 +++++++++++++++++++++++++++++++--------- include/linux/fsnotify_backend.h | 2 -- 4 files changed, 81 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index b44c68a857e7..0a3bc2cf192c 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -56,6 +56,13 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, &mnt->mnt_root->d_lock); } +/* prepare for freeing all marks associated with given group */ +extern void fsnotify_detach_group_marks(struct fsnotify_group *group); +/* + * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list + */ +extern void fsnotify_mark_destroy_list(void); + /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. diff --git a/fs/notify/group.c b/fs/notify/group.c index d16b62cb2854..3e2dd85be5dd 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_destroy_group(struct fsnotify_group *group) { - /* clear all inode marks for this group */ - fsnotify_clear_marks_by_group(group); + /* clear all inode marks for this group, attach them to destroy_list */ + fsnotify_detach_group_marks(group); - synchronize_srcu(&fsnotify_mark_srcu); + /* + * Wait for fsnotify_mark_srcu period to end and free all marks in + * destroy_list + */ + fsnotify_mark_destroy_list(); - /* clear the notification queue of all events */ + /* + * Since we have waited for fsnotify_mark_srcu in + * fsnotify_mark_destroy_list() there can be no outstanding event + * notification against this group. So clearing the notification queue + * of all events is reliable now. + */ fsnotify_flush_notify(group); /* diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 7115c5d7d373..d3fea0bd89e2 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); -static void fsnotify_mark_destroy(struct work_struct *work); -static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy); +static void fsnotify_mark_destroy_workfn(struct work_struct *work); +static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { @@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) } /* - * Free fsnotify mark. The freeing is actually happening from a kthread which - * first waits for srcu period end. Caller must have a reference to the mark - * or be protected by fsnotify_mark_srcu. + * Prepare mark for freeing and add it to the list of marks prepared for + * freeing. The actual freeing must happen after SRCU period ends and the + * caller is responsible for this. + * + * The function returns true if the mark was added to the list of marks for + * freeing. The function returns false if someone else has already called + * __fsnotify_free_mark() for the mark. 
*/ -void fsnotify_free_mark(struct fsnotify_mark *mark) +static bool __fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; @@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); - return; + return false; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - /* * Some groups like to know that marks are being freed. This is a * callback to the group function to let it know that this mark @@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); + + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + + return true; +} + +/* + * Free fsnotify mark. The freeing is actually happening from a workqueue which + * first waits for srcu period end. Caller must have a reference to the mark + * or be protected by fsnotify_mark_srcu. + */ +void fsnotify_free_mark(struct fsnotify_mark *mark) +{ + if (__fsnotify_free_mark(mark)) { + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); + } } void fsnotify_destroy_mark(struct fsnotify_mark *mark, @@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, } /* - * Given a group, destroy all of the marks associated with that group. + * Given a group, prepare for freeing all the marks associated with that group. + * The marks are attached to the list of marks prepared for destruction, the + * caller is responsible for freeing marks in that list after SRCU period has + * ended. */ -void fsnotify_clear_marks_by_group(struct fsnotify_group *group) +void fsnotify_detach_group_marks(struct fsnotify_group *group) { - fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1); + struct fsnotify_mark *mark; + + while (1) { + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + if (list_empty(&group->marks_list)) { + mutex_unlock(&group->mark_mutex); + break; + } + mark = list_first_entry(&group->marks_list, + struct fsnotify_mark, g_list); + fsnotify_get_mark(mark); + fsnotify_detach_mark(mark); + mutex_unlock(&group->mark_mutex); + __fsnotify_free_mark(mark); + fsnotify_put_mark(mark); + } } void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) @@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, mark->free_mark = free_mark; } -static void fsnotify_mark_destroy(struct work_struct *work) +/* + * Destroy all marks in destroy_list, waits for SRCU period to finish before + * actually freeing marks. 
+ */ +void fsnotify_mark_destroy_list(void) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; @@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work) fsnotify_put_mark(mark); } } + +static void fsnotify_mark_destroy_workfn(struct work_struct *work) +{ + fsnotify_mark_destroy_list(); +} diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 1259e53d9296..29f917517299 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -359,8 +359,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); -/* run all the marks in a group, and flag them to be freed */ -extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_unmount_inodes(struct super_block *sb); -- cgit v1.2.3 From bc2c53e5f1a2bae69ae50ce3a592633da7fcf6d9 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:02 -0700 Subject: time: add missing implementation for timespec64_add_safe() timespec64_add_safe() has been defined in time64.h for 64 bit systems. But, 32 bit systems only have an extern function prototype defined. Provide a definition for the above function. The function will be necessary as part of y2038 changes. struct timespec is not y2038 safe. All references to timespec will be replaced by struct timespec64. The function is meant to be a replacement for timespec_add_safe(). The implementation is similar to timespec_add_safe(). Link: http://lkml.kernel.org/r/1461947989-21926-2-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Cc: Thomas Gleixner Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/time64.h | 4 +--- kernel/time/time.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index 367d5af899e8..1778937221bf 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -136,13 +136,11 @@ extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 n /* * timespec64_add_safe assumes both values are positive and checks for - * overflow. It will return TIME_T_MAX if the returned value would be - * smaller then either of the arguments. + * overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); - static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { diff --git a/kernel/time/time.c b/kernel/time/time.c index a4064b612066..cb1f83eb5599 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -769,3 +769,28 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } + +#if __BITS_PER_LONG != 64 + +/* + * Add two timespec64 values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0). + * And, each timespec64 is in normalized form. 
+ */ +struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs) +{ + struct timespec64 res; + + set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { + res.tv_sec = TIME64_MAX; + res.tv_nsec = 0; + } + + return res; +} + +#endif -- cgit v1.2.3 From 766b9f928bd5b9b185d986d40355d1f143484136 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:05 -0700 Subject: fs: poll/select/recvmmsg: use timespec64 for timeout events struct timespec is not y2038 safe. Even though timespec might be sufficient to represent timeouts, use struct timespec64 here as the plan is to get rid of all timespec references in the kernel. The patch transitions the common functions: poll_select_set_timeout() and select_estimate_accuracy() to use timespec64. And, all the syscalls that use these functions are transitioned in the same patch. The restart block parameters for poll use monotonic time. Use timespec64 here as well to assign the timeout value. This parameter in the restart block need not change because it only holds the monotonic timestamp at which the timeout should occur; an unsigned long is big enough for this timestamp. The system call interfaces will be handled in a separate series. Compat interfaces need not change as timespec64 is an alias to struct timespec on a 64 bit system. Link: http://lkml.kernel.org/r/1461947989-21926-3-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Acked-by: David S. Miller Cc: Alexander Viro Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 12 +++++----- fs/select.c | 67 +++++++++++++++++++++++++++++----------------------- include/linux/poll.h | 11 +++++---- net/socket.c | 8 ++++--- 4 files changed, 54 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8a74a2a52e0f..10db91218933 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1583,15 +1583,15 @@ static int ep_send_events(struct eventpoll *ep, return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); } -static inline struct timespec ep_set_mstimeout(long ms) +static inline struct timespec64 ep_set_mstimeout(long ms) { - struct timespec now, ts = { + struct timespec64 now, ts = { .tv_sec = ms / MSEC_PER_SEC, .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), }; - ktime_get_ts(&now); - return timespec_add_safe(now, ts); + ktime_get_ts64(&now); + return timespec64_add_safe(now, ts); } /** @@ -1621,11 +1621,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, ktime_t expires, *to = NULL; if (timeout > 0) { - struct timespec end_time = ep_set_mstimeout(timeout); + struct timespec64 end_time = ep_set_mstimeout(timeout); slack = select_estimate_accuracy(&end_time); to = &expires; - *to = timespec_to_ktime(end_time); + *to = timespec64_to_ktime(end_time); } else if (timeout == 0) { /* * Avoid the unnecessary trip to the wait queue loop, if the diff --git a/fs/select.c b/fs/select.c index 869293988c2a..8ed9da50896a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -47,7 +47,7 @@ #define MAX_SLACK (100 * NSEC_PER_MSEC) -static long __estimate_accuracy(struct timespec *tv) +static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; @@ -70,10 +70,10 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -u64
select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; - struct timespec now; + struct timespec64 now; /* * Realtime tasks get a slack of 0 for obvious reasons. @@ -82,8 +82,8 @@ u64 select_estimate_accuracy(struct timespec *tv) if (rt_task(current)) return 0; - ktime_get_ts(&now); - now = timespec_sub(*tv, now); + ktime_get_ts64(&now); + now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < current->timer_slack_ns) return current->timer_slack_ns; @@ -260,7 +260,7 @@ EXPORT_SYMBOL(poll_schedule_timeout); /** * poll_select_set_timeout - helper function to setup the timeout value - * @to: pointer to timespec variable for the final timeout + * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * @@ -269,26 +269,28 @@ EXPORT_SYMBOL(poll_schedule_timeout); * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. */ -int poll_select_set_timeout(struct timespec *to, long sec, long nsec) +int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { - struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec}; + struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; - if (!timespec_valid(&ts)) + if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { - ktime_get_ts(to); - *to = timespec_add_safe(*to, ts); + ktime_get_ts64(to); + *to = timespec64_add_safe(*to, ts); } return 0; } -static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, +static int poll_select_copy_remaining(struct timespec64 *end_time, + void __user *p, int timeval, int ret) { + struct timespec64 rts64; struct timespec rts; struct timeval rtv; @@ -302,16 +304,18 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts(&rts); - rts = timespec_sub(*end_time, rts); - if (rts.tv_sec < 0) - rts.tv_sec = rts.tv_nsec = 0; + ktime_get_ts64(&rts64); + rts64 = timespec64_sub(*end_time, rts64); + if (rts64.tv_sec < 0) + rts64.tv_sec = rts64.tv_nsec = 0; + + rts = timespec64_to_timespec(rts64); if (timeval) { if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); - rtv.tv_sec = rts.tv_sec; - rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; + rtv.tv_sec = rts64.tv_sec; + rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; @@ -396,7 +400,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -int do_select(int n, fd_set_bits *fds, struct timespec *end_time) +int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@ -522,7 +526,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) * pointer to the expiry value. */ if (end_time && !to) { - expire = timespec_to_ktime(*end_time); + expire = timespec64_to_ktime(*end_time); to = &expire; } @@ -545,7 +549,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) * I'm trying ERESTARTNOHAND which restart only when you want to. 
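 *
 * A minimal caller sketch for the timespec64-based setup above (names and
 * values are illustrative, simplified from the select() path):
 *
 *	struct timespec64 end_time, *to = NULL;
 *
 *	if (tvp) {
 *		to = &end_time;
 *		if (poll_select_set_timeout(to, tv.tv_sec,
 *					    tv.tv_usec * NSEC_PER_USEC))
 *			return -EINVAL;
 *	}
 *	ret = core_sys_select(n, inp, outp, exp, to);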
*/ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec *end_time) + fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; @@ -622,7 +626,7 @@ out_nofds: SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct timeval __user *, tvp) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; struct timeval tv; int ret; @@ -648,15 +652,17 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, const sigset_t __user *sigmask, size_t sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts, end_time, *to = NULL; + struct timespec ts; + struct timespec64 ts64, end_time, *to = NULL; int ret; if (tsp) { if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; + ts64 = timespec_to_timespec64(ts); to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec)) return -EINVAL; } @@ -779,7 +785,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, - struct timespec *end_time) + struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; @@ -854,7 +860,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, * pointer to the expiry value. */ if (end_time && !to) { - expire = timespec_to_ktime(*end_time); + expire = timespec64_to_ktime(*end_time); to = &expire; } @@ -868,7 +874,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, sizeof(struct pollfd)) int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, - struct timespec *end_time) + struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount, len, size; @@ -936,7 +942,7 @@ static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; - struct timespec *to = NULL, end_time; + struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { @@ -957,7 +963,7 @@ static long do_restart_poll(struct restart_block *restart_block) SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { @@ -993,7 +999,8 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, size_t, sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts, end_time, *to = NULL; + struct timespec ts; + struct timespec64 end_time, *to = NULL; int ret; if (tsp) { diff --git a/include/linux/poll.h b/include/linux/poll.h index 9fb4f40d9a26..37b057b63b46 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); -extern u64 select_estimate_accuracy(struct timespec *tv); +extern u64 select_estimate_accuracy(struct timespec64 *tv); static inline int poll_schedule(struct poll_wqueues *pwq, int state) @@ -153,12 +153,13 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset) #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) -extern int do_select(int n, fd_set_bits *fds, struct timespec *end_time); +extern int 
do_select(int n, fd_set_bits *fds, struct timespec64 *end_time); extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds, - struct timespec *end_time); + struct timespec64 *end_time); extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec *end_time); + fd_set __user *exp, struct timespec64 *end_time); -extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec); +extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec, + long nsec); #endif /* _LINUX_POLL_H */ diff --git a/net/socket.c b/net/socket.c index e7793f5601ae..a1bd16106625 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2168,7 +2168,8 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; - struct timespec end_time; + struct timespec64 end_time; + struct timespec64 timeout64; if (timeout && poll_select_set_timeout(&end_time, timeout->tv_sec, @@ -2220,8 +2221,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, flags |= MSG_DONTWAIT; if (timeout) { - ktime_get_ts(timeout); - *timeout = timespec_sub(end_time, *timeout); + ktime_get_ts64(&timeout64); + *timeout = timespec64_to_timespec( + timespec64_sub(end_time, timeout64)); if (timeout->tv_sec < 0) { timeout->tv_sec = timeout->tv_nsec = 0; break; -- cgit v1.2.3 From 8e4f70e21877297577dce13cca97599a5864a91f Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:08 -0700 Subject: time: remove timespec_add_safe() All references to timespec_add_safe() now use timespec64_add_safe(). The plan is to replace struct timespec references with struct timespec64 throughout the kernel as timespec is not y2038 safe. Drop timespec_add_safe() and use timespec64_add_safe() for all architectures. Link: http://lkml.kernel.org/r/1461947989-21926-4-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Cc: Thomas Gleixner Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/time64.h | 15 +++++++-------- kernel/time/time.c | 4 ---- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index 1778937221bf..7e5d2fa9ac46 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -65,7 +65,6 @@ static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec * # define timespec64_equal timespec_equal # define timespec64_compare timespec_compare # define set_normalized_timespec64 set_normalized_timespec -# define timespec64_add_safe timespec_add_safe # define timespec64_add timespec_add # define timespec64_sub timespec_sub # define timespec64_valid timespec_valid @@ -134,13 +133,6 @@ static inline int timespec64_compare(const struct timespec64 *lhs, const struct extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); -/* - * timespec64_add_safe assumes both values are positive and checks for - * overflow. It will return TIME64_MAX in case of overflow. 
- */ -extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, - const struct timespec64 rhs); - static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { @@ -222,4 +214,11 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) #endif +/* + * timespec64_add_safe assumes both values are positive and checks for + * overflow. It will return TIME64_MAX in case of overflow. + */ +extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs); + #endif /* _LINUX_TIME64_H */ diff --git a/kernel/time/time.c b/kernel/time/time.c index cb1f83eb5599..667b9335f5d6 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -770,8 +770,6 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } -#if __BITS_PER_LONG != 64 - /* * Add two timespec64 values and do a safety check for overflow. * It's assumed that both values are valid (>= 0). @@ -792,5 +790,3 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } - -#endif -- cgit v1.2.3 From b1e4d9d82df8ab9097f80aa208c40eab6fc29858 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:20 -0700 Subject: debugobjects: make fixup functions return bool instead of int I am going to introduce the debugobjects infrastructure to the USB subsystem. But before this, I found the debugobjects code could be improved. This patchset makes the fixup functions return bool instead of int, because a fixup only needs to report success or failure; boolean is the 'real' type. This patch (of 7): The object debugging infrastructure core provides some fixup callbacks for the subsystems that use it. These callbacks are called from the debug code whenever a problem in debug_object_init is detected. The debugobjects core expects them to return 1 when the fixup was successful, otherwise 0. So the return type is effectively boolean. A bad thing is that debug_object_fixup uses the return value in an arithmetic operation, which obscures what the real return type is. Reading over the whole code, I found some places that use the return value incorrectly (see next patch). So why not use the bool type instead? Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/debugobjects.h | 15 ++++++++------- lib/debugobjects.c | 43 +++++++++++++++++++++---------------------- 2 files changed, 29 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 98ffcbd4888e..a899f10c9365 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -39,7 +39,8 @@ struct debug_obj { * @debug_hint: function returning address, which have associated * kernel symbol, to allow identify the object * @fixup_init: fixup function, which is called when the init check - fails.
All fixup functions must return true if fixup + * was successful, otherwise return false * @fixup_activate: fixup function, which is called when the activate check * fails * @fixup_destroy: fixup function, which is called when the destroy check @@ -51,12 +52,12 @@ struct debug_obj { */ struct debug_obj_descr { const char *name; - void *(*debug_hint) (void *addr); - int (*fixup_init) (void *addr, enum debug_obj_state state); - int (*fixup_activate) (void *addr, enum debug_obj_state state); - int (*fixup_destroy) (void *addr, enum debug_obj_state state); - int (*fixup_free) (void *addr, enum debug_obj_state state); - int (*fixup_assert_init)(void *addr, enum debug_obj_state state); + void *(*debug_hint)(void *addr); + bool (*fixup_init)(void *addr, enum debug_obj_state state); + bool (*fixup_activate)(void *addr, enum debug_obj_state state); + bool (*fixup_destroy)(void *addr, enum debug_obj_state state); + bool (*fixup_free)(void *addr, enum debug_obj_state state); + bool (*fixup_assert_init)(void *addr, enum debug_obj_state state); }; #ifdef CONFIG_DEBUG_OBJECTS diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 519b5a10fd70..a9cee165cf25 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -269,16 +269,15 @@ static void debug_print_object(struct debug_obj *obj, char *msg) * Try to repair the damage, so we have a better chance to get useful * debug output. */ -static int -debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state), +static bool +debug_object_fixup(bool (*fixup)(void *addr, enum debug_obj_state state), void * addr, enum debug_obj_state state) { - int fixed = 0; - - if (fixup) - fixed = fixup(addr, state); - debug_objects_fixups += fixed; - return fixed; + if (fixup && fixup(addr, state)) { + debug_objects_fixups++; + return true; + } + return false; } static void debug_object_is_on_stack(void *addr, int onstack) @@ -797,7 +796,7 @@ static __initdata struct debug_obj_descr descr_type_test; * fixup_init is called when: * - an active object is initialized */ -static int __init fixup_init(void *addr, enum debug_obj_state state) +static bool __init fixup_init(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -805,9 +804,9 @@ static int __init fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_init(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -816,7 +815,7 @@ static int __init fixup_init(void *addr, enum debug_obj_state state) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int __init fixup_activate(void *addr, enum debug_obj_state state) +static bool __init fixup_activate(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -825,17 +824,17 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state) if (obj->static_init == 1) { debug_object_init(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); - return 0; + return false; } - return 1; + return true; case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -843,7 +842,7 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state) * fixup_destroy is called when: * - an active object is destroyed */ -static int __init fixup_destroy(void *addr, enum debug_obj_state 
state) +static bool __init fixup_destroy(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -851,9 +850,9 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_destroy(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -861,7 +860,7 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state) /* * fixup_free is called when: * - an active object is freed */ -static int __init fixup_free(void *addr, enum debug_obj_state state) +static bool __init fixup_free(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -869,9 +868,9 @@ static int __init fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_free(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } -- cgit v1.2.3 From b9fdac7f660609abb157500e468d2165b3c9cf08 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:41 -0700 Subject: debugobjects: insulate non-fixup logic related to static obj from fixup callbacks When activating a static object we need to make sure that the object is tracked in the object tracker; if it is a non-static object, the activation is illegal. In the previous implementation, each subsystem needed to take care of this in its fixup callbacks. Actually we can put it into the debugobjects core, saving duplicated code and leaving *pure* fixup callbacks. To achieve this, a new callback "is_static_object" is introduced to let the type specific code decide whether an object is static or not. If it is, we track it in the object tracker; otherwise we emit a warning and invoke the fixup callback. This change has passed the debugobjects selftest, and I also did some testing with all debugobjects support enabled. Lastly, I have a concern about the fixups: can a fixup change an object that is in an incorrect state? Because 'addr' may not point to any valid object if a non-static object is not tracked, changing such an object can overwrite someone else's memory and cause unexpected behaviour. For example, timer_fixup_activate binds the timer to the stub function stub_timer.
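A minimal descriptor sketch with the new callback (all names below are
illustrative, not taken from the kernel):

	static bool foo_is_static_object(void *addr)
	{
		struct foo *obj = addr;

		return obj->static_init == 1;
	}

	static struct debug_obj_descr foo_debug_descr = {
		.name			= "foo",
		.is_static_object	= foo_is_static_object,
		.fixup_init		= foo_fixup_init,
	};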
Link: http://lkml.kernel.org/r/1462576157-14539-1-git-send-email-changbin.du@intel.com [changbin.du@intel.com: improve code comments where invoke the new is_static_object callback] Link: http://lkml.kernel.org/r/1462777431-8171-1-git-send-email-changbin.du@intel.com Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/debugobjects.h | 2 ++ kernel/rcu/update.c | 26 +++-------------------- kernel/time/hrtimer.c | 7 +------ kernel/time/timer.c | 43 +++++++++++++------------------------- kernel/workqueue.c | 42 ++++++++----------------------------- lib/debugobjects.c | 49 +++++++++++++++++++++++++++++--------------- 6 files changed, 60 insertions(+), 109 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index a899f10c9365..46056cb161fc 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -38,6 +38,7 @@ struct debug_obj { * @name: name of the object typee * @debug_hint: function returning address, which have associated * kernel symbol, to allow identify the object + * @is_static_object return true if the obj is static, otherwise return false * @fixup_init: fixup function, which is called when the init check * fails. All fixup functions must return true if fixup * was successful, otherwise return false @@ -53,6 +54,7 @@ struct debug_obj { struct debug_obj_descr { const char *name; void *(*debug_hint)(void *addr); + bool (*is_static_object)(void *addr); bool (*fixup_init)(void *addr, enum debug_obj_state state); bool (*fixup_activate)(void *addr, enum debug_obj_state state); bool (*fixup_destroy)(void *addr, enum debug_obj_state state); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a9df198eb22d..3e888cd5a594 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head) debug_object_free(head, &rcuhead_debug_descr); } -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - * Activation is performed internally by call_rcu(). - */ -static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +static bool rcuhead_is_static_object(void *addr) { - struct rcu_head *head = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. We just make sure that it is - * tracked in the object tracker. 
- */ - debug_object_init(head, &rcuhead_debug_descr); - debug_object_activate(head, &rcuhead_debug_descr); - return false; - default: - return true; - } + return true; } /** @@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", - .fixup_activate = rcuhead_fixup_activate, + .is_static_object = rcuhead_is_static_object, }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f962a58c0957..8c7392c4fdbd 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -351,16 +351,11 @@ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - WARN_ON_ONCE(1); - return false; - case ODEBUG_STATE_ACTIVE: WARN_ON(1); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index be33481a4da1..3a95f9728778 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -489,6 +489,14 @@ static void *timer_debug_hint(void *addr) return ((struct timer_list *) addr)->function; } +static bool timer_is_static_object(void *addr) +{ + struct timer_list *timer = addr; + + return (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC); +} + /* * fixup_init is called when: * - an active object is initialized @@ -516,30 +524,16 @@ static void stub_timer(unsigned long data) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (timer->entry.pprev == NULL && - timer->entry.next == TIMER_ENTRY_STATIC) { - debug_object_init(timer, &timer_debug_descr); - debug_object_activate(timer, &timer_debug_descr); - return false; - } else { - setup_timer(timer, stub_timer, 0); - return true; - } - return false; + setup_timer(timer, stub_timer, 0); + return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); @@ -577,18 +571,8 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.next == TIMER_ENTRY_STATIC) { - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
- */ - debug_object_init(timer, &timer_debug_descr); - return false; - } else { - setup_timer(timer, stub_timer, 0); - return true; - } + setup_timer(timer, stub_timer, 0); + return true; default: return false; } @@ -597,6 +581,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, + .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6751b18fd9ac..e1c0e996b5ae 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -433,6 +433,13 @@ static void *work_debug_hint(void *addr) return ((struct work_struct *) addr)->func; } +static bool work_is_static_object(void *addr) +{ + struct work_struct *work = addr; + + return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); +} + /* * fixup_init is called when: * - an active object is initialized @@ -451,39 +458,6 @@ static bool work_fixup_init(void *addr, enum debug_obj_state state) } } -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static bool work_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct work_struct *work = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The work struct was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { - debug_object_init(work, &work_debug_descr); - debug_object_activate(work, &work_debug_descr); - return false; - } - WARN_ON_ONCE(1); - return false; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return false; - } -} - /* * fixup_free is called when: * - an active object is freed @@ -505,8 +479,8 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, + .is_static_object = work_is_static_object, .fixup_init = work_fixup_init, - .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, }; diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2f07c8c697b8..a8e12601eb37 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -431,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * This happens when a static object is activated. We - * let the type specific code decide whether this is - * true or not. + * We are here when a static object is activated. We + * let the type specific code confirm whether this is + * true or not. if true, we just make sure that the + * static object is tracked in the object tracker. If + * not, this must be a bug, so we try to fix it up. */ - if (debug_object_fixup(descr->fixup_activate, addr, - ODEBUG_STATE_NOTAVAILABLE)) { + if (descr->is_static_object && descr->is_static_object(addr)) { + /* track this static object */ + debug_object_init(addr, descr); + debug_object_activate(addr, descr); + } else { debug_print_object(&o, "activate"); - return -EINVAL; + ret = debug_object_fixup(descr->fixup_activate, addr, + ODEBUG_STATE_NOTAVAILABLE); + return ret ? 
0 : -EINVAL; } return 0; } @@ -602,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * Maybe the object is static. Let the type specific - * code decide what to do. + * Maybe the object is static, and we let the type specific + * code confirm. Track this static object if true, else invoke + * fixup. */ - if (debug_object_fixup(descr->fixup_assert_init, addr, - ODEBUG_STATE_NOTAVAILABLE)) + if (descr->is_static_object && descr->is_static_object(addr)) { + /* Track this static object */ + debug_object_init(addr, descr); + } else { debug_print_object(&o, "assert_init"); + debug_object_fixup(descr->fixup_assert_init, addr, + ODEBUG_STATE_NOTAVAILABLE); + } return; } @@ -792,6 +805,13 @@ struct self_test { static __initdata struct debug_obj_descr descr_type_test; +static bool __init is_static_object(void *addr) +{ + struct self_test *obj = addr; + + return obj->static_init; +} + /* * fixup_init is called when: * - an active object is initialized @@ -813,7 +833,7 @@ static bool __init fixup_init(void *addr, enum debug_obj_state state) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool __init fixup_activate(void *addr, enum debug_obj_state state) { @@ -821,13 +841,7 @@ static bool __init fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (obj->static_init == 1) { - debug_object_init(obj, &descr_type_test); - debug_object_activate(obj, &descr_type_test); - return false; - } return true; - case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); @@ -916,6 +930,7 @@ out: static __initdata struct debug_obj_descr descr_type_test = { .name = "selftest", + .is_static_object = is_static_object, .fixup_init = fixup_init, .fixup_activate = fixup_activate, .fixup_destroy = fixup_destroy, -- cgit v1.2.3 From 815613da6a67c196d7458d0e6c278ea88e21933f Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Thu, 19 May 2016 17:09:56 -0700 Subject: kernel/padata.c: removed unused code By accident I stumbled across code that has never been used. This driver has EXPORT_SYMBOL functions, and the only user of the code is pcrypt.c, but this only uses a subset of the exported symbols. According to 'git log -G', the functions, padata_set_cpumasks, padata_add_cpu, and padata_remove_cpu have never been used since they were first introduced. This patch removes the unused code. On one 64 bit build, with CRYPTO_PCRYPT built in, the text is more than 4k smaller. kbuild_hp> size $KBUILD_OUTPUT/vmlinux text data bss dec hex filename 10566658 4678360 1122304 16367322 f9beda vmlinux 10561984 4678360 1122304 16362648 f9ac98 vmlinux On another config, 32 bit, the saving is about 0.5k bytes. kbuild_hp-x86> size $KBUILD_OUTPUT/vmlinux 6012005 2409513 2785280 11206798 ab008e vmlinux 6011491 2409513 2785280 11206284 aafe8c vmlinux Signed-off-by: Richard Cochran Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/padata.h | 5 ---- kernel/padata.c | 64 -------------------------------------------------- 2 files changed, 69 deletions(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index 438694650471..113ee626a4dc 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -175,11 +175,6 @@ extern int padata_do_parallel(struct padata_instance *pinst, extern void padata_do_serial(struct padata_priv *padata); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); -extern int padata_set_cpumasks(struct padata_instance *pinst, - cpumask_var_t pcpumask, - cpumask_var_t cbcpumask); -extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask); -extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask); extern int padata_start(struct padata_instance *pinst); extern void padata_stop(struct padata_instance *pinst); extern int padata_register_cpumask_notifier(struct padata_instance *pinst, diff --git a/kernel/padata.c b/kernel/padata.c index b38bea9c466a..67ddd4acde9d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -606,33 +606,6 @@ out_replace: return 0; } -/** - * padata_set_cpumasks - Set both parallel and serial cpumasks. The first - * one is used by parallel workers and the second one - * by the wokers doing serialization. - * - * @pinst: padata instance - * @pcpumask: the cpumask to use for parallel workers - * @cbcpumask: the cpumsak to use for serial workers - */ -int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, - cpumask_var_t cbcpumask) -{ - int err; - - mutex_lock(&pinst->lock); - get_online_cpus(); - - err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); - - put_online_cpus(); - mutex_unlock(&pinst->lock); - - return err; - -} -EXPORT_SYMBOL(padata_set_cpumasks); - /** * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value * equivalent to @cpumask. @@ -694,42 +667,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) return 0; } - /** - * padata_add_cpu - add a cpu to one or both(parallel and serial) - * padata cpumasks. - * - * @pinst: padata instance - * @cpu: cpu to add - * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. 
- * The @mask may be any combination of the following flags: - * PADATA_CPU_SERIAL - serial cpumask - * PADATA_CPU_PARALLEL - parallel cpumask - */ - -int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) -{ - int err; - - if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) - return -EINVAL; - - mutex_lock(&pinst->lock); - - get_online_cpus(); - if (mask & PADATA_CPU_SERIAL) - cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); - if (mask & PADATA_CPU_PARALLEL) - cpumask_set_cpu(cpu, pinst->cpumask.pcpu); - - err = __padata_add_cpu(pinst, cpu); - put_online_cpus(); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_add_cpu); - static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd = NULL; @@ -1091,7 +1028,6 @@ err_free_inst: err: return NULL; } -EXPORT_SYMBOL(padata_alloc); /** * padata_free - free a padata instance -- cgit v1.2.3 From c7ce4f60ac199fb3521c5fcd64da21cee801ec2b Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 19 May 2016 17:10:37 -0700 Subject: mm: SLAB freelist randomization Provides an optional config (CONFIG_SLAB_FREELIST_RANDOM) to randomize the SLAB freelist. The list is randomized during initialization of a new set of pages. The order on different freelist sizes is pre-computed at boot for performance. Each kmem_cache has its own randomized freelist. Before pre-computed lists are available, freelists are generated dynamically. This security feature reduces the predictability of the kernel SLAB allocator against heap overflows, rendering attacks much less stable. For example this attack against SLUB (also applicable against SLAB) would be affected: https://jon.oberheide.org/blog/2010/09/10/linux-kernel-can-slub-overflow/ Also, since v4.6 the freelist was moved to the end of the SLAB. It means a controllable heap is opened to new attacks not yet publicly discussed. A kernel heap overflow can be transformed into multiple use-after-frees. This feature makes this type of attack harder too. To generate entropy, we use get_random_bytes_arch because 0 bits of entropy is available in the boot stage. In the worst case this function will fall back to the get_random_bytes sub API. We also generate a random shift to offset the pre-computed freelist for each new set of pages. The config option name is not specific to the SLAB as this approach will be extended to other allocators like SLUB. Performance results highlighted no major changes:

Hackbench (running 90 10 times):
Before average: 0.0698
After average: 0.0663 (-5.01%)

slab_test 1 run on boot. The difference is only seen on the 2048 size test, the worst case scenario covered by freelist randomization. New slab pages are constantly being created on the 10000 allocations. Variance should be mainly due to getting new pages every few allocations.

Before:

Single thread testing
=====================
1. Kmalloc: Repeatedly allocate then free test
10000 times kmalloc(8) -> 99 cycles kfree -> 112 cycles
10000 times kmalloc(16) -> 109 cycles kfree -> 140 cycles
10000 times kmalloc(32) -> 129 cycles kfree -> 137 cycles
10000 times kmalloc(64) -> 141 cycles kfree -> 141 cycles
10000 times kmalloc(128) -> 152 cycles kfree -> 148 cycles
10000 times kmalloc(256) -> 195 cycles kfree -> 167 cycles
10000 times kmalloc(512) -> 257 cycles kfree -> 199 cycles
10000 times kmalloc(1024) -> 393 cycles kfree -> 251 cycles
10000 times kmalloc(2048) -> 649 cycles kfree -> 228 cycles
10000 times kmalloc(4096) -> 806 cycles kfree -> 370 cycles
10000 times kmalloc(8192) -> 814 cycles kfree -> 411 cycles
10000 times kmalloc(16384) -> 892 cycles kfree -> 455 cycles
2. Kmalloc: alloc/free test
10000 times kmalloc(8)/kfree -> 121 cycles
10000 times kmalloc(16)/kfree -> 121 cycles
10000 times kmalloc(32)/kfree -> 121 cycles
10000 times kmalloc(64)/kfree -> 121 cycles
10000 times kmalloc(128)/kfree -> 121 cycles
10000 times kmalloc(256)/kfree -> 119 cycles
10000 times kmalloc(512)/kfree -> 119 cycles
10000 times kmalloc(1024)/kfree -> 119 cycles
10000 times kmalloc(2048)/kfree -> 119 cycles
10000 times kmalloc(4096)/kfree -> 121 cycles
10000 times kmalloc(8192)/kfree -> 119 cycles
10000 times kmalloc(16384)/kfree -> 119 cycles

After:

Single thread testing
=====================
1. Kmalloc: Repeatedly allocate then free test
10000 times kmalloc(8) -> 130 cycles kfree -> 86 cycles
10000 times kmalloc(16) -> 118 cycles kfree -> 86 cycles
10000 times kmalloc(32) -> 121 cycles kfree -> 85 cycles
10000 times kmalloc(64) -> 176 cycles kfree -> 102 cycles
10000 times kmalloc(128) -> 178 cycles kfree -> 100 cycles
10000 times kmalloc(256) -> 205 cycles kfree -> 109 cycles
10000 times kmalloc(512) -> 262 cycles kfree -> 136 cycles
10000 times kmalloc(1024) -> 342 cycles kfree -> 157 cycles
10000 times kmalloc(2048) -> 701 cycles kfree -> 238 cycles
10000 times kmalloc(4096) -> 803 cycles kfree -> 364 cycles
10000 times kmalloc(8192) -> 835 cycles kfree -> 404 cycles
10000 times kmalloc(16384) -> 896 cycles kfree -> 441 cycles
2.
Kmalloc: alloc/free test
10000 times kmalloc(8)/kfree -> 121 cycles
10000 times kmalloc(16)/kfree -> 121 cycles
10000 times kmalloc(32)/kfree -> 123 cycles
10000 times kmalloc(64)/kfree -> 142 cycles
10000 times kmalloc(128)/kfree -> 121 cycles
10000 times kmalloc(256)/kfree -> 119 cycles
10000 times kmalloc(512)/kfree -> 119 cycles
10000 times kmalloc(1024)/kfree -> 119 cycles
10000 times kmalloc(2048)/kfree -> 119 cycles
10000 times kmalloc(4096)/kfree -> 119 cycles
10000 times kmalloc(8192)/kfree -> 119 cycles
10000 times kmalloc(16384)/kfree -> 119 cycles

[akpm@linux-foundation.org: propagate gfp_t into cache_random_seq_create()] Signed-off-by: Thomas Garnier Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Greg Thelen Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab_def.h | 4 ++ init/Kconfig | 9 +++ mm/slab.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 178 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 9edbbf352340..8694f7a5d92b 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -80,6 +80,10 @@ struct kmem_cache { struct kasan_cache kasan_info; #endif +#ifdef CONFIG_SLAB_FREELIST_RANDOM + void *random_seq; +#endif + struct kmem_cache_node *node[MAX_NUMNODES]; }; diff --git a/init/Kconfig b/init/Kconfig index 0dfd09d54c65..79a91a2c0444 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1742,6 +1742,15 @@ config SLOB endchoice +config SLAB_FREELIST_RANDOM + default n + depends on SLAB + bool "SLAB freelist randomization" + help + Randomizes the freelist order used on creating new SLABs. This + security feature reduces the predictability of the kernel slab + allocator against heap overflows. + config SLUB_CPU_PARTIAL default y depends on SLUB && SMP diff --git a/mm/slab.c b/mm/slab.c index 8133ebea77a4..1ee26a0d358f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1243,6 +1243,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) } } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list, + size_t count) +{ + size_t i; + unsigned int rand; + + for (i = 0; i < count; i++) + list[i] = i; + + /* Fisher-Yates shuffle */ + for (i = count - 1; i > 0; i--) { + rand = prandom_u32_state(state); + rand %= (i + 1); + swap(list[i], list[rand]); + } +} + +/* Create a random sequence per cache */ +static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + unsigned int seed, count = cachep->num; + struct rnd_state state; + + if (count < 2) + return 0; + + /* If it fails, we will just use the global lists */ + cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp); + if (!cachep->random_seq) + return -ENOMEM; + + /* Get best entropy at this stage */ + get_random_bytes_arch(&seed, sizeof(seed)); + prandom_seed_state(&state, seed); + + freelist_randomize(&state, cachep->random_seq, count); + return 0; +} + +/* Destroy the per-cache random freelist sequence */ +static void cache_random_seq_destroy(struct kmem_cache *cachep) +{ + kfree(cachep->random_seq); + cachep->random_seq = NULL; +} +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + + /* * Initialisation.
Called after the page allocator have been initialised and * before smp_init(). @@ -2374,6 +2429,8 @@ void __kmem_cache_release(struct kmem_cache *cachep) int i; struct kmem_cache_node *n; + cache_random_seq_destroy(cachep); + free_percpu(cachep->cpu_cache); /* NUMA: free the node structures */ @@ -2480,15 +2537,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) #endif } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Hold information during a freelist initialization */ +union freelist_init_state { + struct { + unsigned int pos; + freelist_idx_t *list; + unsigned int count; + unsigned int rand; + }; + struct rnd_state rnd_state; +}; + +/* + * Initialize the state based on the randomization methode available. + * return true if the pre-computed list is available, false otherwize. + */ +static bool freelist_state_initialize(union freelist_init_state *state, + struct kmem_cache *cachep, + unsigned int count) +{ + bool ret; + unsigned int rand; + + /* Use best entropy available to define a random shift */ + get_random_bytes_arch(&rand, sizeof(rand)); + + /* Use a random state if the pre-computed list is not available */ + if (!cachep->random_seq) { + prandom_seed_state(&state->rnd_state, rand); + ret = false; + } else { + state->list = cachep->random_seq; + state->count = count; + state->pos = 0; + state->rand = rand; + ret = true; + } + return ret; +} + +/* Get the next entry on the list and randomize it using a random shift */ +static freelist_idx_t next_random_slot(union freelist_init_state *state) +{ + return (state->list[state->pos++] + state->rand) % state->count; +} + +/* + * Shuffle the freelist initialization state based on pre-computed lists. + * return true if the list was successfully shuffled, false otherwise. + */ +static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) +{ + unsigned int objfreelist = 0, i, count = cachep->num; + union freelist_init_state state; + bool precomputed; + + if (count < 2) + return false; + + precomputed = freelist_state_initialize(&state, cachep, count); + + /* Take a random entry as the objfreelist */ + if (OBJFREELIST_SLAB(cachep)) { + if (!precomputed) + objfreelist = count - 1; + else + objfreelist = next_random_slot(&state); + page->freelist = index_to_obj(cachep, page, objfreelist) + + obj_offset(cachep); + count--; + } + + /* + * On early boot, generate the list dynamically. + * Later use a pre-computed list for speed. 
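+	 *
+	 * Illustrative trace (actual values depend on the seed): with
+	 * count == 4, random_seq == {2, 0, 3, 1} and a random shift of 1,
+	 * next_random_slot() yields 3, 1, 0, 2.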
+ */ + if (!precomputed) { + freelist_randomize(&state.rnd_state, page->freelist, count); + } else { + for (i = 0; i < count; i++) + set_free_obj(page, i, next_random_slot(&state)); + } + + if (OBJFREELIST_SLAB(cachep)) + set_free_obj(page, cachep->num - 1, objfreelist); + + return true; +} +#else +static inline bool shuffle_freelist(struct kmem_cache *cachep, + struct page *page) +{ + return false; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + static void cache_init_objs(struct kmem_cache *cachep, struct page *page) { int i; void *objp; + bool shuffled; cache_init_objs_debug(cachep, page); - if (OBJFREELIST_SLAB(cachep)) { + /* Try to randomize the freelist if enabled */ + shuffled = shuffle_freelist(cachep, page); + + if (!shuffled && OBJFREELIST_SLAB(cachep)) { page->freelist = index_to_obj(cachep, page, cachep->num - 1) + obj_offset(cachep); } @@ -2502,7 +2659,8 @@ static void cache_init_objs(struct kmem_cache *cachep, kasan_poison_object_data(cachep, objp); } - set_free_obj(page, i, i); + if (!shuffled) + set_free_obj(page, i, i); } } @@ -3841,6 +3999,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) int shared = 0; int batchcount = 0; + err = cache_random_seq_create(cachep, gfp); + if (err) + goto end; + if (!is_root_cache(cachep)) { struct kmem_cache *root = memcg_root_cache(cachep); limit = root->limit; @@ -3894,6 +4056,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) batchcount = (limit + 1) / 2; skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); +end: if (err) pr_err("enable_cpucache failed for %s, error %d\n", cachep->name, -err); -- cgit v1.2.3 From 0139aa7b7fa12ceef095d99dc36606a5b10ab83a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:49 -0700 Subject: mm: rename _count, field of the struct page, to _refcount Many developers already know that the field for the reference count of struct page is _count and of atomic type. They may try to handle it directly, and this could break the purpose of the page reference count tracepoint. To prevent direct _count modification, this patch renames it to _refcount and adds a warning message in the code. After that, developers who need to handle the reference count will find that the field should not be accessed directly. [akpm@linux-foundation.org: fix comments, per Vlastimil] [akpm@linux-foundation.org: Documentation/vm/transhuge.txt too] [sfr@canb.auug.org.au: sync ethernet driver changes] Signed-off-by: Joonsoo Kim Signed-off-by: Stephen Rothwell Cc: Vlastimil Babka Cc: Hugh Dickins Cc: Johannes Berg Cc: "David S.
Miller" Cc: Sunil Goutham Cc: Chris Metcalf Cc: Manish Chopra Cc: Yuval Mintz Cc: Tariq Toukan Cc: Saeed Mahameed Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 10 +++++----- arch/tile/mm/init.c | 2 +- drivers/block/aoe/aoecmd.c | 2 +- drivers/hwtracing/intel_th/msu.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 20 +++++++++---------- drivers/net/ethernet/qlogic/qede/qede_main.c | 4 ++-- fs/proc/page.c | 2 +- include/linux/mm.h | 2 +- include/linux/mm_types.h | 14 ++++++++----- include/linux/page_ref.h | 26 ++++++++++++------------- include/linux/pagemap.h | 8 ++++---- kernel/kexec_core.c | 2 +- mm/huge_memory.c | 4 ++-- mm/internal.h | 2 +- mm/page_alloc.c | 4 ++-- mm/slub.c | 4 ++-- mm/vmscan.c | 4 ++-- 17 files changed, 58 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index d9cb65cf5cfd..fb0e1f2a19cc 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -394,9 +394,9 @@ hugepage natively. Once finished you can drop the page table lock. Refcounting on THP is mostly consistent with refcounting on other compound pages: - - get_page()/put_page() and GUP operate in head page's ->_count. + - get_page()/put_page() and GUP operate in head page's ->_refcount. - - ->_count in tail pages is always zero: get_page_unless_zero() never + - ->_refcount in tail pages is always zero: get_page_unless_zero() never succeed on tail pages. - map/unmap of the pages with PTE entry increment/decrement ->_mapcount @@ -426,15 +426,15 @@ requests to split pinned huge page: it expects page count to be equal to sum of mapcount of all sub-pages plus one (split_huge_page caller must have reference for head page). -split_huge_page uses migration entries to stabilize page->_count and +split_huge_page uses migration entries to stabilize page->_refcount and page->_mapcount. We safe against physical memory scanners too: the only legitimate way scanner can get reference to a page is get_page_unless_zero(). -All tail pages has zero ->_count until atomic_add(). It prevent scanner +All tail pages has zero ->_refcount until atomic_add(). It prevent scanner from geting reference to tail page up to the point. After the atomic_add() -we don't care about ->_count value. We already known how many references +we don't care about ->_refcount value. We already known how many references with should uncharge from head page. For head page get_page_unless_zero() will succeed and we don't mind. It's diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index a0582b7f41d3..adce25462b0d 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -679,7 +679,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end) * Hacky direct set to avoid unnecessary * lock take/release for EVERY page here. */ - p->_count.counter = 0; + p->_refcount.counter = 0; p->_mapcount.counter = -1; } init_page_count(page); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 437b3a822f44..d597e432e195 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -861,7 +861,7 @@ rqbiocnt(struct request *r) * discussion. * * We cannot use get_page in the workaround, because it insists on a - * positive page count as a precondition. So we use _count directly. + * positive page count as a precondition. So we use _refcount directly. 
*/ static void bio_pageinc(struct bio *bio) diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index d9d6022c5aca..d2209147dc89 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1164,7 +1164,7 @@ static void msc_mmap_close(struct vm_area_struct *vma) if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex)) return; - /* drop page _counts */ + /* drop page _refcounts */ for (pg = 0; pg < msc->nr_pages; pg++) { struct page *page = msc_buffer_get_page(msc, pg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f3456798c596..bd947704b59c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -433,8 +433,8 @@ static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq, for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i))) goto err_unmap; - atomic_add(mlx5e_mpwqe_strides_per_page(rq), - &wi->umr.dma_info[i].page->_count); + page_ref_add(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq)); wi->skbs_frags[i] = 0; } @@ -452,8 +452,8 @@ err_unmap: while (--i >= 0) { dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE, PCI_DMA_FROMDEVICE); - atomic_sub(mlx5e_mpwqe_strides_per_page(rq), - &wi->umr.dma_info[i].page->_count); + page_ref_sub(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq)); put_page(wi->umr.dma_info[i].page); } dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE); @@ -477,8 +477,8 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq, for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE, PCI_DMA_FROMDEVICE); - atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i], - &wi->umr.dma_info[i].page->_count); + page_ref_sub(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]); put_page(wi->umr.dma_info[i].page); } dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE); @@ -527,8 +527,8 @@ static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq, */ split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER); for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { - atomic_add(mlx5e_mpwqe_strides_per_page(rq), - &wi->dma_info.page[i]._count); + page_ref_add(&wi->dma_info.page[i], + mlx5e_mpwqe_strides_per_page(rq)); wi->skbs_frags[i] = 0; } @@ -551,8 +551,8 @@ void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq, dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz, PCI_DMA_FROMDEVICE); for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { - atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i], - &wi->dma_info.page[i]._count); + page_ref_sub(&wi->dma_info.page[i], + mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]); put_page(&wi->dma_info.page[i]); } } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 3aabfc0adefe..73dd525fbf08 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1036,7 +1036,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev, /* Incr page ref count to reuse on allocation failure * so that it doesn't get freed while freeing SKB. */ - atomic_inc(¤t_bd->data->_count); + page_ref_inc(current_bd->data); goto out; } @@ -1487,7 +1487,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget) * freeing SKB. 
*/ - atomic_inc(&sw_rx_data->data->_count); + page_ref_inc(sw_rx_data->data); rxq->rx_alloc_errors++; qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num); diff --git a/fs/proc/page.c b/fs/proc/page.c index 712f1b9992cc..3ecd445e830d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page) /* - * Caveats on high order pages: page->_count will only be set + * Caveats on high order pages: page->_refcount will only be set * -1 on the head page; SLUB/SLQB do the same for PG_slab; * SLOB won't set PG_slab at all on compound pages. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 727f799757ab..1193a54ea2b3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -734,7 +734,7 @@ static inline void get_page(struct page *page) page = compound_head(page); /* * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. + * requires to already have an elevated page->_refcount. */ VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); page_ref_inc(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c2d75b4fa86c..1fda9c99ef95 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -73,9 +73,9 @@ struct page { unsigned long counters; #else /* - * Keep _count separate from slub cmpxchg_double data. - * As the rest of the double word is protected by - * slab_lock but _count is not. + * Keep _refcount separate from slub cmpxchg_double + * data. As the rest of the double word is protected by + * slab_lock but _refcount is not. */ unsigned counters; #endif @@ -97,7 +97,11 @@ struct page { }; int units; /* SLOB */ }; - atomic_t _count; /* Usage count, see below. */ + /* + * Usage count, *USE WRAPPER FUNCTION* + * when manual accounting. See page_ref.h + */ + atomic_t _refcount; }; unsigned int active; /* SLAB */ }; @@ -248,7 +252,7 @@ struct page_frag_cache { __u32 offset; #endif /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_count every time we allocate a fragment. + * containing page->_refcount every time we allocate a fragment. 
*/ unsigned int pagecnt_bias; bool pfmemalloc; diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index e596d5d9540e..8b5e0a9f2431 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v) static inline int page_ref_count(struct page *page) { - return atomic_read(&page->_count); + return atomic_read(&page->_refcount); } static inline int page_count(struct page *page) { - return atomic_read(&compound_head(page)->_count); + return atomic_read(&compound_head(page)->_refcount); } static inline void set_page_count(struct page *page, int v) { - atomic_set(&page->_count, v); + atomic_set(&page->_refcount, v); if (page_ref_tracepoint_active(__tracepoint_page_ref_set)) __page_ref_set(page, v); } @@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page) static inline void page_ref_add(struct page *page, int nr) { - atomic_add(nr, &page->_count); + atomic_add(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, nr); } static inline void page_ref_sub(struct page *page, int nr) { - atomic_sub(nr, &page->_count); + atomic_sub(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, -nr); } static inline void page_ref_inc(struct page *page) { - atomic_inc(&page->_count); + atomic_inc(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, 1); } static inline void page_ref_dec(struct page *page) { - atomic_dec(&page->_count); + atomic_dec(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, -1); } static inline int page_ref_sub_and_test(struct page *page, int nr) { - int ret = atomic_sub_and_test(nr, &page->_count); + int ret = atomic_sub_and_test(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) __page_ref_mod_and_test(page, -nr, ret); @@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) static inline int page_ref_dec_and_test(struct page *page) { - int ret = atomic_dec_and_test(&page->_count); + int ret = atomic_dec_and_test(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) __page_ref_mod_and_test(page, -1, ret); @@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page) static inline int page_ref_dec_return(struct page *page) { - int ret = atomic_dec_return(&page->_count); + int ret = atomic_dec_return(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) __page_ref_mod_and_return(page, -1, ret); @@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page) static inline int page_ref_add_unless(struct page *page, int nr, int u) { - int ret = atomic_add_unless(&page->_count, nr, u); + int ret = atomic_add_unless(&page->_refcount, nr, u); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); @@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u) static inline int page_ref_freeze(struct page *page, int count) { - int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count); + int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze)) __page_ref_freeze(page, count, ret); @@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count) 
VM_BUG_ON_PAGE(page_count(page) != 0, page); VM_BUG_ON(count == 0); - atomic_set(&page->_count, count); + atomic_set(&page->_refcount, count); if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze)) __page_ref_unfreeze(page, count); } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7e1ab155c67c..fe1513ffb7bf 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold); /* * speculatively take a reference to a page. - * If the page is free (_count == 0), then _count is untouched, and 0 - * is returned. Otherwise, _count is incremented by 1 and 1 is returned. + * If the page is free (_refcount == 0), then _refcount is untouched, and 0 + * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned. * * This function must be called inside the same rcu_read_lock() section as has * been used to lookup the page in the pagecache radix-tree (or page table): - * this allows allocators to use a synchronize_rcu() to stabilize _count. + * this allows allocators to use a synchronize_rcu() to stabilize _refcount. * * Unless an RCU grace period has passed, the count of all pages coming out * of the allocator must be considered unstable. page_count may return higher @@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold); * 2. conditionally increment refcount * 3. check the page is still in pagecache (if no, goto 1) * - * Remove-side that cares about stability of _count (eg. reclaim) has the + * Remove-side that cares about stability of _refcount (eg. reclaim) has the * following (with tree_lock held for write): * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) * B. remove page from pagecache diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1391d3ee3b86..1c03dfb4abfd 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_STRUCT_SIZE(list_head); VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, _refcount); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b49ee126d4d1..f8ac8f582fd8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3113,7 +3113,7 @@ static void __split_huge_page_tail(struct page *head, int tail, VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* - * tail_page->_count is zero and not changing from under us. But + * tail_page->_refcount is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the * tail_page. 
If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with @@ -3340,7 +3340,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mlocked) lru_add_drain(); - /* Prevent deferred_split_scan() touching ->_count */ + /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock_irqsave(&pgdata->split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); diff --git a/mm/internal.h b/mm/internal.h index b79abb6721cf..098a89e3b97c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, } /* - * Turn a non-refcounted page (->_count == 0) into refcounted with + * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c1069efcc4d7..4ce57f938b7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -794,7 +794,7 @@ static inline int free_pages_check(struct page *page) if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; + bad_reason = "nonzero _refcount"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; bad_flags = PAGE_FLAGS_CHECK_AT_FREE; @@ -6864,7 +6864,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * We can't use page_count without pin a page * because another CPU can free compound page. * This check already skips compound tails of THP - * because their page->_count is zero at all time. + * because their page->_refcount is zero at all time. */ if (!page_ref_count(page)) { if (PageBuddy(page)) diff --git a/mm/slub.c b/mm/slub.c index 8671de2e5b12..cf1faa4d3992 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count tmp.counters = counters_new; /* * page->counters can cover frozen/inuse/objects as well - * as page->_count. If we assign to ->counters directly - * we run the risk of losing updates to page->_count, so + * as page->_refcount. If we assign to ->counters directly + * we run the risk of losing updates to page->_refcount, so * be careful and only assign to the fields we need. */ page->frozen = tmp.frozen; diff --git a/mm/vmscan.c b/mm/vmscan.c index 142cb61f4822..d3a02ac3eed7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_count. + * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. @@ -1720,7 +1720,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. * - * The downside is that we have to touch page->_count against each page. + * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. 
*/ -- cgit v1.2.3 From d64e85d3e1c59c3664b9ec1183052ec4641ea1e2 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 19 May 2016 17:10:52 -0700 Subject: compiler.h: add support for malloc attribute gcc as far back as at least 3.04 documents the function attribute __malloc__. Add a shorthand for attaching that to a function declaration. This was also suggested by Andi Kleen way back in 2002 [1], but didn't get applied, perhaps because gcc at that time generated the exact same code with and without this attribute. This attribute tells the compiler that the return value (if non-NULL) can be assumed not to alias any other valid pointers at the time of the call. Please note that the documentation for a range of gcc versions (starting from around 4.7) contained a somewhat confusing and self-contradicting text: The malloc attribute is used to tell the compiler that a function may be treated as if any non-NULL pointer it returns cannot alias any other pointer valid when the function returns and *that the memory has undefined content*. [...] Standard functions with this property include malloc and *calloc*. (emphasis mine). The intended meaning has later been clarified [2]: This tells the compiler that a function is malloc-like, i.e., that the pointer P returned by the function cannot alias any other pointer valid when the function returns, and moreover no pointers to valid objects occur in any storage addressed by P. What this means is that we can apply the attribute to kmalloc and friends, and it is ok for the returned memory to have well-defined contents (__GFP_ZERO). But it is not ok to apply it to kmemdup(), nor to other functions which both allocate and possibly initialize the memory with existing pointers. So unless someone is doing something pretty perverted kstrdup() should also be a fine candidate. [1] http://thread.gmane.org/gmane.linux.kernel/57172 [2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56955 Signed-off-by: Rasmus Villemoes Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 1 + include/linux/compiler.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 3d5202eda22f..e2949397c19b 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -142,6 +142,7 @@ #if GCC_VERSION >= 30400 #define __must_check __attribute__((warn_unused_result)) +#define __malloc __attribute__((__malloc__)) #endif #if GCC_VERSION >= 40000 diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b5ff9881bef8..793c0829e3a3 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -357,6 +357,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #define __deprecated_for_modules #endif +#ifndef __malloc +#define __malloc +#endif + /* * Allow us to avoid 'defined but not used' warnings on functions and data, * as well as force them to be emitted to the assembly file. -- cgit v1.2.3 From 48a270554a3251681ae11173f2fd6389d943e183 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 19 May 2016 17:10:55 -0700 Subject: include/linux: apply __malloc attribute Attach the malloc attribute to a few allocation functions. 
This helps gcc generate better code by telling it that the return value doesn't alias any existing pointers (which is even more valuable given the pessimizations implied by -fno-strict-aliasing). A simple example of what this allows gcc to do can be seen by looking at the last part of drm_atomic_helper_plane_reset: plane->state = kzalloc(sizeof(*plane->state), GFP_KERNEL); if (plane->state) { plane->state->plane = plane; plane->state->rotation = BIT(DRM_ROTATE_0); } which compiles to e8 99 bf d6 ff callq ffffffff8116d540 48 85 c0 test %rax,%rax 48 89 83 40 02 00 00 mov %rax,0x240(%rbx) 74 11 je ffffffff814015c4 48 89 18 mov %rbx,(%rax) 48 8b 83 40 02 00 00 mov 0x240(%rbx),%rax [*] c7 40 40 01 00 00 00 movl $0x1,0x40(%rax) With this patch applied, the instruction at [*] is elided, since the store to plane->state->plane is known to not alter the value of plane->state. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Rasmus Villemoes Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 16 ++++++++-------- include/linux/device.h | 12 ++++++------ include/linux/kernel.h | 4 ++-- include/linux/mempool.h | 3 ++- include/linux/slab.h | 16 ++++++++-------- include/linux/string.h | 2 +- 6 files changed, 27 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 35b22f94d2d2..f9be32691718 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -83,34 +83,34 @@ extern void *__alloc_bootmem(unsigned long size, unsigned long goal); extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; void *__alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long limit) __malloc; extern void *__alloc_bootmem_low(unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; void *__alloc_bootmem_low_nopanic(unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; #ifdef CONFIG_NO_BOOTMEM /* We are using top down, so it is safe to use 0 here */ diff --git a/include/linux/device.h b/include/linux/device.h index b130304f9b1b..ca90ad8bcd61 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -609,14 +609,14 @@ typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data); #ifdef CONFIG_DEBUG_DEVRES extern void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, - int nid, const char *name); + int nid, const char *name) __malloc; #define devres_alloc(release, size, gfp) \ __devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release) #define devres_alloc_node(release, size, gfp, nid) \ __devres_alloc_node(release, size, gfp, nid, 
#release) #else extern void *devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, - int nid); + int nid) __malloc; static inline void *devres_alloc(dr_release_t release, size_t size, gfp_t gfp) { return devres_alloc_node(release, size, gfp, NUMA_NO_NODE); @@ -648,12 +648,12 @@ extern void devres_remove_group(struct device *dev, void *id); extern int devres_release_group(struct device *dev, void *id); /* managed devm_k.alloc/kfree for device drivers */ -extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp); +extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc; extern __printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, - va_list ap); + va_list ap) __malloc; extern __printf(3, 4) -char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...); +char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) __malloc; static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp) { return devm_kmalloc(dev, size, gfp | __GFP_ZERO); @@ -671,7 +671,7 @@ static inline void *devm_kcalloc(struct device *dev, return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO); } extern void devm_kfree(struct device *dev, void *p); -extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp); +extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc; extern void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2f7775e229b0..cc7398287fdd 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -412,9 +412,9 @@ extern __printf(3, 4) int scnprintf(char *buf, size_t size, const char *fmt, ...); extern __printf(3, 0) int vscnprintf(char *buf, size_t size, const char *fmt, va_list args); -extern __printf(2, 3) +extern __printf(2, 3) __malloc char *kasprintf(gfp_t gfp, const char *fmt, ...); -extern __printf(2, 0) +extern __printf(2, 0) __malloc char *kvasprintf(gfp_t gfp, const char *fmt, va_list args); extern __printf(2, 0) const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args); diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 69b6951e8fd2..b1086c936507 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -5,6 +5,7 @@ #define _LINUX_MEMPOOL_H #include <linux/wait.h> +#include <linux/compiler.h> struct kmem_cache; @@ -31,7 +32,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); -extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask); +extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; extern void mempool_free(void *element, mempool_t *pool); /* diff --git a/include/linux/slab.h b/include/linux/slab.h index 508bd827e6dc..aeb3e6d00a66 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -315,8 +315,8 @@ static __always_inline int kmalloc_index(size_t size) } #endif /* !CONFIG_SLOB */ -void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment; -void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment; +void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; +void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; void kmem_cache_free(struct kmem_cache *, void *); /* @@ -339,8 +339,8 @@ static __always_inline void kfree_bulk(size_t size, void **p) } #ifdef CONFIG_NUMA -void
*__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment; -void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment; +void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; +void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; #else static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) { @@ -354,12 +354,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f #endif #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment; +extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc; #ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment; + int node, size_t size) __assume_slab_alignment __malloc; #else static __always_inline void * kmem_cache_alloc_node_trace(struct kmem_cache *s, @@ -392,10 +392,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment; +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; #ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment; +extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; #else static __always_inline void * kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) diff --git a/include/linux/string.h b/include/linux/string.h index d3993a79a325..26b6f6a66f83 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -119,7 +119,7 @@ char *strreplace(char *s, char old, char new); extern void kfree_const(const void *x); -extern char *kstrdup(const char *s, gfp_t gfp); +extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); -- cgit v1.2.3 From 0edaf86cf1a6a97d811fc34765ddbcbc310de564 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:10:58 -0700 Subject: include/linux/nodemask.h: create next_node_in() helper Lots of code does node = next_node(node, XXX); if (node == MAX_NUMNODES) node = first_node(XXX); so create next_node_in() to do this and use it in various places. 
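For illustration, the before/after at a typical call site looks like this (a sketch; next_rr_node() is a hypothetical caller that round-robins over the online nodes):

	/* before: every caller open-codes the wrap-around */
	static int next_rr_node(int node)
	{
		node = next_node(node, node_online_map);
		if (node == MAX_NUMNODES)
			node = first_node(node_online_map);
		return node;
	}

	/* after: the helper hides the wrap; MAX_NUMNODES only if the mask is empty */
	static int next_rr_node(int node)
	{
		return next_node_in(node, node_online_map);
	}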
[mhocko@suse.com: use next_node_in() helper] Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Michal Hocko Cc: Xishi Qiu Cc: Joonsoo Kim Cc: David Rientjes Cc: Naoya Horiguchi Cc: Laura Abbott Cc: Hui Zhu Cc: Wang Xiaoqiang Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/kernel/setup.c | 4 +--- arch/x86/mm/numa.c | 4 +--- include/linux/nodemask.h | 11 ++++++++++- kernel/cpuset.c | 8 +------- lib/Makefile | 2 +- lib/nodemask.c | 30 ++++++++++++++++++++++++++++++ mm/hugetlb.c | 4 +--- mm/memcontrol.c | 4 +--- mm/mempolicy.c | 24 ++---------------------- mm/page_isolation.c | 9 +++------ mm/slab.c | 13 +++---------- 11 files changed, 54 insertions(+), 59 deletions(-) create mode 100644 lib/nodemask.c (limited to 'include/linux') diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index a992238e9b58..153020abd2f5 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -962,9 +962,7 @@ static void __init setup_numa_mapping(void) cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]); cpu_2_node[best_cpu] = node; cpumask_clear_cpu(best_cpu, &unbound_cpus); - node = next_node(node, default_nodes); - if (node == MAX_NUMNODES) - node = first_node(default_nodes); + node = next_node_in(node, default_nodes); } /* Print out node assignments and set defaults for disabled cpus */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index f70c1ff46125..9c086c57105c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -617,9 +617,7 @@ static void __init numa_init_array(void) if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); + rr = next_node_in(rr, node_online_map); } } diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 6e85889cf9ab..f746e44d4046 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -43,8 +43,10 @@ * * int first_node(mask) Number lowest set bit, or MAX_NUMNODES * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES + * int next_node_in(node, mask) Next node past 'node', or wrap to first, + * or MAX_NUMNODES * int first_unset_node(mask) First node not set in mask, or - * MAX_NUMNODES. + * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set @@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp) return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } +/* + * Find the next present node in src, starting after node n, wrapping around to + * the first node in src if needed. Returns MAX_NUMNODES if src is empty. 
+ */ +#define next_node_in(n, src) __next_node_in((n), &(src)) +int __next_node_in(int node, const nodemask_t *srcp); + static inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1902956baba1..611cc69af8f0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) static int cpuset_spread_node(int *rotor) { - int node; - - node = next_node(*rotor, current->mems_allowed); - if (node == MAX_NUMNODES) - node = first_node(current->mems_allowed); - *rotor = node; - return node; + return *rotor = next_node_in(*rotor, current->mems_allowed); } int cpuset_mem_spread_node(void) diff --git a/lib/Makefile b/lib/Makefile index 931396ada5eb..42b69185f963 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,7 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o diff --git a/lib/nodemask.c b/lib/nodemask.c new file mode 100644 index 000000000000..e42a5bf44d33 --- /dev/null +++ b/lib/nodemask.c @@ -0,0 +1,30 @@ +#include <linux/nodemask.h> +#include <linux/module.h> +#include <linux/random.h> + +int __next_node_in(int node, const nodemask_t *srcp) +{ + int ret = __next_node(node, srcp); + + if (ret == MAX_NUMNODES) + ret = __first_node(srcp); + return ret; +} +EXPORT_SYMBOL(__next_node_in); + +#ifdef CONFIG_NUMA +/* + * Return the bit number of a random bit set in the nodemask. + * (returns NUMA_NO_NODE if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ + int w, bit = NUMA_NO_NODE; + + w = nodes_weight(*maskp); + if (w) + bit = bitmap_ord_to_pos(maskp->bits, + get_random_int() % w, MAX_NUMNODES); + return bit; +} +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 19d0d08b396f..5856093f9062 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -937,9 +937,7 @@ err: */ static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - nid = next_node(nid, *nodes_allowed); - if (nid == MAX_NUMNODES) - nid = first_node(*nodes_allowed); + nid = next_node_in(nid, *nodes_allowed); VM_BUG_ON(nid >= MAX_NUMNODES); return nid; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe787f5c41bd..6740c4c2b550 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1389,9 +1389,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) mem_cgroup_may_update_nodemask(memcg); node = memcg->last_scanned_node; - node = next_node(node, memcg->scan_nodes); - if (node == MAX_NUMNODES) - node = first_node(memcg->scan_nodes); + node = next_node_in(node, memcg->scan_nodes); /* * We call this when we hit limit, not when pages are added to LRU.
* No LRU may hold pages because all pages are UNEVICTABLE or diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 36cc01bc950a..8d369cee0cd6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -97,7 +97,6 @@ #include <asm/tlbflush.h> #include <asm/uaccess.h> -#include <linux/random.h> #include "internal.h" @@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, BUG(); if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = first_node(tmp); + current->il_next = next_node_in(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) current->il_next = numa_node_id(); } @@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - next = next_node(nid, policy->v.nodes); - if (next >= MAX_NUMNODES) - next = first_node(policy->v.nodes); + next = next_node_in(nid, policy->v.nodes); if (next < MAX_NUMNODES) me->il_next = next; return nid; @@ -1805,21 +1800,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } -/* - * Return the bit number of a random bit set in the nodemask. - * (returns NUMA_NO_NODE if nodemask is empty) - */ -int node_random(const nodemask_t *maskp) -{ - int w, bit = NUMA_NO_NODE; - - w = nodes_weight(*maskp); - if (w) - bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); - return bit; -} - #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c4f568206544..67bedd18429c 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -288,13 +288,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, * accordance with memory policy of the user process if possible. For * now as a simple work-around, we use the next node for destination. */ - if (PageHuge(page)) { - int node = next_online_node(page_to_nid(page)); - if (node == MAX_NUMNODES) - node = first_online_node; + if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), - node); - } + next_node_in(page_to_nid(page), + node_online_map)); if (PageHighMem(page)) gfp_mask |= __GFP_HIGHMEM; diff --git a/mm/slab.c b/mm/slab.c index d81565a92864..c11bf5007952 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -522,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node); static void init_reap_node(int cpu) { - int node; - - node = next_node(cpu_to_mem(cpu), node_online_map); - if (node == MAX_NUMNODES) - node = first_node(node_online_map); - - per_cpu(slab_reap_node, cpu) = node; + per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), + node_online_map); } static void next_reap_node(void) { int node = __this_cpu_read(slab_reap_node); - node = next_node(node, node_online_map); - if (unlikely(node >= MAX_NUMNODES)) - node = first_node(node_online_map); + node = next_node_in(node, node_online_map); __this_cpu_write(slab_reap_node, node); } -- cgit v1.2.3 From 9fee021d15ddd884d40d1540913474e8112313fe Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Thu, 19 May 2016 17:11:04 -0700 Subject: mm/hugetlb: introduce hugetlb_bad_size() When any unsupported hugepage size is specified, 'hugepagesz=' and 'hugepages=' should be ignored during command line parsing until any supported hugepage size is found. But currently an incorrect number of hugepages is allocated when an unsupported size is specified, as parsing fails to ignore the 'hugepages=' option.
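The fix below adds a hugetlb_bad_size() hook for exactly this; as a rough sketch of the intended arch-side usage (hypothetical and simplified, loosely modeled on x86's setup_hugepagesz(), which also accepts PUD-sized pages on capable CPUs):

	static __init int setup_hugepagesz(char *opt)
	{
		unsigned long ps = memparse(opt, &opt);

		if (ps == PMD_SIZE) {
			hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
		} else {
			hugetlb_bad_size();	/* poison any 'hugepages=' that follow */
			pr_err("hugepagesz: Unsupported size %lu MB\n", ps >> 20);
			return 0;
		}
		return 1;
	}
	__setup("hugepagesz=", setup_hugepagesz);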
Test case (specific to the x86 architecture): boot the kernel with the command line option 'hugepagesz=256M hugepages=X'. After boot, the dmesg output shows that X hugepages of size 2M were pre-allocated instead of 0. So, to handle such command line options, introduce a new routine, hugetlb_bad_size(). The routine hugetlb_bad_size() sets the global variable parsed_valid_hugepagesz to false. parsed_valid_hugepagesz saves the state once an unsupported hugepage size is found, so that the 'hugepages=' parameters which follow can be ignored; the variable is reset once a supported hugepage size is parsed again. hugetlb_bad_size() can be called while parsing the 'hugepagesz=' parameter in architecture-specific code. Signed-off-by: Vaishali Thakkar Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Hillf Danton Cc: Yaowei Bai Cc: Dominik Dingel Cc: Kirill A. Shutemov Cc: Paul Gortmaker Cc: Dave Hansen Cc: Benjamin Herrenschmidt Cc: James Hogan Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d953c2542a8..e44c57876e89 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -338,6 +338,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, /* arch callback */ int __init alloc_bootmem_huge_page(struct hstate *h); +void __init hugetlb_bad_size(void); void __init hugetlb_add_hstate(unsigned order); struct hstate *size_to_hstate(unsigned long size); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fb37ef810655..0adb74d0a4e1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages); static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; +static bool __initdata parsed_valid_hugepagesz = true; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -2659,6 +2660,11 @@ static int __init hugetlb_init(void) subsys_initcall(hugetlb_init); /* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_bad_size(void) +{ + parsed_valid_hugepagesz = false; +} + void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; @@ -2691,11 +2697,17 @@ static int __init hugetlb_nrpages_setup(char *s) unsigned long *mhp; static unsigned long *last_mhp; + if (!parsed_valid_hugepagesz) { + pr_warn("hugepages = %s preceded by " + "an unsupported hugepagesz, ignoring\n", s); + parsed_valid_hugepagesz = true; + return 1; + } /* * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, * so this hugepages= parameter goes to the "default hstate". */ - if (!hugetlb_max_hstate) + else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; -- cgit v1.2.3 From 32f6271dbdc351dce96b11c5f3567bae8188004f Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:23 -0700 Subject: mm/hugetlb: is_vm_hugetlb_page() can return bool Make is_vm_hugetlb_page() return bool to improve readability due to this particular function only using either one or zero as its return value.
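Call sites need no change; the truth-valued intent is simply explicit in the prototype now. For example, the fault path tests it roughly like this (illustrative, paraphrased from mm/memory.c of this era, not part of this patch):

	if (unlikely(is_vm_hugetlb_page(vma)))
		return hugetlb_fault(mm, vma, address, flags);

(Note that in the diff below the !! in the CONFIG_HUGETLB_PAGE version becomes strictly redundant, since conversion to bool already normalizes any non-zero value to true, but it still documents the intent.)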
Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_inline.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 2bb681fbeb35..a4e7ca0f3585 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -5,16 +5,16 @@ #include <linux/mm.h> -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return !!(vma->vm_flags & VM_HUGETLB); } #else -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { - return 0; + return false; } #endif -- cgit v1.2.3 From c98940f6fa3d06fa8fec75aa2362b25227573d06 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:26 -0700 Subject: mm/memory_hotplug: is_mem_section_removable() can return bool Make is_mem_section_removable() return bool to improve readability due to this particular function only using either one or zero as its return value. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 6 +++--- mm/memory_hotplug.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index adbef586e696..20d8a5d4d133 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -247,16 +247,16 @@ static inline void mem_hotplug_done(void) {} #ifdef CONFIG_MEMORY_HOTREMOVE -extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); +extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern void remove_memory(int nid, u64 start, u64 size); #else -static inline int is_mem_section_removable(unsigned long pfn, +static inline bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages) { - return 0; + return false; } static inline void try_offline_node(int nid) {} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aa34431c3f31..b21d8895ea41 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1410,7 +1410,7 @@ static struct page *next_active_pageblock(struct page *page) } /* Checks if this range of memory is likely to be hot-removable. */ -int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) +bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; @@ -1418,12 +1418,12 @@ is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { if (!is_pageblock_removable_nolock(page)) - return 0; + return false; cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ - return 1; + return true; } /* -- cgit v1.2.3 From bb00a789e565b96c52b2224c2280f7ac83175bec Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:29 -0700 Subject: mm/vmalloc.c: is_vmalloc_addr() can return bool Make is_vmalloc_addr() return bool to improve readability due to this particular function only using either one or zero as its return value.
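A typical truth-valued use, for illustration: kvfree() in mm/util.c is built on exactly this test (paraphrased):

	void kvfree(const void *addr)
	{
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}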
Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1193a54ea2b3..5b375133c695 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -447,14 +447,14 @@ unsigned long vmalloc_to_pfn(const void *addr); * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ -static inline int is_vmalloc_addr(const void *x) +static inline bool is_vmalloc_addr(const void *x) { #ifdef CONFIG_MMU unsigned long addr = (unsigned long)x; return addr >= VMALLOC_START && addr < VMALLOC_END; #else - return 0; + return false; #endif } #ifdef CONFIG_MMU -- cgit v1.2.3 From 4ee815be1d34a6f254b3d09bdebcb27f294f2bd3 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:32 -0700 Subject: mm/mempolicy.c: vma_migratable() can return bool Make vma_migratable() return bool due to this particular function only using either one or zero as its return value. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 2696c1f05ed1..6978a99e571f 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -172,14 +172,14 @@ extern int mpol_parse_str(char *str, struct mempolicy **mpol); extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ -static inline int vma_migratable(struct vm_area_struct *vma) +static inline bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - return 0; + return false; #ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION if (vma->vm_flags & VM_HUGETLB) - return 0; + return false; #endif /* @@ -190,8 +190,8 @@ static inline int vma_migratable(struct vm_area_struct *vma) if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) - return 0; - return 1; + return false; + return true; } extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); -- cgit v1.2.3 From 29f9cb53d25cd9916537b44b0af7f0b95a2e4438 Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Thu, 19 May 2016 17:11:57 -0700 Subject: mm/highmem: simplify is_highmem() is_highmem() can be simplified by use of is_highmem_idx(). This patch removes redundant code and will make it easier to maintain if the zone policy is changed or a new zone is added. 
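For reference, the helper that is_highmem() now defers to reads roughly like this in the same header (paraphrased from mmzone.h of this era):

	static inline int is_highmem_idx(enum zone_type idx)
	{
	#ifdef CONFIG_HIGHMEM
		return (idx == ZONE_HIGHMEM ||
			(idx == ZONE_MOVABLE && zone_movable_is_highmem()));
	#else
		return 0;
	#endif
	}

so is_highmem(zone) becomes is_highmem_idx(zone_idx(zone)), where zone_idx() computes by pointer arithmetic the same zone offset the removed code derived from byte offsets by hand.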
(akpm: saves me 25 bytes of text per is_highmem() callsite) Signed-off-by: Chanho Min Reviewed-by: Dan Williams Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c60df9257cc7..150c6049f961 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -828,10 +828,7 @@ static inline int is_highmem_idx(enum zone_type idx) static inline int is_highmem(struct zone *zone) { #ifdef CONFIG_HIGHMEM - int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones; - return zone_off == ZONE_HIGHMEM * sizeof(*zone) || - (zone_off == ZONE_MOVABLE * sizeof(*zone) && - zone_movable_is_highmem()); + return is_highmem_idx(zone_idx(zone)); #else return 0; #endif -- cgit v1.2.3 From 1aa8aea535977f0e0b398f39d052e7befff81da6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:12:00 -0700 Subject: mm: uninline page_mapped() It's huge. Uninlining it saves 206 bytes per callsite. Shaves 4924 bytes from the x86_64 allmodconfig vmlinux. [akpm@linux-foundation.org: coding-style fixes] Cc: Steve Capper Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 21 +-------------------- mm/util.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b375133c695..9c2852cabf01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1032,26 +1032,7 @@ static inline pgoff_t page_file_index(struct page *page) return page->index; } -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any subpage of compound page is mapped. - */ -static inline bool page_mapped(struct page *page) -{ - int i; - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) >= 0; - page = compound_head(page); - if (atomic_read(compound_mapcount_ptr(page)) >= 0) - return true; - if (PageHuge(page)) - return false; - for (i = 0; i < hpage_nr_pages(page); i++) { - if (atomic_read(&page[i]._mapcount) >= 0) - return true; - } - return false; -} +bool page_mapped(struct page *page); /* * Return true only if the page has been allocated with diff --git a/mm/util.c b/mm/util.c index 6cc81e7b8705..8a1b3a1fb595 100644 --- a/mm/util.c +++ b/mm/util.c @@ -346,6 +346,29 @@ void *page_rmapping(struct page *page) return __page_rmapping(page); } +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any subpage of compound page is mapped. 
+ */ +bool page_mapped(struct page *page) +{ + int i; + + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + page = compound_head(page); + if (atomic_read(compound_mapcount_ptr(page)) >= 0) + return true; + if (PageHuge(page)) + return false; + for (i = 0; i < hpage_nr_pages(page); i++) { + if (atomic_read(&page[i]._mapcount) >= 0) + return true; + } + return false; +} +EXPORT_SYMBOL(page_mapped); + struct anon_vma *page_anon_vma(struct page *page) { unsigned long mapping; -- cgit v1.2.3 From ca707239e8a7958ffb1c31737d41cae1a674c938 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:35 -0700 Subject: mm: update_lru_size warn and reset bad lru_size Though debug kernels have a VM_BUG_ON to help protect from misaccounting lru_size, non-debug kernels are liable to wrap it around: and then the vast unsigned long size draws page reclaim into a loop of repeatedly doing nothing on an empty list, without even a cond_resched(). That soft lockup looks confusingly like an over-busy reclaim scenario, with lots of contention on the lru_lock in shrink_inactive_list(): yet has a totally different origin. Help differentiate with a custom warning in mem_cgroup_update_lru_size(), even in non-debug kernels; and reset the size to avoid the lockup. But the particular bug which suggested this change was mine alone, and since fixed. Make it a WARN_ONCE: the first occurrence is the most informative, a flurry may follow, yet even when rate-limited little more is learnt. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 2 +- mm/memcontrol.c | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 712e8c37a200..d8cea81ab1ac 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -35,8 +35,8 @@ static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { int nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); list_del(&page->lru); + mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 011dac8ab5d7..6a0199706f00 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1023,22 +1023,38 @@ out: * @lru: index of lru list the page is sitting on * @nr_pages: positive when adding or negative when removing * - * This function must be called when a page is added to or removed from an - * lru list. + * This function must be called under lru_lock, just before a page is added + * to or just after a page is removed from an lru list (that ordering being + * so as to allow it to check that lru_size 0 is consistent with list_empty). 
*/ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages) { struct mem_cgroup_per_zone *mz; unsigned long *lru_size; + long size; + bool empty; if (mem_cgroup_disabled()) return; mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); lru_size = mz->lru_size + lru; - *lru_size += nr_pages; - VM_BUG_ON((long)(*lru_size) < 0); + empty = list_empty(lruvec->lists + lru); + + if (nr_pages < 0) + *lru_size += nr_pages; + + size = *lru_size; + if (WARN_ONCE(size < 0 || empty != !size, + "%s(%p, %d, %d): lru_size %ld but %sempty\n", + __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { + VM_BUG_ON(1); + *lru_size = 0; + } + + if (nr_pages > 0) + *lru_size += nr_pages; } bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) -- cgit v1.2.3 From 9d5e6a9f22311b00a20ff9b072760ad3e73f0d99 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:38 -0700 Subject: mm: update_lru_size do the __mod_zone_page_state Konstantin Khlebnikov pointed out (nearly four years ago, when lumpy reclaim was removed) that lru_size can be updated by -nr_taken once per call to isolate_lru_pages(), instead of page by page. Update it inside isolate_lru_pages(), or at its two callsites? I chose to update it at the callsites, rearranging and grouping the updates by nr_taken and nr_scanned together in both. With one exception, mem_cgroup_update_lru_size(,lru,) is then used where __mod_zone_page_state(,NR_LRU_BASE+lru,) is used; and we shall be adding some more calls in a future commit. Make the code a little smaller and simpler by incorporating stat update in lru_size update. The exception was move_active_pages_to_lru(), which aggregated the pgmoved stat update separately from the individual lru_size updates; but I still think this is a simplification worth making. However, the __mod_zone_page_state is not peculiar to mem_cgroups: so better use the name update_lru_size, which calls mem_cgroup_update_lru_size when CONFIG_MEMCG is enabled. Signed-off-by: Hugh Dickins Cc: "Kirill A.
Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ------ include/linux/mm_inline.h | 24 ++++++++++++++++++------ mm/memcontrol.c | 2 ++ mm/vmscan.c | 23 ++++++++++------------- 4 files changed, 30 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1191d79aa495..94da96738df3 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -658,12 +658,6 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return 0; } -static inline void -mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int increment) -{ -} - static inline unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index d8cea81ab1ac..5bd29ba4f174 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -22,22 +22,34 @@ static inline int page_is_file_cache(struct page *page) return !PageSwapBacked(page); } +static __always_inline void __update_lru_size(struct lruvec *lruvec, + enum lru_list lru, int nr_pages) +{ + __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); +} + +static __always_inline void update_lru_size(struct lruvec *lruvec, + enum lru_list lru, int nr_pages) +{ +#ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, nr_pages); +#else + __update_lru_size(lruvec, lru, nr_pages); +#endif +} + static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { - int nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, hpage_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); - __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); } static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { - int nr_pages = hpage_nr_pages(page); list_del(&page->lru); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); - __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages); + update_lru_size(lruvec, lru, -hpage_nr_pages(page)); } /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a0199706f00..1b40dcad2b90 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1035,6 +1035,8 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, long size; bool empty; + __update_lru_size(lruvec, lru, nr_pages); + if (mem_cgroup_disabled()) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index d3a02ac3eed7..dcfdfc1a0942 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src); scan++) { struct page *page; - int nr_pages; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); @@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode)) { case 0: - nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); + nr_taken += hpage_nr_pages(page); list_move(&page->lru, dst); - nr_taken += nr_pages; break; case -EBUSY: @@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, nr_taken = 
isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, isolate_mode, lru); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + update_lru_size(lruvec, lru, -nr_taken); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); @@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&zone->lru_lock); - reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { if (current_is_kswapd()) __count_zone_vm_events(PGSTEAL_KSWAPD, zone, @@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, SetPageLRU(page); nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, nr_pages); list_move(&page->lru, &lruvec->lists[lru]); pgmoved += nr_pages; @@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, pages_to_free); } } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) __count_vm_events(PGDEACTIVATE, pgmoved); } @@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); - if (global_reclaim(sc)) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + update_lru_size(lruvec, lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; + if (global_reclaim(sc)) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); __count_zone_vm_events(PGREFILL, zone, nr_scanned); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + spin_unlock_irq(&zone->lru_lock); while (!list_empty(&l_hold)) { -- cgit v1.2.3 From 75edd345e8ede51bc8f00672feff5d622f2b3af6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:44 -0700 Subject: tmpfs: preliminary minor tidyups Make a few cleanups in mm/shmem.c, before going on to complicate it. shmem_alloc_page() will become more complicated: we can't afford to have that complication duplicated between a CONFIG_NUMA version and a !CONFIG_NUMA version, so rearrange the #ifdef'ery there to yield a single shmem_swapin() and a single shmem_alloc_page(). Yes, it's a shame to inflict the horrid pseudo-vma on non-NUMA configurations, but eliminating it is a larger cleanup: I have an alloc_pages_mpol() patchset not yet ready - mpol handling is subtle and bug-prone, and changed yet again since my last version. Move __SetPageLocked, __SetPageSwapBacked from shmem_getpage_gfp() to shmem_alloc_page(): that SwapBacked flag will be useful in future, to help to distinguish different cases appropriately. And the SGP_DIRTY variant of SGP_CACHE is hard to understand and of little use (IIRC it dates back to when shmem_getpage() returned the page unlocked): kill it and do the necessary in shmem_file_read_iter(). But an arm64 build then complained that info may be uninitialized (where shmem_getpage_gfp() deletes a freshly alloced page beyond eof), and advancing to an "sgp <= SGP_CACHE" test jogged it back to reality. Signed-off-by: Hugh Dickins Cc: "Kirill A.
Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 6 +++++ mm/shmem.c | 69 ++++++++++++++++++----------------------------- 2 files changed, 32 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 6978a99e571f..4429d255c8ab 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -228,6 +228,12 @@ static inline void mpol_free_shared_policy(struct shared_policy *p) { } +static inline struct mempolicy * +mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +{ + return NULL; +} + #define vma_policy(vma) NULL static inline int diff --git a/mm/shmem.c b/mm/shmem.c index 9e609d58df73..6d2de2c1bf11 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -101,7 +101,6 @@ struct shmem_falloc { enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; @@ -169,7 +168,7 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as - * pages are allocated, in order to allow huge sparse files. + * pages are allocated, in order to allow large sparse files. * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ @@ -947,8 +946,7 @@ redirty: return 0; } -#ifdef CONFIG_NUMA -#ifdef CONFIG_TMPFS +#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; @@ -972,7 +970,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } return mpol; } -#endif /* CONFIG_TMPFS */ +#else /* !CONFIG_NUMA || !CONFIG_TMPFS */ +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) +{ +} +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + return NULL; +} +#endif /* CONFIG_NUMA && CONFIG_TMPFS */ +#ifndef CONFIG_NUMA +#define vm_policy vm_private_data +#endif static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) @@ -1008,39 +1017,17 @@ static struct page *shmem_alloc_page(gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - page = alloc_page_vma(gfp, &pvma, 0); + page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false); + if (page) { + __SetPageLocked(page); + __SetPageSwapBacked(page); + } /* Drop reference taken by mpol_shared_policy_lookup() */ mpol_cond_put(pvma.vm_policy); return page; } -#else /* !CONFIG_NUMA */ -#ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) -{ -} -#endif /* CONFIG_TMPFS */ - -static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return swapin_readahead(swap, gfp, NULL, 0); -} - -static inline struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return alloc_page(gfp); -} -#endif /* CONFIG_NUMA */ - -#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) -static inline struct mempolicy *shmem_get_sbmpol(struct 
shmem_sb_info *sbinfo) -{ - return NULL; -} -#endif /* * When a page is moved from swapcache to shmem filecache (either by the @@ -1084,8 +1071,6 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); - __SetPageLocked(newpage); - __SetPageSwapBacked(newpage); SetPageUptodate(newpage); set_page_private(newpage, swap_index); SetPageSwapCache(newpage); @@ -1155,7 +1140,7 @@ repeat: page = NULL; } - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; @@ -1275,9 +1260,6 @@ repeat: error = -ENOMEM; goto decused; } - - __SetPageLocked(page); - __SetPageSwapBacked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); @@ -1321,12 +1303,10 @@ clear: flush_dcache_page(page); SetPageUptodate(page); } - if (sgp == SGP_DIRTY) - set_page_dirty(page); } /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { if (alloced) { ClearPageDirty(page); @@ -1633,7 +1613,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * and even mark them dirty, so it cannot exceed the max_blocks limit. */ if (!iter_is_iovec(to)) - sgp = SGP_DIRTY; + sgp = SGP_CACHE; index = *ppos >> PAGE_SHIFT; offset = *ppos & ~PAGE_MASK; @@ -1659,8 +1639,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) error = 0; break; } - if (page) + if (page) { + if (sgp == SGP_CACHE) + set_page_dirty(page); unlock_page(page); + } /* * We must evaluate after, since reads (unlike writes) -- cgit v1.2.3 From 52b6f46bc163eef17ecba4cd552beeafe2b24453 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:50 -0700 Subject: mm: /proc/sys/vm/stat_refresh to force vmstat update Provide /proc/sys/vm/stat_refresh to force an immediate update of per-cpu into global vmstats: useful to avoid a sleep(2) or whatever before checking counts when testing. Originally added to work around a bug which left counts stranded indefinitely on a cpu going idle (an inaccuracy magnified when small below-batch numbers represent "huge" amounts of memory), but I believe that bug is now fixed: nonetheless, this is still a useful knob. Its schedule_on_each_cpu() is probably too expensive just to fold into reading /proc/meminfo itself: give this mode 0600 to prevent abuse. Allow a write or a read to do the same: nothing to read, but "grep -h Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient. Oh, and since global_page_state() itself is careful to disguise any underflow as 0, hack in an "Invalid argument" and pr_warn() if a counter is negative after the refresh - this helped to fix a misaccounting of NR_ISOLATED_FILE in my migration code. But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED often go negative some of the time. I have not yet worked out why, but have no evidence that it's actually harmful. Punt for the moment by just ignoring the anomaly on those. Signed-off-by: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 14 +++++++++++ include/linux/vmstat.h | 4 +++ kernel/sysctl.c | 7 ++++++ mm/vmstat.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+) (limited to 'include/linux') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 34a5fece3121..720355cbdf45 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm: - panic_on_oom - percpu_pagelist_fraction - stat_interval +- stat_refresh - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -755,6 +756,19 @@ is 1 second. ============================================================== +stat_refresh + +Any read or write (by root only) flushes all the per-cpu vm statistics +into their global totals, for more accurate reports when testing +e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo + +As a side-effect, it also checks for negative totals (elsewhere reported +as 0) and "fails" with EINVAL if any are found, with a warning in dmesg. +(At time of writing, a few stats are known sometimes to be found negative, +with no ill effects: errors and warnings on these stats are suppressed.) + +============================================================== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 73fae8c4a5fb..02fce415b3d9 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -193,6 +193,10 @@ void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); +struct ctl_table; +int vmstat_refresh(struct ctl_table *, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); int calculate_pressure_threshold(struct zone *zone); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8b318663525..2effd84d83e3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, #endif #ifdef CONFIG_MMU { diff --git a/mm/vmstat.c b/mm/vmstat.c index a7de9adacbd9..c831be32a1a3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; static cpumask_var_t cpu_stat_off; +#ifdef CONFIG_PROC_FS +static void refresh_vm_stats(struct work_struct *work) +{ + refresh_cpu_vm_stats(true); +} + +int vmstat_refresh(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + long val; + int err; + int i; + + /* + * The regular update, every sysctl_stat_interval, may come later + * than expected: leaving a significant amount in per_cpu buckets. + * This is particularly misleading when checking a quantity of HUGE + * pages, immediately after running a test. /proc/sys/vm/stat_refresh, + * which can equally be echo'ed to or cat'ted from (by root), + * can be used to update the stats just before reading them. + * + * Oh, and since global_page_state() etc. 
are so careful to hide + * transiently negative values, report an error here if any of + * the stats is negative, so we know to go looking for imbalance. + */ + err = schedule_on_each_cpu(refresh_vm_stats); + if (err) + return err; + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_stat[i]); + if (val < 0) { + switch (i) { + case NR_ALLOC_BATCH: + case NR_PAGES_SCANNED: + /* + * These are often seen to go negative in + * recent kernels, but not to go permanently + * negative. Whilst it would be nicer not to + * have exceptions, rooting them out would be + * another task, of rather low priority. + */ + break; + default: + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; + break; + } + } + } + if (err) + return err; + if (write) + *ppos += *lenp; + else + *lenp = 0; + return 0; +} +#endif /* CONFIG_PROC_FS */ + static void vmstat_update(struct work_struct *w) { if (refresh_cpu_vm_stats(true)) { -- cgit v1.2.3 From bf8616d5fa179d6c755f06726567c6d63c6fbbc7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:54 -0700 Subject: huge mm: move_huge_pmd does not need new_vma Remove move_huge_pmd()'s redundant new_vma arg: all it was used for was a VM_NOHUGEPAGE check on new_vma flags, but the new_vma is cloned from the old vma, so a trans_huge_pmd in the new_vma will be as acceptable as it was in the old vma, alignment and size permitting. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 4 +--- mm/huge_memory.c | 7 ++----- mm/mremap.c | 5 ++--- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d7b9e5346fba..419fb9e03447 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -28,9 +28,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); -extern bool move_huge_pmd(struct vm_area_struct *vma, - struct vm_area_struct *new_vma, - unsigned long old_addr, +extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f8ac8f582fd8..66675eed67be 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1698,20 +1698,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } -bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, - unsigned long old_addr, +bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; - struct mm_struct *mm = vma->vm_mm; if ((old_addr & ~HPAGE_PMD_MASK) || (new_addr & ~HPAGE_PMD_MASK) || - old_end - old_addr < HPAGE_PMD_SIZE || - (new_vma->vm_flags & VM_NOHUGEPAGE)) + old_end - old_addr < HPAGE_PMD_SIZE) return false; /* diff --git a/mm/mremap.c b/mm/mremap.c index 3fa0a467df66..7d98fe1adc12 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -198,9 +198,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, /* See comment in move_ptes() */ if 
(need_rmap_locks) anon_vma_lock_write(vma->anon_vma); - moved = move_huge_pmd(vma, new_vma, old_addr, - new_addr, old_end, - old_pmd, new_pmd); + moved = move_huge_pmd(vma, old_addr, new_addr, + old_end, old_pmd, new_pmd); if (need_rmap_locks) anon_vma_unlock_write(vma->anon_vma); if (moved) { -- cgit v1.2.3 From 3ef22dfff2390e75b379f9715388a852aa56e0d5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 19 May 2016 17:13:12 -0700 Subject: oom, oom_reaper: try to reap tasks which skip regular OOM killer path If either the current task is already killed or PF_EXITING or a selected task is PF_EXITING then the oom killer is suppressed and so is the oom reaper. This patch adds try_oom_reaper which checks the given task and queues it for the oom reaper if that is safe to be done meaning that the task doesn't share the mm with an alive process. This might help to release the memory pressure while the task tries to exit. [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Michal Hocko Cc: Raushaniya Maksudova Cc: Michael S. Tsirkin Cc: Paul E. McKenney Cc: David Rientjes Cc: Tetsuo Handa Cc: Daniel Vetter Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 8 +++++ mm/memcontrol.c | 1 + mm/oom_kill.c | 86 ++++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 77 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 628a43242a34..83b9c39bd8b7 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -72,6 +72,14 @@ static inline bool oom_task_origin(const struct task_struct *p) extern void mark_oom_victim(struct task_struct *tsk); +#ifdef CONFIG_MMU +extern void try_oom_reaper(struct task_struct *tsk); +#else +static inline void try_oom_reaper(struct task_struct *tsk) +{ +} +#endif + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b40dcad2b90..d71d387868e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1275,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { mark_oom_victim(current); + try_oom_reaper(current); goto unlock; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 32d8210b8773..850b6ff66bdf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -412,6 +412,25 @@ bool oom_killer_disabled __read_mostly; #define K(x) ((x) << (PAGE_SHIFT-10)) +/* + * task->mm can be NULL if the task is the exited group leader. So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. + */ +static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; +} + + #ifdef CONFIG_MMU /* * OOM Reaper kernel thread which tries to reap the memory used by the OOM @@ -563,6 +582,53 @@ static void wake_oom_reaper(struct task_struct *tsk) wake_up(&oom_reaper_wait); } +/* Check if we can reap the given task. 
This has to be called with stable + * tsk->mm + */ +void try_oom_reaper(struct task_struct *tsk) +{ + struct mm_struct *mm = tsk->mm; + struct task_struct *p; + + if (!mm) + return; + + /* + * There might be other threads/processes which are either not + * dying or even not killable. + */ + if (atomic_read(&mm->mm_users) > 1) { + rcu_read_lock(); + for_each_process(p) { + bool exiting; + + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(p, tsk)) + continue; + if (fatal_signal_pending(p)) + continue; + + /* + * If the task is exiting make sure the whole thread group + * is exiting and cannot acces mm anymore. + */ + spin_lock_irq(&p->sighand->siglock); + exiting = signal_group_exit(p->signal); + spin_unlock_irq(&p->sighand->siglock); + if (exiting) + continue; + + /* Give up */ + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + } + + wake_oom_reaper(tsk); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -652,24 +718,6 @@ void oom_killer_enable(void) oom_killer_disabled = false; } -/* - * task->mm can be NULL if the task is the exited group leader. So to - * determine whether the task is using a particular mm, we examine all the - * task's threads: if one of those is using this mm then this task was also - * using it. - */ -static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) -{ - struct task_struct *t; - - for_each_thread(p, t) { - struct mm_struct *t_mm = READ_ONCE(t->mm); - if (t_mm) - return t_mm == mm; - } - return false; -} - /* * Must be called while holding a reference to p, which will be released upon * returning. @@ -694,6 +742,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, task_lock(p); if (p->mm && task_will_free_mem(p)) { mark_oom_victim(p); + try_oom_reaper(p); task_unlock(p); put_task_struct(p); return; @@ -873,6 +922,7 @@ bool out_of_memory(struct oom_control *oc) if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { mark_oom_victim(current); + try_oom_reaper(current); return true; } -- cgit v1.2.3 From 175145748d00794369317070dd19ce12dd816241 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:21 -0700 Subject: mm, page_alloc: use new PageAnonHead helper in the free page fast path The PageAnon check always checks for compound_head but this is a relatively expensive check if the caller already knows the page is a head page. This patch creates a helper and uses it in the page free path which only operates on head pages. 
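The tagged-pointer trick behind the helper can be shown in a standalone sketch; this is illustrative userspace C, not the kernel code, and the one-field struct page and the PAGE_MAPPING_ANON value only mirror the kernel's convention of tagging the low bits of page->mapping:

#include <stdio.h>

#define PAGE_MAPPING_ANON 1UL

struct page {
    void *mapping;    /* low bit set => points at an anon_vma, not an address_space */
};

/* Cheap test: valid only when the caller already knows 'page' is a head page */
static inline int page_anon_head(const struct page *page)
{
    return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

int main(void)
{
    struct page file_page = { .mapping = (void *)0x1000 };
    struct page anon_page = { .mapping = (void *)(0x2000 | PAGE_MAPPING_ANON) };

    printf("file page anon? %d\n", page_anon_head(&file_page));  /* prints 0 */
    printf("anon page anon? %d\n", page_anon_head(&anon_page));  /* prints 1 */
    return 0;
}

The split pays off because the cheap bit test is only correct on a head page, and the free path already guarantees that, so the compound_head() lookup can be skipped there.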
With this patch and "Only check PageCompound for high-order pages", the performance difference on a page allocator microbenchmark is; 4.6.0-rc2 4.6.0-rc2 vanilla nocompound-v1r20 Min alloc-odr0-1 425.00 ( 0.00%) 417.00 ( 1.88%) Min alloc-odr0-2 313.00 ( 0.00%) 308.00 ( 1.60%) Min alloc-odr0-4 257.00 ( 0.00%) 253.00 ( 1.56%) Min alloc-odr0-8 224.00 ( 0.00%) 221.00 ( 1.34%) Min alloc-odr0-16 208.00 ( 0.00%) 205.00 ( 1.44%) Min alloc-odr0-32 199.00 ( 0.00%) 199.00 ( 0.00%) Min alloc-odr0-64 195.00 ( 0.00%) 193.00 ( 1.03%) Min alloc-odr0-128 192.00 ( 0.00%) 191.00 ( 0.52%) Min alloc-odr0-256 204.00 ( 0.00%) 200.00 ( 1.96%) Min alloc-odr0-512 213.00 ( 0.00%) 212.00 ( 0.47%) Min alloc-odr0-1024 219.00 ( 0.00%) 219.00 ( 0.00%) Min alloc-odr0-2048 225.00 ( 0.00%) 225.00 ( 0.00%) Min alloc-odr0-4096 230.00 ( 0.00%) 231.00 ( -0.43%) Min alloc-odr0-8192 235.00 ( 0.00%) 234.00 ( 0.43%) Min alloc-odr0-16384 235.00 ( 0.00%) 234.00 ( 0.43%) Min free-odr0-1 215.00 ( 0.00%) 191.00 ( 11.16%) Min free-odr0-2 152.00 ( 0.00%) 136.00 ( 10.53%) Min free-odr0-4 119.00 ( 0.00%) 107.00 ( 10.08%) Min free-odr0-8 106.00 ( 0.00%) 96.00 ( 9.43%) Min free-odr0-16 97.00 ( 0.00%) 87.00 ( 10.31%) Min free-odr0-32 91.00 ( 0.00%) 83.00 ( 8.79%) Min free-odr0-64 89.00 ( 0.00%) 81.00 ( 8.99%) Min free-odr0-128 88.00 ( 0.00%) 80.00 ( 9.09%) Min free-odr0-256 106.00 ( 0.00%) 95.00 ( 10.38%) Min free-odr0-512 116.00 ( 0.00%) 111.00 ( 4.31%) Min free-odr0-1024 125.00 ( 0.00%) 118.00 ( 5.60%) Min free-odr0-2048 133.00 ( 0.00%) 126.00 ( 5.26%) Min free-odr0-4096 136.00 ( 0.00%) 130.00 ( 4.41%) Min free-odr0-8192 138.00 ( 0.00%) 130.00 ( 5.80%) Min free-odr0-16384 137.00 ( 0.00%) 130.00 ( 5.11%) There is a sizable boost to the free allocator performance. While there is an apparent boost on the allocation side, it's likely a co-incidence or due to the patches slightly reducing cache footprint. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 7 ++++++- mm/page_alloc.c | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6b052aa7b5b7..a61e06e5fbce 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -371,10 +371,15 @@ PAGEFLAG(Idle, idle, PF_ANY) #define PAGE_MAPPING_KSM 2 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) +static __always_inline int PageAnonHead(struct page *page) +{ + return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; +} + static __always_inline int PageAnon(struct page *page) { page = compound_head(page); - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; + return PageAnonHead(page); } #ifdef CONFIG_KSM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 087ba3e417ec..7be1ce8b6be0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1048,7 +1048,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) bad += free_pages_check(page + i); } } - if (PageAnon(page)) + if (PageAnonHead(page)) page->mapping = NULL; bad += free_pages_check(page); if (bad) -- cgit v1.2.3 From 060e74173f292fb3e0398b3dca8765568d195ff1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:27 -0700 Subject: mm, page_alloc: inline zone_statistics zone_statistics has one call-site but it's a public function. Make it static and inline. 
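The transformation is the classic one-caller pattern; a simplified sketch (not the kernel code, and the counter and helper names here are placeholders) of moving a definition next to its single call site and marking it static inline, so the call overhead disappears:

#include <stdio.h>

static unsigned long numa_hit, numa_miss;

/* Before: an extern function in another translation unit, costing a
 * call/return on every order-0 allocation. After: static inline beside
 * its only caller, folded directly into the fast path. */
static inline void stat_update(int alloc_node, int local_node)
{
    if (alloc_node == local_node)
        numa_hit++;
    else
        numa_miss++;
}

static void alloc_fast_path(int alloc_node, int local_node)
{
    stat_update(alloc_node, local_node);  /* inlined: no call on the hot path */
}

int main(void)
{
    alloc_fast_path(0, 0);
    alloc_fast_path(1, 0);
    printf("hit=%lu miss=%lu\n", numa_hit, numa_miss);
    return 0;
}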
The performance difference on a page allocator microbenchmark is; 4.6.0-rc2 4.6.0-rc2 statbranch-v1r20 statinline-v1r20 Min alloc-odr0-1 419.00 ( 0.00%) 412.00 ( 1.67%) Min alloc-odr0-2 305.00 ( 0.00%) 301.00 ( 1.31%) Min alloc-odr0-4 250.00 ( 0.00%) 247.00 ( 1.20%) Min alloc-odr0-8 219.00 ( 0.00%) 215.00 ( 1.83%) Min alloc-odr0-16 203.00 ( 0.00%) 199.00 ( 1.97%) Min alloc-odr0-32 195.00 ( 0.00%) 191.00 ( 2.05%) Min alloc-odr0-64 191.00 ( 0.00%) 187.00 ( 2.09%) Min alloc-odr0-128 189.00 ( 0.00%) 185.00 ( 2.12%) Min alloc-odr0-256 198.00 ( 0.00%) 193.00 ( 2.53%) Min alloc-odr0-512 210.00 ( 0.00%) 207.00 ( 1.43%) Min alloc-odr0-1024 216.00 ( 0.00%) 213.00 ( 1.39%) Min alloc-odr0-2048 221.00 ( 0.00%) 220.00 ( 0.45%) Min alloc-odr0-4096 227.00 ( 0.00%) 226.00 ( 0.44%) Min alloc-odr0-8192 232.00 ( 0.00%) 229.00 ( 1.29%) Min alloc-odr0-16384 232.00 ( 0.00%) 229.00 ( 1.29%) Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 2 -- mm/page_alloc.c | 31 +++++++++++++++++++++++++++++++ mm/vmstat.c | 29 ----------------------------- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 02fce415b3d9..d2da8e053210 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -163,12 +163,10 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, #ifdef CONFIG_NUMA extern unsigned long node_page_state(int node, enum zone_stat_item item); -extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); #else #define node_page_state(node, item) global_page_state(item) -#define zone_statistics(_zl, _z, gfp) do { } while (0) #endif /* CONFIG_NUMA */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7be1ce8b6be0..36384baa74e1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2354,6 +2354,37 @@ int split_free_page(struct page *page) return nr_pages; } +/* + * Update NUMA hit/miss statistics + * + * Must be called with interrupts disabled. + * + * When __GFP_OTHER_NODE is set assume the node of the preferred + * zone is the local node. This is useful for daemons who allocate + * memory on behalf of other processes. + */ +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + gfp_t flags) +{ +#ifdef CONFIG_NUMA + int local_nid = numa_node_id(); + enum zone_stat_item local_stat = NUMA_LOCAL; + + if (unlikely(flags & __GFP_OTHER_NODE)) { + local_stat = NUMA_OTHER; + local_nid = preferred_zone->node; + } + + if (z->node == local_nid) { + __inc_zone_state(z, NUMA_HIT); + __inc_zone_state(z, local_stat); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(preferred_zone, NUMA_FOREIGN); + } +#endif +} + /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index d585de27e960..f1a73bfb77b5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -569,35 +569,6 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) #endif #ifdef CONFIG_NUMA -/* - * zonelist = the list of zones passed to the allocator - * z = the zone from which the allocation occurred. - * - * Must be called with interrupts disabled. - * - * When __GFP_OTHER_NODE is set assume the node of the preferred - * zone is the local node. This is useful for daemons who allocate - * memory on behalf of other processes. 
- */ -void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) -{ - int local_nid = numa_node_id(); - enum zone_stat_item local_stat = NUMA_LOCAL; - - if (unlikely(flags & __GFP_OTHER_NODE)) { - local_stat = NUMA_OTHER; - local_nid = preferred_zone->node; - } - - if (z->node == local_nid) { - __inc_zone_state(z, NUMA_HIT); - __inc_zone_state(z, local_stat); - } else { - __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(preferred_zone, NUMA_FOREIGN); - } -} - /* * Determine the per node value of a stat item. */ -- cgit v1.2.3 From 682a3385e7734fa3abbd504cbeb5fe91793f1827 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:30 -0700 Subject: mm, page_alloc: inline the fast path of the zonelist iterator The page allocator iterates through a zonelist for zones that match the addressing limitations and nodemask of the caller but many allocations will not be restricted. Despite this, there is always functional call overhead which builds up. This patch inlines the optimistic basic case and only calls the iterator function for the complex case. A hindrance was the fact that cpuset_current_mems_allowed is used in the fastpath as the allowed nodemask even though all nodes are allowed on most systems. The patch handles this by only considering cpuset_current_mems_allowed if a cpuset exists. As well as being faster in the fast-path, this removes some junk in the slowpath. The performance difference on a page allocator microbenchmark is; 4.6.0-rc2 4.6.0-rc2 statinline-v1r20 optiter-v1r20 Min alloc-odr0-1 412.00 ( 0.00%) 382.00 ( 7.28%) Min alloc-odr0-2 301.00 ( 0.00%) 282.00 ( 6.31%) Min alloc-odr0-4 247.00 ( 0.00%) 233.00 ( 5.67%) Min alloc-odr0-8 215.00 ( 0.00%) 203.00 ( 5.58%) Min alloc-odr0-16 199.00 ( 0.00%) 188.00 ( 5.53%) Min alloc-odr0-32 191.00 ( 0.00%) 182.00 ( 4.71%) Min alloc-odr0-64 187.00 ( 0.00%) 177.00 ( 5.35%) Min alloc-odr0-128 185.00 ( 0.00%) 175.00 ( 5.41%) Min alloc-odr0-256 193.00 ( 0.00%) 184.00 ( 4.66%) Min alloc-odr0-512 207.00 ( 0.00%) 197.00 ( 4.83%) Min alloc-odr0-1024 213.00 ( 0.00%) 203.00 ( 4.69%) Min alloc-odr0-2048 220.00 ( 0.00%) 209.00 ( 5.00%) Min alloc-odr0-4096 226.00 ( 0.00%) 214.00 ( 5.31%) Min alloc-odr0-8192 229.00 ( 0.00%) 218.00 ( 4.80%) Min alloc-odr0-16384 229.00 ( 0.00%) 219.00 ( 4.37%) perf indicated that next_zones_zonelist disappeared in the profile and __next_zones_zonelist did not appear. This is expected as the micro-benchmark would hit the inlined fast-path every time. Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 13 +++++++++++-- mm/mmzone.c | 2 +- mm/page_alloc.c | 26 +++++++++----------------- 3 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 150c6049f961..cfcd7723edb6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -919,6 +919,10 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) #endif /* CONFIG_NUMA */ } +struct zoneref *__next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes); + /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z - The cursor used as a starting point for the search @@ -931,9 +935,14 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) * being examined. 
It should be advanced by one before calling * next_zones_zonelist again. */ -struct zoneref *next_zones_zonelist(struct zoneref *z, +static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, - nodemask_t *nodes); + nodemask_t *nodes) +{ + if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) + return z; + return __next_zones_zonelist(z, highest_zoneidx, nodes); +} /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist diff --git a/mm/mmzone.c b/mm/mmzone.c index 52687fb4de6f..5652be858e5e 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) } /* Returns the next zone at or below highest_zoneidx in a zonelist */ -struct zoneref *next_zones_zonelist(struct zoneref *z, +struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36384baa74e1..789e5f065e8d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3192,17 +3192,6 @@ retry: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); - /* - * Find the true preferred zone if the allocation is unconstrained by - * cpusets. - */ - if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { - struct zoneref *preferred_zoneref; - preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, NULL, &ac->preferred_zone); - ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); - } - /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); @@ -3358,14 +3347,21 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zoneref *preferred_zoneref; struct page *page = NULL; unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), + .zonelist = zonelist, .nodemask = nodemask, .migratetype = gfpflags_to_migratetype(gfp_mask), }; + if (cpusets_enabled()) { + alloc_flags |= ALLOC_CPUSET; + if (!ac.nodemask) + ac.nodemask = &cpuset_current_mems_allowed; + } + gfp_mask &= gfp_allowed_mask; lockdep_trace_alloc(gfp_mask); @@ -3389,16 +3385,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - /* We set it here, as __alloc_pages_slowpath might have changed it */ - ac.zonelist = zonelist; - /* Dirty zone balancing only done in the fast path */ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, - ac.nodemask ? : &cpuset_current_mems_allowed, - &ac.preferred_zone); + ac.nodemask, &ac.preferred_zone); if (!ac.preferred_zone) goto out; ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); -- cgit v1.2.3 From c603844bdcb5238980de8d58b393f52d7729d651 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:38 -0700 Subject: mm, page_alloc: convert alloc_flags to unsigned alloc_flags is a bitmask of flags but it is signed which does not necessarily generate the best code depending on the compiler. Even without an impact, it makes more sense that this be unsigned. 
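The point can be illustrated with a toy flags word; this is a hedged sketch, and the ALLOC_* values below are placeholders rather than the kernel's:

#include <stdio.h>

#define ALLOC_WMARK_MIN  0x01u
#define ALLOC_HARDER     0x10u

static unsigned int gfp_to_flags(int want_harder)
{
    unsigned int alloc_flags = ALLOC_WMARK_MIN;  /* unsigned bitmask, as in the patch */

    if (want_harder)
        alloc_flags |= ALLOC_HARDER;
    return alloc_flags;
}

int main(void)
{
    unsigned int flags = gfp_to_flags(1);

    /* Truth-valued test of one bit, as the watermark check does */
    const int alloc_harder = !!(flags & ALLOC_HARDER);

    printf("alloc_flags=%#x harder=%d\n", flags, alloc_harder);
    return 0;
}

With an unsigned type the compiler never has to reason about negative values when shifting or comparing the mask, which is all these flags are ever used for.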
Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 6 +++--- include/linux/mmzone.h | 3 ++- mm/compaction.c | 12 +++++++----- mm/internal.h | 2 +- mm/page_alloc.c | 26 ++++++++++++++------------ 5 files changed, 27 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index d7c8de583a23..242b660f64e6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -39,12 +39,12 @@ extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended); + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx); + unsigned int alloc_flags, int classzone_idx); extern void defer_compaction(struct zone *zone, int order); extern bool compaction_deferred(struct zone *zone, int order); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cfcd7723edb6..327f0fa1e1ce 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -747,7 +747,8 @@ extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags); + unsigned long mark, int classzone_idx, + unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx); enum memmap_context { diff --git a/mm/compaction.c b/mm/compaction.c index 7487067b4613..8f339ca25621 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1313,7 +1313,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_CONTINUE - If compaction should run now */ static unsigned long __compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, + int classzone_idx) { int fragindex; unsigned long watermark; @@ -1358,7 +1359,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, } unsigned long compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, + int classzone_idx) { unsigned long ret; @@ -1530,7 +1532,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, int classzone_idx) { unsigned long ret; struct compact_control cc = { @@ -1571,8 +1573,8 @@ int sysctl_extfrag_threshold = 500; * This is the main entry point for direct page compaction. 
*/ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended) + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; diff --git a/mm/internal.h b/mm/internal.h index 098a89e3b97c..114593aab55c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -175,7 +175,7 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const int alloc_flags; /* alloc flags of a direct compactor */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int classzone_idx; /* zone index of a direct compactor */ struct zone *zone; int contended; /* Signal need_sched() or lock diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7f328cfb137d..094587a4ed81 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1525,7 +1525,7 @@ static inline bool free_pages_prezeroed(bool poisoned) } static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - int alloc_flags) + unsigned int alloc_flags) { int i; bool poisoned = true; @@ -2391,7 +2391,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int alloc_flags, int migratetype) + gfp_t gfp_flags, unsigned int alloc_flags, + int migratetype) { unsigned long flags; struct page *page; @@ -2545,12 +2546,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * to check in the allocation paths if no pages are free. 
*/ static bool __zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags, + unsigned long mark, int classzone_idx, + unsigned int alloc_flags, long free_pages) { long min = mark; int o; - const int alloc_harder = (alloc_flags & ALLOC_HARDER); + const bool alloc_harder = (alloc_flags & ALLOC_HARDER); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2613,7 +2615,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, int alloc_flags) + int classzone_idx, unsigned int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); @@ -2957,7 +2959,7 @@ out: /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { @@ -3013,7 +3015,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { @@ -3053,7 +3055,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page = NULL; @@ -3092,10 +3094,10 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); } -static inline int +static inline unsigned int gfp_to_alloc_flags(gfp_t gfp_mask) { - int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -3156,7 +3158,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; - int alloc_flags; + unsigned int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; @@ -3348,7 +3350,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zoneref *preferred_zoneref; struct page *page = NULL; unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; + unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), -- cgit v1.2.3 From 09940a4f1e816abe3248fa0d185fc0e7f54c8c12 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:53 -0700 Subject: mm, page_alloc: simplify last cpupid reset The current reset unnecessarily clears flags and makes pointless calculations. 
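The equivalence the patch relies on can be checked in a few lines; this is a userspace sketch in which the field width and position are made-up placeholders: clearing the last-cpupid field and then writing all ones into it is the same as OR-ing the shifted mask in, because the reset value is all ones.

#include <assert.h>
#include <stdio.h>

#define LAST_CPUPID_SHIFT   8   /* placeholder field width */
#define LAST_CPUPID_PGSHIFT 16  /* placeholder field position */
#define LAST_CPUPID_MASK    ((1UL << LAST_CPUPID_SHIFT) - 1)

static unsigned long reset_old(unsigned long flags)
{
    unsigned long cpupid = (1UL << LAST_CPUPID_SHIFT) - 1;

    flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);          /* clear field */
    flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;  /* insert all ones */
    return flags;
}

static unsigned long reset_new(unsigned long flags)
{
    return flags | (LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);     /* one OR */
}

int main(void)
{
    unsigned long flags = 0xdeadbeefUL;

    /* Field ends up all ones either way; other bits are untouched */
    assert(reset_old(flags) == reset_new(flags));
    printf("both resets agree: %#lx\n", reset_new(flags));
    return 0;
}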
Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c2852cabf01..2b97be1147ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -850,10 +850,7 @@ extern int page_cpupid_xchg_last(struct page *page, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { - int cpupid = (1 << LAST_CPUPID_SHIFT) - 1; - - page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); - page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ #else /* !CONFIG_NUMA_BALANCING */ -- cgit v1.2.3 From c33d6c06f60f710f0305ae792773e1c2560e1e51 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:10 -0700 Subject: mm, page_alloc: avoid looking up the first zone in a zonelist twice The allocator fast path looks up the first usable zone in a zonelist and then get_page_from_freelist does the same job in the zonelist iterator. This patch preserves the necessary information. 4.6.0-rc2 4.6.0-rc2 fastmark-v1r20 initonce-v1r20 Min alloc-odr0-1 364.00 ( 0.00%) 359.00 ( 1.37%) Min alloc-odr0-2 262.00 ( 0.00%) 260.00 ( 0.76%) Min alloc-odr0-4 214.00 ( 0.00%) 214.00 ( 0.00%) Min alloc-odr0-8 186.00 ( 0.00%) 186.00 ( 0.00%) Min alloc-odr0-16 173.00 ( 0.00%) 173.00 ( 0.00%) Min alloc-odr0-32 165.00 ( 0.00%) 165.00 ( 0.00%) Min alloc-odr0-64 161.00 ( 0.00%) 162.00 ( -0.62%) Min alloc-odr0-128 159.00 ( 0.00%) 161.00 ( -1.26%) Min alloc-odr0-256 168.00 ( 0.00%) 170.00 ( -1.19%) Min alloc-odr0-512 180.00 ( 0.00%) 181.00 ( -0.56%) Min alloc-odr0-1024 190.00 ( 0.00%) 190.00 ( 0.00%) Min alloc-odr0-2048 196.00 ( 0.00%) 196.00 ( 0.00%) Min alloc-odr0-4096 202.00 ( 0.00%) 202.00 ( 0.00%) Min alloc-odr0-8192 206.00 ( 0.00%) 205.00 ( 0.49%) Min alloc-odr0-16384 206.00 ( 0.00%) 205.00 ( 0.49%) The benefit is negligible and the results are within the noise but each cycle counts. 
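The caching idea in miniature; a simplified sketch, not the kernel iterator, with an invented zone table: keep the cursor returned by the first lookup and resume iteration from it instead of searching again.

#include <stdio.h>

struct zoneref { int zone_idx; const char *name; };

/* Zonelists are ordered from highest zone index downward */
static struct zoneref zonelist[] = {
    { 2, "HIGHMEM" }, { 1, "NORMAL" }, { 0, "DMA" }, { -1, NULL },
};

/* One linear search, done once; the returned cursor is kept by the caller */
static struct zoneref *first_zones_zonelist(int highest_zoneidx)
{
    struct zoneref *z = zonelist;

    while (z->name && z->zone_idx > highest_zoneidx)
        z++;
    return z;    /* z->name == NULL means "no usable zone" */
}

int main(void)
{
    struct zoneref *preferred = first_zones_zonelist(1);

    /* Later users resume from the cached cursor; no second search */
    for (struct zoneref *z = preferred; z->name; z++)
        printf("candidate zone: %s\n", z->name);
    return 0;
}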
Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 10 +++++----- include/linux/mmzone.h | 18 +++++++++++------- mm/internal.h | 2 +- mm/mempolicy.c | 19 ++++++++++--------- mm/page_alloc.c | 34 ++++++++++++++++------------------ 5 files changed, 43 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index af0d9a82a8ed..754813a6962b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -255,17 +255,17 @@ out: */ static void free_more_memory(void) { - struct zone *zone; + struct zoneref *z; int nid; wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { - (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS), NULL, - &zone); - if (zone) + + z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL); + if (z->zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, GFP_NOFS, NULL); } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 327f0fa1e1ce..4b28d2f8125e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -959,13 +959,10 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, - nodemask_t *nodes, - struct zone **zone) + nodemask_t *nodes) { - struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs, + return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); - *zone = zonelist_zone(z); - return z; } /** @@ -980,10 +977,17 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ - for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ + for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ - zone = zonelist_zone(z)) \ + zone = zonelist_zone(z)) + +#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ + for (zone = z->zone; \ + zone; \ + z = next_zones_zonelist(++z, highidx, nodemask), \ + zone = zonelist_zone(z)) + /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index diff --git a/mm/internal.h b/mm/internal.h index 114593aab55c..d1ddd71c1bbf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -102,7 +102,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; - struct zone *preferred_zone; + struct zoneref *preferred_zoneref; int classzone_idx; int migratetype; enum zone_type high_zoneidx; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7f80ebcd6552..297d6854f849 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1739,18 +1739,18 @@ unsigned int mempolicy_slab_node(void) return interleave_nodes(policy); case MPOL_BIND: { + struct zoneref *z; + /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; - struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[0]; - (void)first_zones_zonelist(zonelist, highest_zoneidx, - &policy->v.nodes, - &zone); - return zone ? zone->node : node; + z = first_zones_zonelist(zonelist, highest_zoneidx, + &policy->v.nodes); + return z->zone ? 
z->zone->node : node; } default: @@ -2266,7 +2266,7 @@ static void sp_free(struct sp_node *n) int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; - struct zone *zone; + struct zoneref *z; int curnid = page_to_nid(page); unsigned long pgoff; int thiscpu = raw_smp_processor_id(); @@ -2298,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_BIND: + /* * allows binding to multiple nodes. * use current page if in policy nodemask, @@ -2306,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long */ if (node_isset(curnid, pol->v.nodes)) goto out; - (void)first_zones_zonelist( + z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), - &pol->v.nodes, &zone); - polnid = zone->node; + &pol->v.nodes); + polnid = z->zone->node; break; default: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 732875b1bdfb..dba8cfd0b2d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2704,7 +2704,7 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zoneref *z; + struct zoneref *z = ac->preferred_zoneref; struct zone *zone; bool fair_skipped = false; bool apply_fair = (alloc_flags & ALLOC_FAIR); @@ -2714,7 +2714,7 @@ zonelist_scan: * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { struct page *page; unsigned long mark; @@ -2734,7 +2734,7 @@ zonelist_scan: fair_skipped = true; continue; } - if (!zone_local(ac->preferred_zone, zone)) { + if (!zone_local(ac->preferred_zoneref->zone, zone)) { if (fair_skipped) goto reset_fair; apply_fair = false; @@ -2780,7 +2780,7 @@ zonelist_scan: goto try_this_zone; if (zone_reclaim_mode == 0 || - !zone_allows_reclaim(ac->preferred_zone, zone)) + !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; ret = zone_reclaim(zone, gfp_mask, order); @@ -2802,7 +2802,7 @@ zonelist_scan: } try_this_zone: - page = buffered_rmqueue(ac->preferred_zone, zone, order, + page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { if (prep_new_page(page, order, gfp_mask, alloc_flags)) @@ -2831,7 +2831,7 @@ try_this_zone: reset_fair: apply_fair = false; fair_skipped = false; - reset_alloc_batches(ac->preferred_zone); + reset_alloc_batches(ac->preferred_zoneref->zone); goto zonelist_scan; } @@ -3114,7 +3114,7 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); + wakeup_kswapd(zone, order, zonelist_zone_idx(ac->preferred_zoneref)); } static inline unsigned int @@ -3332,7 +3332,7 @@ retry: if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { /* Wait for some write requests to complete then retry */ - wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); + wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50); goto retry; } @@ -3370,7 +3370,6 @@ struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 
nodemask_t *nodemask) { - struct zoneref *preferred_zoneref; struct page *page; unsigned int cpuset_mems_cookie; unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; @@ -3416,14 +3415,14 @@ retry_cpuset: ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* The preferred zone is used for statistics later */ - preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, - ac.nodemask, &ac.preferred_zone); - if (!ac.preferred_zone) { + ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, + ac.high_zoneidx, ac.nodemask); + if (!ac.preferred_zoneref) { page = NULL; goto no_zone; } - ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); + ac.classzone_idx = zonelist_zone_idx(ac.preferred_zoneref); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); @@ -4462,13 +4461,12 @@ static void build_zonelists(pg_data_t *pgdat) */ int local_memory_node(int node) { - struct zone *zone; + struct zoneref *z; - (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), - NULL, - &zone); - return zone->node; + NULL); + return z->zone->node; } #endif -- cgit v1.2.3 From 0b423ca22f95a867f789aab1fe57ee4e378df43b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:27 -0700 Subject: mm, page_alloc: inline pageblock lookup in page free fast paths The function call overhead of get_pfnblock_flags_mask() is measurable in the page free paths. This patch uses an inlined version that is faster. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 7 -- mm/page_alloc.c | 188 ++++++++++++++++++++++++++----------------------- mm/page_owner.c | 2 +- mm/vmstat.c | 2 +- 4 files changed, 102 insertions(+), 97 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4b28d2f8125e..c60db2096fd8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -85,13 +85,6 @@ extern int page_group_by_mobility_disabled; get_pfnblock_flags_mask(page, page_to_pfn(page), \ PB_migrate_end, MIGRATETYPE_MASK) -static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) -{ - BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); - return get_pfnblock_flags_mask(page, pfn, PB_migrate_end, - MIGRATETYPE_MASK); -} - struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 822ce86fc883..bdf7a13311b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat, } #endif +/* Return a pointer to the bitmap storing bits affecting a block of pages */ +static inline unsigned long *get_pageblock_bitmap(struct page *page, + unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + return __pfn_to_section(pfn)->pageblock_flags; +#else + return page_zone(page)->pageblock_flags; +#endif /* CONFIG_SPARSEMEM */ +} + +static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + pfn &= (PAGES_PER_SECTION-1); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#else + pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#endif /* CONFIG_SPARSEMEM */ +} + +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of 
pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ +static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long word; + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; +} + +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); +} + +static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) +{ + return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); +} + +/** + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in + */ +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); + + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = READ_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } +} void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -6831,94 +6931,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct page *page, - unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - return __pfn_to_section(pfn)->pageblock_flags; -#else - return page_zone(page)->pageblock_flags; -#endif /* CONFIG_SPARSEMEM */ -} - -static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - pfn &= (PAGES_PER_SECTION-1); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#else - pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#endif /* CONFIG_SPARSEMEM */ -} - -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest to retrieve - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ -unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long 
pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long word; - - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; - bitidx += end_bitidx; - return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; -} - -/** - * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @flags: The flags to set - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest - * @mask: mask of bits that the caller is interested in - */ -void set_pfnblock_flags_mask(struct page *page, unsigned long flags, - unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long old_word, word; - - BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); - - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); - - bitidx += end_bitidx; - mask <<= (BITS_PER_LONG - bitidx - 1); - flags <<= (BITS_PER_LONG - bitidx - 1); - - word = READ_ONCE(bitmap[word_bitidx]); - for (;;) { - old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); - if (word == old_word) - break; - word = old_word; - } -} - /* * This function checks whether pageblock includes unmovable pages or not. * If @count is not zero, it is okay to include less @count unmovable pages diff --git a/mm/page_owner.c b/mm/page_owner.c index 438768c092ac..792b56da13d8 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, goto err; /* Print information relevant to grouping pages by mobility */ - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", diff --git a/mm/vmstat.c b/mm/vmstat.c index f1a73bfb77b5..5b72a8ad2813 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1041,7 +1041,7 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, block_end_pfn = min(block_end_pfn, end_pfn); page = pfn_to_page(pfn); - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { if (!pfn_valid_within(pfn)) -- cgit v1.2.3 From 002f290627c27068087f6204baec7a334e5a3b48 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:14:30 -0700 Subject: cpuset: use static key better and convert to new API An important function for cpusets is cpuset_node_allowed(), which optimizes on the fact that if there is only the single root CPU set, allocation must be trivially allowed. But the check "nr_cpusets() <= 1" doesn't use the cpusets_enabled_key static key the right way, where static keys eliminate branching overhead with jump labels. This patch converts it so that the static key is used properly. It's also switched to the new static key API and the checking functions are converted to return bool instead of int. We also provide a new variant __cpuset_zone_allowed() which expects that the static key check was already done and the key was enabled. A sketch of the resulting pattern is shown below.
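In outline, condensed from the hunks that follow (the includes and the extern declaration are added here only to keep the sketch self-contained; it is not the complete patch):

#include <linux/gfp.h>
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);

static inline bool cpusets_enabled(void)
{
	/* Patched to a straight-line no-op while the key is disabled */
	return static_branch_unlikely(&cpusets_enabled_key);
}

static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	if (cpusets_enabled())
		return __cpuset_node_allowed(node, gfp_mask);
	return true;
}

/*
 * Creating a second cpuset enables the key and removing it disables it
 * again, so the slow path is reachable only while real cpusets exist.
 */
static inline void cpuset_inc(void)
{
	static_branch_inc(&cpusets_enabled_key);
}

static inline void cpuset_dec(void)
{
	static_branch_dec(&cpusets_enabled_key);
}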
The __cpuset_zone_allowed() variant is needed for get_page_from_freelist(), where we want to also avoid the relatively slower check when ALLOC_CPUSET is not set in alloc_flags. The impact on the page allocator microbenchmark is less than expected but the cleanup in itself is worthwhile.

                               4.6.0-rc2          4.6.0-rc2
                         multcheck-v1r20       cpuset-v1r20
Min  alloc-odr0-1       348.00 ( 0.00%)    348.00 ( 0.00%)
Min  alloc-odr0-2       254.00 ( 0.00%)    254.00 ( 0.00%)
Min  alloc-odr0-4       213.00 ( 0.00%)    213.00 ( 0.00%)
Min  alloc-odr0-8       186.00 ( 0.00%)    183.00 ( 1.61%)
Min  alloc-odr0-16      173.00 ( 0.00%)    171.00 ( 1.16%)
Min  alloc-odr0-32      166.00 ( 0.00%)    163.00 ( 1.81%)
Min  alloc-odr0-64      162.00 ( 0.00%)    159.00 ( 1.85%)
Min  alloc-odr0-128     160.00 ( 0.00%)    157.00 ( 1.88%)
Min  alloc-odr0-256     169.00 ( 0.00%)    166.00 ( 1.78%)
Min  alloc-odr0-512     180.00 ( 0.00%)    180.00 ( 0.00%)
Min  alloc-odr0-1024    188.00 ( 0.00%)    187.00 ( 0.53%)
Min  alloc-odr0-2048    194.00 ( 0.00%)    193.00 ( 0.52%)
Min  alloc-odr0-4096    199.00 ( 0.00%)    198.00 ( 0.50%)
Min  alloc-odr0-8192    202.00 ( 0.00%)    201.00 ( 0.50%)
Min  alloc-odr0-16384   203.00 ( 0.00%)    202.00 ( 0.49%)

Signed-off-by: Vlastimil Babka Signed-off-by: Mel Gorman Acked-by: Zefan Li Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 42 ++++++++++++++++++++++++++++-------------- kernel/cpuset.c | 14 +++++++------- mm/page_alloc.c | 2 +- 3 files changed, 36 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 85a868ccb493..bfc204e70338 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -16,26 +16,26 @@ #ifdef CONFIG_CPUSETS -extern struct static_key cpusets_enabled_key; +extern struct static_key_false cpusets_enabled_key; static inline bool cpusets_enabled(void) { - return static_key_false(&cpusets_enabled_key); + return static_branch_unlikely(&cpusets_enabled_key); } static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key) + 1; + return static_key_count(&cpusets_enabled_key.key) + 1; } static inline void cpuset_inc(void) { - static_key_slow_inc(&cpusets_enabled_key); + static_branch_inc(&cpusets_enabled_key); } static inline void cpuset_dec(void) { - static_key_slow_dec(&cpusets_enabled_key); + static_branch_dec(&cpusets_enabled_key); } extern int cpuset_init(void); @@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p); void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern int __cpuset_node_allowed(int node, gfp_t gfp_mask); +extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask); -static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) +static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask) { - return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask); + if (cpusets_enabled()) + return __cpuset_node_allowed(node, gfp_mask); + return true; } -static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return cpuset_node_allowed(zone_to_nid(z), gfp_mask); + return __cpuset_node_allowed(zone_to_nid(z), gfp_mask); +} + +static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +{ + if (cpusets_enabled()) + return __cpuset_zone_allowed(z, gfp_mask); + return true; } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, @@ -172,14 +181,19 @@ static inline int 
cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) return 1; } -static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) +static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask) { - return 1; + return true; } -static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return 1; + return true; +} + +static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +{ + return true; } static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 611cc69af8f0..73e93e53884d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,7 +61,7 @@ #include #include -struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ @@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -int __cpuset_node_allowed(int node, gfp_t gfp_mask) +bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) - return 1; + return true; if (node_isset(node, current->mems_allowed)) - return 1; + return true; /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; + return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return 0; + return false; if (current->flags & PF_EXITING) /* Let dying task have memory */ - return 1; + return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdf7a13311b5..39c441bb8d61 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2859,7 +2859,7 @@ zonelist_scan: if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(zone, gfp_mask)) + !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* * Distribute pages in proportion to the individual -- cgit v1.2.3 From 44bfc42e94cd76a0bd44f3fce98d4a7b76f31bc0 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 4 May 2016 14:35:48 +0100 Subject: KVM: arm/arm64: move GICv2 emulation defines into arm-gic-v3.h As (some) GICv3 hosts can emulate a GICv2, some GICv2 specific masks for the list register definition also apply to GICv3 LRs. At the moment we have those definitions in the KVM VGICv3 implementation, so let's move them into the GICv3 header file to have them automatically defined. 
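To see why the GICv3 code needs both sets of masks, consider how a list register's virtual interrupt ID is decoded: the field is 10 bits wide when a GICv2 is being emulated, but 32 bits wide natively. The helper below is illustrative only (the function name is made up; the mask values match the definitions used in this and the following patches):

#define GICH_LR_VIRTUALID	(0x3ffUL << 0)		/* GICv2 emulation: 10-bit ID */
#define ICH_LR_VIRTUAL_ID_MASK	((1ULL << 32) - 1)	/* native GICv3: 32-bit ID */

/* Illustrative helper, not kernel code. */
static u32 lr_virtual_intid(u64 lr_val, bool native_gicv3)
{
	if (native_gicv3)
		return (u32)(lr_val & ICH_LR_VIRTUAL_ID_MASK);
	return (u32)(lr_val & GICH_LR_VIRTUALID);
}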
Signed-off-by: Andre Przywara Acked-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 5 +++++ virt/kvm/arm/vgic-v3.c | 8 +------- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index d5d798b35c1f..ec938d14da5d 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -276,6 +276,11 @@ #define ICH_LR_PHYS_ID_SHIFT 32 #define ICH_LR_PHYS_ID_MASK (0x3ffULL << ICH_LR_PHYS_ID_SHIFT) +/* These are for GICv2 emulation only */ +#define GICH_LR_VIRTUALID (0x3ffUL << 0) +#define GICH_LR_PHYSID_CPUID_SHIFT (10) +#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) + #define ICH_MISR_EOI (1 << 0) #define ICH_MISR_U (1 << 1) diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c index c02a1b1cf855..75b02fa86436 100644 --- a/virt/kvm/arm/vgic-v3.c +++ b/virt/kvm/arm/vgic-v3.c @@ -29,12 +29,6 @@ #include #include -/* These are for GICv2 emulation only */ -#define GICH_LR_VIRTUALID (0x3ffUL << 0) -#define GICH_LR_PHYSID_CPUID_SHIFT (10) -#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) -#define ICH_LR_VIRTUALID_MASK (BIT_ULL(32) - 1) - static u32 ich_vtr_el2; static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr) @@ -43,7 +37,7 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr) u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr]; if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) - lr_desc.irq = val & ICH_LR_VIRTUALID_MASK; + lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK; else lr_desc.irq = val & GICH_LR_VIRTUALID; -- cgit v1.2.3 From 140b086dd19771410915a924db2e635c2b51a0f4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 26 Nov 2015 17:19:25 +0000 Subject: KVM: arm/arm64: vgic-new: Add GICv2 world switch backend Processing maintenance interrupts and accessing the list registers are dependent on the host's GIC version. Introduce vgic-v2.c to contain GICv2 specific functions. Implement the GICv2 specific code for syncing the emulation state into the VGIC registers. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Eric Auger Signed-off-by: Andre Przywara Reviewed-by: Eric Auger Reviewed-by: Christoffer Dall --- include/linux/irqchip/arm-gic.h | 1 + virt/kvm/arm/vgic/vgic-v2.c | 176 ++++++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.c | 6 ++ virt/kvm/arm/vgic/vgic.h | 6 ++ 4 files changed, 189 insertions(+) create mode 100644 virt/kvm/arm/vgic/vgic-v2.c (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 9c940263ca23..be0d26f940af 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -76,6 +76,7 @@ #define GICH_LR_VIRTUALID (0x3ff << 0) #define GICH_LR_PHYSID_CPUID_SHIFT (10) #define GICH_LR_PHYSID_CPUID (0x3ff << GICH_LR_PHYSID_CPUID_SHIFT) +#define GICH_LR_PRIORITY_SHIFT 23 #define GICH_LR_STATE (3 << 28) #define GICH_LR_PENDING_BIT (1 << 28) #define GICH_LR_ACTIVE_BIT (1 << 29) diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c new file mode 100644 index 000000000000..fb5e65ceffd0 --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -0,0 +1,176 @@ +/* + * Copyright (C) 2015, 2016 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +#include "vgic.h" + +/* + * Call this function to convert a u64 value to an unsigned long * bitmask + * in a way that works on both 32-bit and 64-bit LE and BE platforms. + * + * Warning: Calling this function may modify *val. + */ +static unsigned long *u64_to_bitmask(u64 *val) +{ +#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32 + *val = (*val >> 32) | (*val << 32); +#endif + return (unsigned long *)val; +} + +void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + + if (cpuif->vgic_misr & GICH_MISR_EOI) { + u64 eisr = cpuif->vgic_eisr; + unsigned long *eisr_bmap = u64_to_bitmask(&eisr); + int lr; + + for_each_set_bit(lr, eisr_bmap, kvm_vgic_global_state.nr_lr) { + u32 intid = cpuif->vgic_lr[lr] & GICH_LR_VIRTUALID; + + WARN_ON(cpuif->vgic_lr[lr] & GICH_LR_STATE); + + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + } + } + + /* check and disable underflow maintenance IRQ */ + cpuif->vgic_hcr &= ~GICH_HCR_UIE; + + /* + * In the next iterations of the vcpu loop, if we sync the + * vgic state after flushing it, but before entering the guest + * (this happens for pending signals and vmid rollovers), then + * make sure we don't pick up any old maintenance interrupts + * here. + */ + cpuif->vgic_eisr = 0; +} + +void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + + cpuif->vgic_hcr |= GICH_HCR_UIE; +} + +/* + * transfer the content of the LRs back into the corresponding ap_list: + * - active bit is transferred as is + * - pending bit is + * - transferred as is in case of edge sensitive IRQs + * - set to the line-level (resample time) for level sensitive IRQs + */ +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + int lr; + + for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { + u32 val = cpuif->vgic_lr[lr]; + u32 intid = val & GICH_LR_VIRTUALID; + struct vgic_irq *irq; + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + + spin_lock(&irq->irq_lock); + + /* Always preserve the active bit */ + irq->active = !!(val & GICH_LR_ACTIVE_BIT); + + /* Edge is the only case where we preserve the pending bit */ + if (irq->config == VGIC_CONFIG_EDGE && + (val & GICH_LR_PENDING_BIT)) { + irq->pending = true; + + if (vgic_irq_is_sgi(intid)) { + u32 cpuid = val & GICH_LR_PHYSID_CPUID; + + cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; + irq->source |= (1 << cpuid); + } + } + + /* Clear soft pending state when level IRQs have been acked */ + if (irq->config == VGIC_CONFIG_LEVEL && + !(val & GICH_LR_PENDING_BIT)) { + irq->soft_pending = false; + irq->pending = irq->line_level; + } + + spin_unlock(&irq->irq_lock); + } +} + +/* + * Populates the particular LR with the state of a given IRQ: + * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq + * - for a level sensitive IRQ the pending state value is unchanged; + * it is dictated directly by the input level + * + * If @irq describes an SGI with multiple sources, we choose the + * lowest-numbered source VCPU and clear 
that bit in the source bitmap. + * + * The irq_lock must be held by the caller. + */ +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 val = irq->intid; + + if (irq->pending) { + val |= GICH_LR_PENDING_BIT; + + if (irq->config == VGIC_CONFIG_EDGE) + irq->pending = false; + + if (vgic_irq_is_sgi(irq->intid)) { + u32 src = ffs(irq->source); + + BUG_ON(!src); + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + irq->source &= ~(1 << (src - 1)); + if (irq->source) + irq->pending = true; + } + } + + if (irq->active) + val |= GICH_LR_ACTIVE_BIT; + + if (irq->hw) { + val |= GICH_LR_HW; + val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT; + } else { + if (irq->config == VGIC_CONFIG_LEVEL) + val |= GICH_LR_EOI; + } + + /* The GICv2 LR only holds five bits of priority. */ + val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; + + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; +} + +void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0; +} diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 08a862a98442..44d2533ac84e 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -400,10 +400,12 @@ retry: static inline void vgic_process_maintenance_interrupt(struct kvm_vcpu *vcpu) { + vgic_v2_process_maintenance(vcpu); } static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) { + vgic_v2_fold_lr_state(vcpu); } /* Requires the irq_lock to be held. */ @@ -411,14 +413,18 @@ static inline void vgic_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) { DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); + + vgic_v2_populate_lr(vcpu, irq, lr); } static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) { + vgic_v2_clear_lr(vcpu, lr); } static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) { + vgic_v2_set_underflow(vcpu); } /* Requires the ap_list_lock to be held. */ diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 29b96b96a30b..0db490e491ef 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -22,4 +22,10 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq); +void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu); +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); +void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); + #endif -- cgit v1.2.3 From 59529f69f5048e50dcde3434661981c01f8208b4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 30 Nov 2015 13:09:53 +0000 Subject: KVM: arm/arm64: vgic-new: Add GICv3 world switch backend As the GICv3 virtual interface registers differ from their GICv2 siblings, we need different handlers for processing maintenance interrupts and reading/writing to the LRs. Implement the respective handler functions and connect them to existing code to be called if the host is using a GICv3. 
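The connection to existing code is mechanical: each common entry point branches on the host GIC type, as in this condensed excerpt from the vgic.c hunk below:

static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
{
	/*
	 * Dispatch on the host GIC; the guest-visible model is
	 * handled inside the per-version backend.
	 */
	if (kvm_vgic_global_state.type == VGIC_V2)
		vgic_v2_fold_lr_state(vcpu);
	else
		vgic_v3_fold_lr_state(vcpu);
}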
Signed-off-by: Marc Zyngier Signed-off-by: Andre Przywara Reviewed-by: Christoffer Dall --- include/linux/irqchip/arm-gic-v3.h | 1 + virt/kvm/arm/vgic/vgic-v3.c | 162 +++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.c | 25 ++++-- virt/kvm/arm/vgic/vgic.h | 29 +++++++ 4 files changed, 212 insertions(+), 5 deletions(-) create mode 100644 virt/kvm/arm/vgic/vgic-v3.c (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index ec938d14da5d..35e93cfa1742 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -275,6 +275,7 @@ #define ICH_LR_ACTIVE_BIT (1ULL << 63) #define ICH_LR_PHYS_ID_SHIFT 32 #define ICH_LR_PHYS_ID_MASK (0x3ffULL << ICH_LR_PHYS_ID_SHIFT) +#define ICH_LR_PRIORITY_SHIFT 48 /* These are for GICv2 emulation only */ #define GICH_LR_VIRTUALID (0x3ffUL << 0) diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c new file mode 100644 index 000000000000..fb547da7a43d --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -0,0 +1,162 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +#include "vgic.h" + +void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + + if (cpuif->vgic_misr & ICH_MISR_EOI) { + unsigned long eisr_bmap = cpuif->vgic_eisr; + int lr; + + for_each_set_bit(lr, &eisr_bmap, kvm_vgic_global_state.nr_lr) { + u32 intid; + u64 val = cpuif->vgic_lr[lr]; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V3) + intid = val & ICH_LR_VIRTUAL_ID_MASK; + else + intid = val & GICH_LR_VIRTUALID; + + WARN_ON(cpuif->vgic_lr[lr] & ICH_LR_STATE); + + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + } + + /* + * In the next iterations of the vcpu loop, if we sync + * the vgic state after flushing it, but before + * entering the guest (this happens for pending + * signals and vmid rollovers), then make sure we + * don't pick up any old maintenance interrupts here. 
+ */ + cpuif->vgic_eisr = 0; + } + + cpuif->vgic_hcr &= ~ICH_HCR_UIE; +} + +void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + + cpuif->vgic_hcr |= ICH_HCR_UIE; +} + +void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + int lr; + + for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { + u64 val = cpuif->vgic_lr[lr]; + u32 intid; + struct vgic_irq *irq; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V3) + intid = val & ICH_LR_VIRTUAL_ID_MASK; + else + intid = val & GICH_LR_VIRTUALID; + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + + spin_lock(&irq->irq_lock); + + /* Always preserve the active bit */ + irq->active = !!(val & ICH_LR_ACTIVE_BIT); + + /* Edge is the only case where we preserve the pending bit */ + if (irq->config == VGIC_CONFIG_EDGE && + (val & ICH_LR_PENDING_BIT)) { + irq->pending = true; + + if (vgic_irq_is_sgi(intid) && + model == KVM_DEV_TYPE_ARM_VGIC_V2) { + u32 cpuid = val & GICH_LR_PHYSID_CPUID; + + cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; + irq->source |= (1 << cpuid); + } + } + + /* Clear soft pending state when level irqs have been acked */ + if (irq->config == VGIC_CONFIG_LEVEL && + !(val & ICH_LR_PENDING_BIT)) { + irq->soft_pending = false; + irq->pending = irq->line_level; + } + + spin_unlock(&irq->irq_lock); + } +} + +/* Requires the irq to be locked already */ +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u64 val = irq->intid; + + if (irq->pending) { + val |= ICH_LR_PENDING_BIT; + + if (irq->config == VGIC_CONFIG_EDGE) + irq->pending = false; + + if (vgic_irq_is_sgi(irq->intid) && + model == KVM_DEV_TYPE_ARM_VGIC_V2) { + u32 src = ffs(irq->source); + + BUG_ON(!src); + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + irq->source &= ~(1 << (src - 1)); + if (irq->source) + irq->pending = true; + } + } + + if (irq->active) + val |= ICH_LR_ACTIVE_BIT; + + if (irq->hw) { + val |= ICH_LR_HW; + val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; + } else { + if (irq->config == VGIC_CONFIG_LEVEL) + val |= ICH_LR_EOI; + } + + /* + * We currently only support Group1 interrupts, which is a + * known defect. This needs to be addressed at some point. + */ + if (model == KVM_DEV_TYPE_ARM_VGIC_V3) + val |= ICH_LR_GROUP; + + val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; + + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; +} + +void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0; +} diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 44d2533ac84e..0bf0d2060053 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -400,12 +400,18 @@ retry: static inline void vgic_process_maintenance_interrupt(struct kvm_vcpu *vcpu) { - vgic_v2_process_maintenance(vcpu); + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_process_maintenance(vcpu); + else + vgic_v3_process_maintenance(vcpu); } static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) { - vgic_v2_fold_lr_state(vcpu); + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_fold_lr_state(vcpu); + else + vgic_v3_fold_lr_state(vcpu); } /* Requires the irq_lock to be held. 
*/ @@ -414,17 +420,26 @@ static inline void vgic_populate_lr(struct kvm_vcpu *vcpu, { DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); - vgic_v2_populate_lr(vcpu, irq, lr); + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_populate_lr(vcpu, irq, lr); + else + vgic_v3_populate_lr(vcpu, irq, lr); } static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) { - vgic_v2_clear_lr(vcpu, lr); + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_clear_lr(vcpu, lr); + else + vgic_v3_clear_lr(vcpu, lr); } static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) { - vgic_v2_set_underflow(vcpu); + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_underflow(vcpu); + else + vgic_v3_set_underflow(vcpu); } /* Requires the ap_list_lock to be held. */ diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 0db490e491ef..81b1a20dfbc5 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -28,4 +28,33 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM_ARM_VGIC_V3 +void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu); +void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); +void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); +#else +static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) +{ +} + +static inline void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) +{ +} + +static inline void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, + struct vgic_irq *irq, int lr) +{ +} + +static inline void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ +} + +static inline void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +{ +} +#endif + #endif -- cgit v1.2.3 From 2b0cda8789654bfcebca397daebc37aff081bd75 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Apr 2016 11:06:47 +0100 Subject: KVM: arm/arm64: vgic-new: Add CTLR, TYPER and IIDR handlers Those three registers are v2 emulation specific, so their implementation lives entirely in vgic-mmio-v2.c. Also they are handled in one function, as their implementation is pretty simple. When the guest enables the distributor, we kick all VCPUs to get potentially pending interrupts serviced. 
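As a worked example of the GIC_DIST_CTR (GICD_TYPER) encoding used in the read handler below, with made-up numbers: a guest with 96 SPIs has 96 + 32 = 128 interrupts, so ITLinesNumber reads as 3 (32 * (3 + 1) = 128), and with 4 online VCPUs the CPU number field reads as 3:

/* Sketch with hypothetical inputs, mirroring vgic_mmio_read_v2_misc(). */
static u32 typer_value(u32 nr_spis, u32 nr_vcpus)
{
	u32 value = nr_spis + 32;	/* plus VGIC_NR_PRIVATE_IRQS */

	value = (value >> 5) - 1;	/* ITLinesNumber: 32*(N+1) IRQs */
	value |= (nr_vcpus - 1) << 5;	/* CPU number field */
	return value;			/* typer_value(96, 4) == 0x63 */
}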
Signed-off-by: Marc Zyngier Signed-off-by: Andre Przywara Reviewed-by: Christoffer Dall --- include/linux/irqchip/arm-gic.h | 1 + virt/kvm/arm/vgic/vgic-mmio-v2.c | 46 +++++++++++++++++++++++++++++++++++++++- virt/kvm/arm/vgic/vgic.c | 15 +++++++++++++ virt/kvm/arm/vgic/vgic.h | 4 ++++ 4 files changed, 65 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index be0d26f940af..fd051855539b 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -33,6 +33,7 @@ #define GIC_DIST_CTRL 0x000 #define GIC_DIST_CTR 0x004 +#define GIC_DIST_IIDR 0x008 #define GIC_DIST_IGROUP 0x080 #define GIC_DIST_ENABLE_SET 0x100 #define GIC_DIST_ENABLE_CLEAR 0x180 diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index a3e31a93a3fb..d812c933708a 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -20,9 +20,53 @@ #include "vgic.h" #include "vgic-mmio.h" +static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 value; + + switch (addr & 0x0c) { + case GIC_DIST_CTRL: + value = vcpu->kvm->arch.vgic.enabled ? GICD_ENABLE : 0; + break; + case GIC_DIST_CTR: + value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + value = (value >> 5) - 1; + value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; + break; + case GIC_DIST_IIDR: + value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); + break; + default: + return 0; + } + + return value; +} + +static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + bool was_enabled = dist->enabled; + + switch (addr & 0x0c) { + case GIC_DIST_CTRL: + dist->enabled = val & GICD_ENABLE; + if (!was_enabled && dist->enabled) + vgic_kick_vcpus(vcpu->kvm); + break; + case GIC_DIST_CTR: + case GIC_DIST_IIDR: + /* Nothing to do */ + return; + } +} + static const struct vgic_register_region vgic_v2_dist_registers[] = { REGISTER_DESC_WITH_LENGTH(GIC_DIST_CTRL, - vgic_mmio_read_raz, vgic_mmio_write_wi, 12, + vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP, vgic_mmio_read_rao, vgic_mmio_write_wi, 1, diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 7e010087224c..12ae84b4931f 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -545,3 +545,18 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) return pending; } + +void vgic_kick_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int c; + + /* + * We've injected an interrupt, time to find out who deserves + * a good kick... 
+ */ + kvm_for_each_vcpu(c, vcpu, kvm) { + if (kvm_vgic_vcpu_pending_irq(vcpu)) + kvm_vcpu_kick(vcpu); + } +} diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index fd9acaa1e305..cf620157e1e4 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -16,11 +16,15 @@ #ifndef __KVM_ARM_VGIC_NEW_H__ #define __KVM_ARM_VGIC_NEW_H__ +#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ +#define IMPLEMENTER_ARM 0x43b + #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq); +void vgic_kick_vcpus(struct kvm *kvm); void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From a118084432d642eeccb961c7c8cc61525a941fcb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 20 May 2016 22:13:45 +0200 Subject: vfs: add d_real_inode() helper Needed by the following fix. Signed-off-by: Miklos Szeredi Cc: --- include/linux/dcache.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 7e9422cb5989..ad5d582f9b14 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -576,5 +576,17 @@ static inline struct inode *vfs_select_inode(struct dentry *dentry, return inode; } +/** + * d_real_inode - Return the real inode + * @dentry: The dentry to query + * + * If dentry is on a union/overlay, then return the underlying, real inode. + * Otherwise return d_inode(). + */ +static inline struct inode *d_real_inode(struct dentry *dentry) +{ + return d_backing_inode(d_real(dentry)); +} + #endif /* __LINUX_DCACHE_H */ -- cgit v1.2.3 From 7e13318daa4a67bff2f800923a993ef3818b3c53 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 18 May 2016 09:06:10 -0700 Subject: net: define gso types for IPx over IPv4 and IPv6 This patch defines two new GSO definitions SKB_GSO_IPXIP4 and SKB_GSO_IPXIP6 along with corresponding NETIF_F_GSO_IPXIP4 and NETIF_F_GSO_IPXIP6. These are used to describe IP-in-IP tunnels and what the outer protocol is. The inner protocol can be deduced from other GSO types (e.g. SKB_GSO_TCPV4 and SKB_GSO_TCPV6). The GSO types of SKB_GSO_IPIP and SKB_GSO_SIT are removed (these are both instances of SKB_GSO_IPXIP4). SKB_GSO_IPXIP6 will be used when support for GSO with IP encapsulation over IPv6 is added. Signed-off-by: Tom Herbert Acked-by: Jeff Kirsher Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 5 ++--- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++--- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +-- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 3 +-- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 3 +-- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 3 +-- drivers/net/ethernet/intel/igb/igb_main.c | 3 +-- drivers/net/ethernet/intel/igbvf/netdev.c | 3 +-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 +-- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 3 +-- include/linux/netdev_features.h | 12 ++++++------ include/linux/netdevice.h | 4 ++-- include/linux/skbuff.h | 4 ++-- net/core/ethtool.c | 4 ++-- net/ipv4/af_inet.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv6/ip6_offload.c | 4 ++-- net/ipv6/sit.c | 4 ++-- net/netfilter/ipvs/ip_vs_xmit.c | 17 +++++++---------- 19 files changed, 37 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index d465bd721146..0a5b770cefaa 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -13259,12 +13259,11 @@ static int bnx2x_init_dev(struct bnx2x *bp, struct pci_dev *pdev, NETIF_F_RXHASH | NETIF_F_HW_VLAN_CTAG_TX; if (!chip_is_e1x) { dev->hw_features |= NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL | - NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT; + NETIF_F_GSO_IPXIP4; dev->hw_enc_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6 | - NETIF_F_GSO_IPIP | - NETIF_F_GSO_SIT | + NETIF_F_GSO_IPXIP4 | NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 5a0dca3e6ef6..72a2efff8e49 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6311,7 +6311,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE | - NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT | + NETIF_F_GSO_IPXIP4 | NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM | NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH | NETIF_F_RXCSUM | NETIF_F_LRO | NETIF_F_GRO; @@ -6321,8 +6321,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM | - NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT | - NETIF_F_GSO_PARTIAL; + NETIF_F_GSO_IPXIP4 | NETIF_F_GSO_PARTIAL; dev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM; dev->vlan_features = dev->hw_features | NETIF_F_HIGHDMA; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1cd0ebf7520a..242a1ff344e6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -9083,8 +9083,7 @@ static int i40e_config_netdev(struct i40e_vsi *vsi) NETIF_F_TSO6 | NETIF_F_GSO_GRE | NETIF_F_GSO_GRE_CSUM | - NETIF_F_GSO_IPIP | - NETIF_F_GSO_SIT | + NETIF_F_GSO_IPXIP4 | NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_PARTIAL | diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 99a524db5560..0a8122c00ae2 100644 --- 
a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2284,8 +2284,7 @@ static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, u64 *cd_type_cmd_tso_mss) if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_SIT | + SKB_GSO_IPXIP4 | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM)) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) && diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index fd7dae46c5d8..2bbbbd0f9f15 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -1559,8 +1559,7 @@ static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, u64 *cd_type_cmd_tso_mss) if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_SIT | + SKB_GSO_IPXIP4 | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM)) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) && diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 642bb45ed906..02d0a1ca6960 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -2230,8 +2230,7 @@ int i40evf_process_config(struct i40evf_adapter *adapter) NETIF_F_TSO6 | NETIF_F_GSO_GRE | NETIF_F_GSO_GRE_CSUM | - NETIF_F_GSO_IPIP | - NETIF_F_GSO_SIT | + NETIF_F_GSO_IPXIP4 | NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_PARTIAL | diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 21727692bef6..b1a5cdb77088 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2418,8 +2418,7 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) #define IGB_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \ NETIF_F_GSO_GRE_CSUM | \ - NETIF_F_GSO_IPIP | \ - NETIF_F_GSO_SIT | \ + NETIF_F_GSO_IPXIP4 | \ NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c index 322a2d7828a5..79b907f1a520 100644 --- a/drivers/net/ethernet/intel/igbvf/netdev.c +++ b/drivers/net/ethernet/intel/igbvf/netdev.c @@ -2763,8 +2763,7 @@ static int igbvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) #define IGBVF_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \ NETIF_F_GSO_GRE_CSUM | \ - NETIF_F_GSO_IPIP | \ - NETIF_F_GSO_SIT | \ + NETIF_F_GSO_IPXIP4 | \ NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9f3677c7e96f..69452c379cbc 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9482,8 +9482,7 @@ skip_sriov: #define IXGBE_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \ NETIF_F_GSO_GRE_CSUM | \ - NETIF_F_GSO_IPIP | \ - NETIF_F_GSO_SIT | \ + NETIF_F_GSO_IPXIP4 | \ NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 5e348b125090..d86e51116384 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4062,8 +4062,7 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) #define IXGBEVF_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \ NETIF_F_GSO_GRE_CSUM | \ - 
NETIF_F_GSO_IPIP | \ - NETIF_F_GSO_SIT | \ + NETIF_F_GSO_IPXIP4 | \ NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index bc8736266749..aa7b2400f98c 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -44,8 +44,8 @@ enum { NETIF_F_FSO_BIT, /* ... FCoE segmentation */ NETIF_F_GSO_GRE_BIT, /* ... GRE with TSO */ NETIF_F_GSO_GRE_CSUM_BIT, /* ... GRE with csum with TSO */ - NETIF_F_GSO_IPIP_BIT, /* ... IPIP tunnel with TSO */ - NETIF_F_GSO_SIT_BIT, /* ... SIT tunnel with TSO */ + NETIF_F_GSO_IPXIP4_BIT, /* ... IP4 or IP6 over IP4 with TSO */ + NETIF_F_GSO_IPXIP6_BIT, /* ... IP4 or IP6 over IP6 with TSO */ NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */ NETIF_F_GSO_PARTIAL_BIT, /* ... Only segment inner-most L4 @@ -121,8 +121,8 @@ enum { #define NETIF_F_RXALL __NETIF_F(RXALL) #define NETIF_F_GSO_GRE __NETIF_F(GSO_GRE) #define NETIF_F_GSO_GRE_CSUM __NETIF_F(GSO_GRE_CSUM) -#define NETIF_F_GSO_IPIP __NETIF_F(GSO_IPIP) -#define NETIF_F_GSO_SIT __NETIF_F(GSO_SIT) +#define NETIF_F_GSO_IPXIP4 __NETIF_F(GSO_IPXIP4) +#define NETIF_F_GSO_IPXIP6 __NETIF_F(GSO_IPXIP6) #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM) #define NETIF_F_TSO_MANGLEID __NETIF_F(TSO_MANGLEID) @@ -200,8 +200,8 @@ enum { #define NETIF_F_GSO_ENCAP_ALL (NETIF_F_GSO_GRE | \ NETIF_F_GSO_GRE_CSUM | \ - NETIF_F_GSO_IPIP | \ - NETIF_F_GSO_SIT | \ + NETIF_F_GSO_IPXIP4 | \ + NETIF_F_GSO_IPXIP6 | \ NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c148edfe4965..f45929ce8157 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4006,8 +4006,8 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT)); - BUILD_BUG_ON(SKB_GSO_IPIP != (NETIF_F_GSO_IPIP >> NETIF_F_GSO_SHIFT)); - BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_IPXIP4 != (NETIF_F_GSO_IPXIP4 >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_IPXIP6 != (NETIF_F_GSO_IPXIP6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT)); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c413c588a24f..65968a97517f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -471,9 +471,9 @@ enum { SKB_GSO_GRE_CSUM = 1 << 8, - SKB_GSO_IPIP = 1 << 9, + SKB_GSO_IPXIP4 = 1 << 9, - SKB_GSO_SIT = 1 << 10, + SKB_GSO_IPXIP6 = 1 << 10, SKB_GSO_UDP_TUNNEL = 1 << 11, diff --git a/net/core/ethtool.c b/net/core/ethtool.c index bdb4013581b1..f4034817d255 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -84,8 +84,8 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", - [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", - [NETIF_F_GSO_SIT_BIT] = 
"tx-sit-segmentation", + [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation", + [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 7f08d4525981..25040b183a60 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1483,7 +1483,7 @@ out_unlock: static int ipip_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; - skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP; + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; return inet_gro_complete(skb, nhoff); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 92827483ee3d..978370132f29 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - if (iptunnel_handle_offloads(skb, SKB_GSO_IPIP)) + if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; skb_set_inner_ipproto(skb, IPPROTO_IPIP); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 9ad743b2c624..787e55f4796c 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -86,7 +86,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); if (skb->encapsulation && - skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) + skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6)) udpfrag = proto == IPPROTO_UDP && encap; else udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; @@ -294,7 +294,7 @@ out_unlock: static int sit_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; - skb_shinfo(skb)->gso_type |= SKB_GSO_SIT; + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; return ipv6_gro_complete(skb, nhoff); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a13d8c114ccb..0a5a255277e5 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -913,7 +913,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto tx_error; } - if (iptunnel_handle_offloads(skb, SKB_GSO_SIT)) { + if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) { ip_rt_put(rt); goto tx_error; } @@ -1000,7 +1000,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; - if (iptunnel_handle_offloads(skb, SKB_GSO_IPIP)) + if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; skb_set_inner_ipproto(skb, IPPROTO_IPIP); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 6d19d2eeaa60..01d3d894de46 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -932,17 +932,14 @@ error: static inline int __tun_gso_type_mask(int encaps_af, int orig_af) { - if (encaps_af == AF_INET) { - if (orig_af == AF_INET) - return SKB_GSO_IPIP; - - return SKB_GSO_SIT; + switch (encaps_af) { + case AF_INET: + return SKB_GSO_IPXIP4; + case AF_INET6: + return SKB_GSO_IPXIP6; + default: + return 0; } - - /* GSO: we need to provide proper SKB_GSO_ value for IPv6: - * SKB_GSO_SIT/IPV6 - */ - return 0; } /* -- cgit v1.2.3 From 95829b3a9c0b1d88778b23bc2afdf5a83de066ff Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 19 May 2016 11:30:54 -0400 Subject: net: suppress warnings on dev_alloc_skb Noticed an allocation failure in a network driver the other day on a 
32-bit system:

DMA-API: debugging out of memory - disabling
bnx2fc: adapter_lookup: hba NULL
lldpad: page allocation failure. order:0, mode:0x4120
Pid: 4556, comm: lldpad Not tainted 2.6.32-639.el6.i686.debug #1
Call Trace:
 [] ? printk+0x19/0x23
 [] ? __alloc_pages_nodemask+0x664/0x830
 [] ? free_object+0x82/0xa0
 [] ? ixgbe_alloc_rx_buffers+0x10b/0x1d0 [ixgbe]
 [] ? ixgbe_configure_rx_ring+0x29f/0x420 [ixgbe]
 [] ? ixgbe_configure_tx_ring+0x15c/0x220 [ixgbe]
 [] ? ixgbe_configure+0x589/0xc00 [ixgbe]
 [] ? ixgbe_open+0xa7/0x5c0 [ixgbe]
 [] ? ixgbe_init_interrupt_scheme+0x5b6/0x970 [ixgbe]
 [] ? ixgbe_setup_tc+0x1a4/0x260 [ixgbe]
 [] ? ixgbe_dcbnl_set_state+0x7f/0x90 [ixgbe]
 [] ? dcb_doit+0x10ed/0x16d0
...

Thought that perhaps the big splat in the logs wasn't really necessary, as all call sites for dev_alloc_skb: a) check the return code for the function and b) either print their own error message or have a recovery path that makes the warning moot. Fix it by modifying dev_alloc_pages to pass __GFP_NOWARN as a gfp flag to suppress the warning. Applies to the net tree. Signed-off-by: Neil Horman CC: "David S. Miller" CC: Eric Dumazet CC: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 65968a97517f..ee38a4127475 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2467,7 +2467,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, static inline struct page *dev_alloc_pages(unsigned int order) { - return __dev_alloc_pages(GFP_ATOMIC, order); + return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order); } /** @@ -2485,7 +2485,7 @@ static inline struct page *__dev_alloc_page(gfp_t gfp_mask) static inline struct page *dev_alloc_page(void) { - return __dev_alloc_page(GFP_ATOMIC); + return dev_alloc_pages(0); } /** -- cgit v1.2.3 From 59dc76b0d4dfdd7dc46a1010e4afb44f60f3e97f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 20 May 2016 16:56:31 -0700 Subject: mm: vmscan: reduce size of inactive file list The inactive file list should still be large enough to contain readahead windows and freshly written file data, but it no longer is the only source for detecting multiple accesses to file pages. The workingset refault measurement code causes recently evicted file pages that get accessed again after a shorter interval to be promoted directly to the active list. With that mechanism in place, we can afford to (on a larger system) dedicate more memory to the active file list, so we can actually cache more of the frequently used file pages in memory, and not have them pushed out by streaming writes, once-used streaming file reads, etc. This can help things like database workloads, where only half the page cache can currently be used to cache the database working set. This patch automatically increases that fraction on larger systems, using the same ratio that has already been used for anonymous memory.
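The sizing rule itself is compact. Condensed from the inactive_list_is_low() hunk below (minus the swap-space check), the target ratio grows with the square root of the LRU size, so a 1GB list keeps about a quarter of its pages inactive while a 100GB list keeps about 3%:

/* Condensed sketch of the shared ratio; see the vmscan.c hunk below. */
static bool list_is_low(unsigned long inactive, unsigned long active)
{
	unsigned long gb = (inactive + active) >> (30 - PAGE_SHIFT);
	unsigned long inactive_ratio = gb ? int_sqrt(10 * gb) : 1;

	/* e.g. 1GB -> 3, 10GB -> 10, 100GB -> 31, 1TB -> 101 */
	return inactive * inactive_ratio < active;
}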
[hannes@cmpxchg.org: cgroup-awareness] Signed-off-by: Rik van Riel Signed-off-by: Johannes Weiner Reported-by: Andres Freund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 25 ----------- mm/page_alloc.c | 44 ------------------- mm/vmscan.c | 104 ++++++++++++++++++--------------------------- 3 files changed, 42 insertions(+), 131 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 94da96738df3..a805474df4ab 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -415,25 +415,6 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) -{ - unsigned long inactive_ratio; - unsigned long inactive; - unsigned long active; - unsigned long gb; - - inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); - active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); - - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); - else - inactive_ratio = 1; - - return inactive * inactive_ratio < active; -} - void mem_cgroup_handle_over_high(void); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, @@ -646,12 +627,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return true; } -static inline bool -mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) -{ - return true; -} - static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5c469c1dfb8b..edbdf56b3c9b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6670,49 +6670,6 @@ void setup_per_zone_wmarks(void) mutex_unlock(&zonelists_mutex); } -/* - * The inactive anon list should be small enough that the VM never has to - * do too much work, but large enough that each inactive page has a chance - * to be referenced again before it is swapped out. - * - * The inactive_anon ratio is the target ratio of ACTIVE_ANON to - * INACTIVE_ANON pages on this zone's LRU, maintained by the - * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of - * the anonymous pages are kept on the inactive list. - * - * total target max - * memory ratio inactive anon - * ------------------------------------- - * 10MB 1 5MB - * 100MB 1 50MB - * 1GB 3 250MB - * 10GB 10 0.9GB - * 100GB 31 3GB - * 1TB 101 10GB - * 10TB 320 32GB - */ -static void __meminit calculate_zone_inactive_ratio(struct zone *zone) -{ - unsigned int gb, ratio; - - /* Zone size in gigabytes */ - gb = zone->managed_pages >> (30 - PAGE_SHIFT); - if (gb) - ratio = int_sqrt(10 * gb); - else - ratio = 1; - - zone->inactive_ratio = ratio; -} - -static void __meminit setup_per_zone_inactive_ratio(void) -{ - struct zone *zone; - - for_each_zone(zone) - calculate_zone_inactive_ratio(zone); -} - /* * Initialise min_free_kbytes. 
* @@ -6758,7 +6715,6 @@ int __meminit init_per_zone_wmark_min(void) setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); - setup_per_zone_inactive_ratio(); return 0; } core_initcall(init_per_zone_wmark_min) diff --git a/mm/vmscan.c b/mm/vmscan.c index dcfdfc1a0942..38d6d06c955f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1862,83 +1862,63 @@ static void shrink_active_list(unsigned long nr_to_scan, free_hot_cold_page_list(&l_hold, true); } -#ifdef CONFIG_SWAP -static bool inactive_anon_is_low_global(struct zone *zone) -{ - unsigned long active, inactive; - - active = zone_page_state(zone, NR_ACTIVE_ANON); - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - - return inactive * zone->inactive_ratio < active; -} - -/** - * inactive_anon_is_low - check if anonymous pages need to be deactivated - * @lruvec: LRU vector to check +/* + * The inactive anon list should be small enough that the VM never has + * to do too much work. * - * Returns true if the zone does not have enough inactive anon pages, - * meaning some active anon pages need to be deactivated. - */ -static bool inactive_anon_is_low(struct lruvec *lruvec) -{ - /* - * If we don't have swap space, anonymous page deactivation - * is pointless. - */ - if (!total_swap_pages) - return false; - - if (!mem_cgroup_disabled()) - return mem_cgroup_inactive_anon_is_low(lruvec); - - return inactive_anon_is_low_global(lruvec_zone(lruvec)); -} -#else -static inline bool inactive_anon_is_low(struct lruvec *lruvec) -{ - return false; -} -#endif - -/** - * inactive_file_is_low - check if file pages need to be deactivated - * @lruvec: LRU vector to check + * The inactive file list should be small enough to leave most memory + * to the established workingset on the scan-resistant active list, + * but large enough to avoid thrashing the aggregate readahead window. * - * When the system is doing streaming IO, memory pressure here - * ensures that active file pages get deactivated, until more - * than half of the file pages are on the inactive list. + * Both inactive lists should also be large enough that each inactive + * page has a chance to be referenced again before it is reclaimed. * - * Once we get to that situation, protect the system's working - * set from being evicted by disabling active file page aging. + * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages + * on this LRU, maintained by the pageout code. A zone->inactive_ratio + * of 3 means 3:1 or 25% of the pages are kept on the inactive list. * - * This uses a different ratio than the anonymous pages, because - * the page cache uses a use-once replacement algorithm. + * total target max + * memory ratio inactive + * ------------------------------------- + * 10MB 1 5MB + * 100MB 1 50MB + * 1GB 3 250MB + * 10GB 10 0.9GB + * 100GB 31 3GB + * 1TB 101 10GB + * 10TB 320 32GB */ -static bool inactive_file_is_low(struct lruvec *lruvec) +static bool inactive_list_is_low(struct lruvec *lruvec, bool file) { + unsigned long inactive_ratio; unsigned long inactive; unsigned long active; + unsigned long gb; - inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); - active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + /* + * If we don't have swap space, anonymous page deactivation + * is pointless. 
+ */ + if (!file && !total_swap_pages) + return false; - return active > inactive; -} + inactive = lruvec_lru_size(lruvec, file * LRU_FILE); + active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); -static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) -{ - if (is_file_lru(lru)) - return inactive_file_is_low(lruvec); + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); else - return inactive_anon_is_low(lruvec); + inactive_ratio = 1; + + return inactive * inactive_ratio < active; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, lru)) + if (inactive_list_is_low(lruvec, is_file_lru(lru))) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2059,7 +2039,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_file_is_low(lruvec) && + if (!inactive_list_is_low(lruvec, true) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2301,7 +2281,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(lruvec)) + if (inactive_list_is_low(lruvec, false)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -2962,7 +2942,7 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) do { struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); - if (inactive_anon_is_low(lruvec)) + if (inactive_list_is_low(lruvec, false)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); -- cgit v1.2.3 From ea7ab982b6bdb7ce218fd3a7850bb2e2b414fdd0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:38 -0700 Subject: mm, compaction: change COMPACT_ constants into enum Compaction code is doing weird dances between COMPACT_FOO -> int -> unsigned long But there doesn't seem to be any reason for that. All functions which return/use one of those constants are not expecting any other value so it really makes sense to define an enum for them and make it clear that no other values are expected. This is a pure cleanup and shouldn't introduce any functional changes. 
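One concrete benefit of the enum, shown here as a stand-alone user-space sketch (the describe() helper is hypothetical and the tracepoint-only values are omitted): a switch over enum compact_result can be checked for exhaustiveness by the compiler via -Wswitch, which is exactly the "no other values are expected" property an unsigned long return type cannot express.

    #include <stdio.h>

    /* Mirrors the constants introduced by this patch (tracepoint-only
     * values omitted for brevity). */
    enum compact_result {
        COMPACT_DEFERRED,
        COMPACT_SKIPPED,
        COMPACT_CONTINUE,
        COMPACT_PARTIAL,
        COMPACT_COMPLETE,
    };

    /* Hypothetical consumer: with -Wswitch the compiler warns if a new
     * enum value is added but not handled here. */
    static const char *describe(enum compact_result r)
    {
        switch (r) {
        case COMPACT_DEFERRED: return "deferred";
        case COMPACT_SKIPPED:  return "skipped";
        case COMPACT_CONTINUE: return "continue";
        case COMPACT_PARTIAL:  return "partial";
        case COMPACT_COMPLETE: return "complete";
        }
        return "unknown";
    }

    int main(void)
    {
        printf("%s\n", describe(COMPACT_PARTIAL));
        return 0;
    }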
Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 45 +++++++++++++++++++++++++++------------------ mm/compaction.c | 27 ++++++++++++++------------- mm/page_alloc.c | 2 +- 3 files changed, 42 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 242b660f64e6..706cbf00e919 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -2,21 +2,29 @@ #define _LINUX_COMPACTION_H /* Return values for compact_zone() and try_to_compact_pages() */ -/* compaction didn't start as it was deferred due to past failures */ -#define COMPACT_DEFERRED 0 -/* compaction didn't start as it was not possible or direct reclaim was more suitable */ -#define COMPACT_SKIPPED 1 -/* compaction should continue to another pageblock */ -#define COMPACT_CONTINUE 2 -/* direct compaction partially compacted a zone and there are suitable pages */ -#define COMPACT_PARTIAL 3 -/* The full zone was compacted */ -#define COMPACT_COMPLETE 4 -/* For more detailed tracepoint output */ -#define COMPACT_NO_SUITABLE_PAGE 5 -#define COMPACT_NOT_SUITABLE_ZONE 6 -#define COMPACT_CONTENDED 7 /* When adding new states, please adjust include/trace/events/compaction.h */ +enum compact_result { + /* compaction didn't start as it was deferred due to past failures */ + COMPACT_DEFERRED, + /* + * compaction didn't start as it was not possible or direct reclaim + * was more suitable + */ + COMPACT_SKIPPED, + /* compaction should continue to another pageblock */ + COMPACT_CONTINUE, + /* + * direct compaction partially compacted a zone and there are suitable + * pages + */ + COMPACT_PARTIAL, + /* The full zone was compacted */ + COMPACT_COMPLETE, + /* For more detailed tracepoint output */ + COMPACT_NO_SUITABLE_PAGE, + COMPACT_NOT_SUITABLE_ZONE, + COMPACT_CONTENDED, +}; /* Used to signal whether compaction detected need_sched() or lock contention */ /* No contention detected */ @@ -38,12 +46,13 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); -extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, +extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, + unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); -extern unsigned long compaction_suitable(struct zone *zone, int order, +extern enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx); extern void defer_compaction(struct zone *zone, int order); @@ -57,7 +66,7 @@ extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); #else -static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, +static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended) @@ -73,7 +82,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) { } -static inline unsigned long compaction_suitable(struct zone *zone, int 
order, +static inline enum compact_result compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx) { return COMPACT_SKIPPED; diff --git a/mm/compaction.c b/mm/compaction.c index eda3c2244f30..e721d252c5d2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1229,7 +1229,7 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static int __compact_finished(struct zone *zone, struct compact_control *cc, +static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { unsigned int order; @@ -1292,8 +1292,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_NO_SUITABLE_PAGE; } -static int compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result compact_finished(struct zone *zone, + struct compact_control *cc, + const int migratetype) { int ret; @@ -1312,7 +1313,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -static unsigned long __compaction_suitable(struct zone *zone, int order, +static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx) { @@ -1358,11 +1359,11 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, return COMPACT_CONTINUE; } -unsigned long compaction_suitable(struct zone *zone, int order, +enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx) { - unsigned long ret; + enum compact_result ret; ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); trace_mm_compaction_suitable(zone, order, ret); @@ -1372,9 +1373,9 @@ unsigned long compaction_suitable(struct zone *zone, int order, return ret; } -static int compact_zone(struct zone *zone, struct compact_control *cc) +static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) { - int ret; + enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); @@ -1530,11 +1531,11 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, int order, +static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, unsigned int alloc_flags, int classzone_idx) { - unsigned long ret; + enum compact_result ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -1572,7 +1573,7 @@ int sysctl_extfrag_threshold = 500; * * This is the main entry point for direct page compaction. 
*/ -unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, +enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended) { @@ -1580,7 +1581,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - int rc = COMPACT_DEFERRED; + enum compact_result rc = COMPACT_DEFERRED; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1594,7 +1595,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { - int status; + enum compact_result status; int zone_contended; if (compaction_deferred(zone, order)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index edbdf56b3c9b..ed62c4b90598 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3188,7 +3188,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { - unsigned long compact_result; + enum compact_result compact_result; struct page *page; if (!order) -- cgit v1.2.3 From 1d4746d395975e0ff5103e20ab169d1a95b4ef9e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:44 -0700 Subject: mm, compaction: distinguish COMPACT_DEFERRED from COMPACT_SKIPPED try_to_compact_pages() can currently return COMPACT_SKIPPED even when the compaction is deferred for some zone just because zone DMA is skipped in 99% of cases due to watermark checks. This makes COMPACT_DEFERRED basically unusable for the page allocator as a feedback mechanism. Make sure we distinguish those two states properly and switch their ordering in the enum. This means that COMPACT_SKIPPED is returned only when all eligible zones are skipped. As a result COMPACT_DEFERRED handling for THP in __alloc_pages_slowpath will be more precise and we would bail out rather than reclaim.
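The ordering matters because try_to_compact_pages() folds the per-zone results into a single return value by taking a maximum (see the max_t() call in the hunk below). A self-contained user-space sketch of that folding, with made-up zone results, assuming the post-patch value ordering:

    #include <stdio.h>

    /* Post-patch ordering: SKIPPED < DEFERRED, so SKIPPED is reported
     * only when every zone was skipped; one deferred zone is enough to
     * report DEFERRED instead. */
    enum compact_result { COMPACT_SKIPPED, COMPACT_DEFERRED, COMPACT_CONTINUE };

    static enum compact_result fold(const enum compact_result *zone, int n)
    {
        enum compact_result rc = COMPACT_SKIPPED;
        int i;

        for (i = 0; i < n; i++)
            if (zone[i] > rc)   /* same effect as the kernel's max_t() */
                rc = zone[i];
        return rc;
    }

    int main(void)
    {
        enum compact_result zones[] = { COMPACT_SKIPPED, COMPACT_DEFERRED };

        printf("%d\n", fold(zones, 2));  /* 1, i.e. COMPACT_DEFERRED wins */
        return 0;
    }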
Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 7 +++++-- include/trace/events/compaction.h | 2 +- mm/compaction.c | 8 +++++--- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 706cbf00e919..11f228712ed5 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -4,13 +4,16 @@ /* Return values for compact_zone() and try_to_compact_pages() */ /* When adding new states, please adjust include/trace/events/compaction.h */ enum compact_result { - /* compaction didn't start as it was deferred due to past failures */ - COMPACT_DEFERRED, /* * compaction didn't start as it was not possible or direct reclaim * was more suitable */ COMPACT_SKIPPED, + /* compaction didn't start as it was deferred due to past failures */ + COMPACT_DEFERRED, + /* compaction not active last round */ + COMPACT_INACTIVE = COMPACT_DEFERRED, + /* compaction should continue to another pageblock */ COMPACT_CONTINUE, /* diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index e215bf68f521..6ba16c86d7db 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -10,8 +10,8 @@ #include #define COMPACTION_STATUS \ - EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_SKIPPED, "skipped") \ + EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_CONTINUE, "continue") \ EM( COMPACT_PARTIAL, "partial") \ EM( COMPACT_COMPLETE, "complete") \ diff --git a/mm/compaction.c b/mm/compaction.c index 455ecd87f48d..b2b94474dd28 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1578,7 +1578,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - enum compact_result rc = COMPACT_DEFERRED; + enum compact_result rc = COMPACT_SKIPPED; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1595,8 +1595,10 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, enum compact_result status; int zone_contended; - if (compaction_deferred(zone, order)) + if (compaction_deferred(zone, order)) { + rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; + } status = compact_zone_order(zone, order, gfp_mask, mode, &zone_contended, alloc_flags, @@ -1667,7 +1669,7 @@ break_loop: * If at least one zone wasn't deferred or skipped, we report if all * zones that were tried were lock contended. */ - if (rc > COMPACT_SKIPPED && all_zones_contended) + if (rc > COMPACT_INACTIVE && all_zones_contended) *contended = COMPACT_CONTENDED_LOCK; return rc; -- cgit v1.2.3 From c8f7de0bfae36e8532e5e25a39d15407f02aca78 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:47 -0700 Subject: mm, compaction: distinguish between full and partial COMPACT_COMPLETE COMPACT_COMPLETE now means that compaction and free scanner met. This is not very useful information if somebody just wants to use this feedback and make any decisions based on that. The current caller might be a poor guy who just happened to scan tiny portion of the zone and that could be the reason no suitable pages were compacted. Make sure we distinguish the full and partial zone walks. 
Consumers should treat COMPACT_PARTIAL_SKIPPED as a potential success and be optimistic in retrying. The existing users of COMPACT_COMPLETE are conservatively changed to use COMPACT_PARTIAL_SKIPPED as well, but some of them should probably be reconsidered so that they defer the compaction only for COMPACT_COMPLETE under the new semantics. This patch shouldn't introduce any functional changes. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 10 +++++++++- include/trace/events/compaction.h | 1 + mm/compaction.c | 14 +++++++++++--- mm/internal.h | 1 + 4 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 11f228712ed5..9b37f9d3f7a8 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,7 +21,15 @@ enum compact_result { * pages */ COMPACT_PARTIAL, - /* The full zone was compacted */ + /* + * direct compaction has scanned part of the zone but wasn't successfull + * to compact suitable pages. + */ + COMPACT_PARTIAL_SKIPPED, + /* + * The full zone was compacted scanned but wasn't successfull to compact + * suitable pages. + */ COMPACT_COMPLETE, /* For more detailed tracepoint output */ COMPACT_NO_SUITABLE_PAGE, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 6ba16c86d7db..36e2d6fb1360 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -14,6 +14,7 @@ EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_CONTINUE, "continue") \ EM( COMPACT_PARTIAL, "partial") \ + EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \ EM( COMPACT_COMPLETE, "complete") \ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ diff --git a/mm/compaction.c b/mm/compaction.c index b2b94474dd28..4af1577adb5c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1252,7 +1252,10 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ if (cc->direct_compaction) zone->compact_blockskip_flush = true; - return COMPACT_COMPLETE; + if (cc->whole_zone) + return COMPACT_COMPLETE; + else + return COMPACT_PARTIAL_SKIPPED; } if (is_via_compact_memory(cc->order)) @@ -1413,6 +1416,10 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } + + if (cc->migrate_pfn == start_pfn) + cc->whole_zone = true; + cc->last_migrated_pfn = 0; trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, @@ -1634,7 +1641,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, goto break_loop; } - if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { + if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE || + status == COMPACT_PARTIAL_SKIPPED)) { /* * We think that allocation won't succeed in this zone * so we defer compaction there.
If it ends up @@ -1881,7 +1889,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.classzone_idx, 0)) { success = true; compaction_defer_reset(zone, cc.order, false); - } else if (status == COMPACT_COMPLETE) { + } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { /* * We use sync migration mode here, so we defer like * sync direct compaction does. diff --git a/mm/internal.h b/mm/internal.h index 3ac544f1963f..f6f3353b0868 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -174,6 +174,7 @@ struct compact_control { enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool direct_compaction; /* False from kcompactd or /proc/... */ + bool whole_zone; /* Whole zone has been scanned */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ -- cgit v1.2.3 From 4f9a358c36fcdad3ea1db263ec4d484a70ad543e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:50 -0700 Subject: mm, compaction: update compaction_result ordering compaction_result will be used as the primary feedback channel for compaction users. At the same time try_to_compact_pages (and potentially others) assume a certain ordering where a more specific feedback takes precedence. This gets a bit awkward when we have conflicting feedback from different zones. E.g. one zone returns COMPACT_COMPLETE, meaning the full zone has been scanned without any outcome, while another returns COMPACT_PARTIAL, i.e. it made some progress. The caller should get COMPACT_PARTIAL because that means that the compaction can still make some progress. The same applies for COMPACT_PARTIAL vs COMPACT_PARTIAL_SKIPPED. Reorder PARTIAL to be the largest value, so that the larger the value, the more progress has been made. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 9b37f9d3f7a8..ff39fa0a1ede 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -4,6 +4,8 @@ /* Return values for compact_zone() and try_to_compact_pages() */ /* When adding new states, please adjust include/trace/events/compaction.h */ enum compact_result { + /* For more detailed tracepoint output - internal to compaction */ + COMPACT_NOT_SUITABLE_ZONE, /* * compaction didn't start as it was not possible or direct reclaim * was more suitable @@ -11,30 +13,34 @@ enum compact_result { COMPACT_SKIPPED, /* compaction didn't start as it was deferred due to past failures */ COMPACT_DEFERRED, + /* compaction not active last round */ COMPACT_INACTIVE = COMPACT_DEFERRED, + /* For more detailed tracepoint output - internal to compaction */ + COMPACT_NO_SUITABLE_PAGE, /* compaction should continue to another pageblock */ COMPACT_CONTINUE, + /* - * direct compaction partially compacted a zone and there are suitable - * pages + * The full zone was compacted scanned but wasn't successfull to compact + * suitable pages. */ - COMPACT_PARTIAL, + COMPACT_COMPLETE, /* * direct compaction has scanned part of the zone but wasn't successfull * to compact suitable pages.
*/ COMPACT_PARTIAL_SKIPPED, + + /* compaction terminated prematurely due to lock contentions */ + COMPACT_CONTENDED, + /* - * The full zone was compacted scanned but wasn't successfull to compact - * suitable pages. + * direct compaction partially compacted a zone and there might be + * suitable pages */ - COMPACT_COMPLETE, - /* For more detailed tracepoint output */ - COMPACT_NO_SUITABLE_PAGE, - COMPACT_NOT_SUITABLE_ZONE, - COMPACT_CONTENDED, + COMPACT_PARTIAL, }; /* Used to signal whether compaction detected need_sched() or lock contention */ -- cgit v1.2.3 From cab1802b5f0dddea30547a7451fda8c7e4c593f0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:56 -0700 Subject: mm, compaction: abstract compaction feedback to helpers Compaction can provide a wild variation of feedback to the caller. Many of them are implementation specific and the caller of the compaction (especially the page allocator) shouldn't be bound to specifics of the current implementation. This patch abstracts the feedback into three basic types: - compaction_made_progress - compaction was active and made some progress. - compaction_failed - compaction failed and further attempts to invoke it would most probably fail and therefore it is not worth retrying - compaction_withdrawn - compaction wasn't invoked for an implementation specific reasons. In the current implementation it means that the compaction was deferred, contended or the page scanners met too early without any progress. Retrying is still worthwhile. [vbabka@suse.cz: do not change thp back off behavior] [akpm@linux-foundation.org: fix typo in comment, per Hillf] Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 79 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index ff39fa0a1ede..8d8c916fe67a 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -78,6 +78,70 @@ extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); extern bool compaction_restarting(struct zone *zone, int order); +/* Compaction has made some progress and retrying makes sense */ +static inline bool compaction_made_progress(enum compact_result result) +{ + /* + * Even though this might sound confusing this in fact tells us + * that the compaction successfully isolated and migrated some + * pageblocks. + */ + if (result == COMPACT_PARTIAL) + return true; + + return false; +} + +/* Compaction has failed and it doesn't make much sense to keep retrying. */ +static inline bool compaction_failed(enum compact_result result) +{ + /* All zones were scanned completely and still not result. */ + if (result == COMPACT_COMPLETE) + return true; + + return false; +} + +/* + * Compaction has backed off for some reason. It might be throttling or + * lock contention. Retrying is still worthwhile. + */ +static inline bool compaction_withdrawn(enum compact_result result) +{ + /* + * Compaction backed off due to watermark checks for order-0 + * so the regular reclaim has to try harder and reclaim something. + */ + if (result == COMPACT_SKIPPED) + return true; + + /* + * If compaction is deferred for high-order allocations, it is + * because sync compaction recently failed. 
If this is the case + * and the caller requested a THP allocation, we do not want + * to heavily disrupt the system, so we fail the allocation + * instead of entering direct reclaim. + */ + if (result == COMPACT_DEFERRED) + return true; + + /* + * If compaction in async mode encounters contention or blocks higher + * priority task we back off early rather than cause stalls. + */ + if (result == COMPACT_CONTENDED) + return true; + + /* + * Page scanners have met but we haven't scanned full zones so this + * is a back off in fact. + */ + if (result == COMPACT_PARTIAL_SKIPPED) + return true; + + return false; +} + extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); @@ -114,6 +178,21 @@ static inline bool compaction_deferred(struct zone *zone, int order) return true; } +static inline bool compaction_made_progress(enum compact_result result) +{ + return false; +} + +static inline bool compaction_failed(enum compact_result result) +{ + return false; +} + +static inline bool compaction_withdrawn(enum compact_result result) +{ + return true; +} + static inline int kcompactd_run(int nid) { return 0; -- cgit v1.2.3 From 0a0337e0d1d134465778a16f5cbea95086e8e9e0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:00 -0700 Subject: mm, oom: rework oom detection __alloc_pages_slowpath has traditionally relied on the direct reclaim and did_some_progress as an indicator that it makes sense to retry allocation rather than declaring OOM. shrink_zones had to rely on zone_reclaimable if shrink_zone didn't make any progress to prevent from a premature OOM killer invocation - the LRU might be full of dirty or writeback pages and direct reclaim cannot clean those up. zone_reclaimable allows to rescan the reclaimable lists several times and restart if a page is freed. This is really subtle behavior and it might lead to a livelock when a single freed page keeps allocator looping but the current task will not be able to allocate that single page. OOM killer would be more appropriate than looping without any progress for unbounded amount of time. This patch changes OOM detection logic and pulls it out from shrink_zone which is too low to be appropriate for any high level decisions such as OOM which is per zonelist property. It is __alloc_pages_slowpath which knows how many attempts have been done and what was the progress so far therefore it is more appropriate to implement this logic. The new heuristic is implemented in should_reclaim_retry helper called from __alloc_pages_slowpath. It tries to be more deterministic and easier to follow. It builds on an assumption that retrying makes sense only if the currently reclaimable memory + free pages would allow the current allocation request to succeed (as per __zone_watermark_ok) at least for one zone in the usable zonelist. This alone wouldn't be sufficient, though, because the writeback might get stuck and reclaimable pages might be pinned for a really long time or even depend on the current allocation context. Therefore there is a backoff mechanism implemented which reduces the reclaim target after each reclaim round without any progress. This means that we should eventually converge to only NR_FREE_PAGES as the target and fail on the wmark check and proceed to OOM. The backoff is simple and linear with 1/16 of the reclaimable pages for each round without any progress. We are optimistic and reset counter for successful reclaim rounds. 
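The backoff arithmetic can be seen in isolation with a short user-space sketch (illustrative only, not kernel code; the zone size is an invented figure): the target shrinks by no_progress_loops/MAX_RECLAIM_RETRIES of the reclaimable pages each round and reaches zero, i.e. the bare free pages, after 16 fruitless rounds.

    #include <stdio.h>

    #define MAX_RECLAIM_RETRIES 16

    int main(void)
    {
        unsigned long reclaimable = 1048576;  /* 4GB of 4KB pages, made up */
        int loops;

        for (loops = 0; loops <= MAX_RECLAIM_RETRIES; loops++) {
            unsigned long available = reclaimable;

            /* The same backoff as should_reclaim_retry() below:
             * subtract no_progress_loops/MAX_RECLAIM_RETRIES of the
             * reclaimable pages, rounded up (DIV_ROUND_UP). */
            available -= (loops * available + MAX_RECLAIM_RETRIES - 1) /
                         MAX_RECLAIM_RETRIES;
            printf("round %2d: target %lu pages\n", loops, available);
        }
        return 0;
    }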
Costly high order pages mostly preserve their semantic and those without __GFP_REPEAT fail right away while those which have the flag set will back off after the amount of reclaimable pages reaches equivalent of the requested order. The only difference is that if there was no progress during the reclaim we rely on zone watermark check. This is a more logical thing to do than the previous 1<<order retry limit. Signed-off-by: Michal Hocko Acked-by: Hillf Danton Cc: Vladimir Davydov Cc: Johannes Weiner Cc: David Rientjes Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/page_alloc.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++----- mm/vmscan.c | 25 +++---------- 3 files changed, 97 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index ad220359f1b0..0af2bb2028fd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -316,6 +316,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, struct vm_area_struct *vma); /* linux/mm/vmscan.c */ +extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8bcc10616fab..fa39efc3a692 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3386,6 +3386,77 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask) return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; } +/* + * Maximum number of reclaim retries without any progress before OOM killer + * is consider as the only way to move forward. + */ +#define MAX_RECLAIM_RETRIES 16 + +/* + * Checks whether it makes sense to retry the reclaim to make a forward progress + * for the given allocation request. + * The reclaim feedback represented by did_some_progress (any progress during + * the last reclaim round), pages_reclaimed (cumulative number of reclaimed + * pages) and no_progress_loops (number of reclaim rounds without any progress + * in a row) is considered as well as the reclaimable pages on the applicable + * zone list (with a backoff mechanism which is a function of no_progress_loops). + * + * Returns true if a retry is viable or false to enter the oom path. + */ +static inline bool +should_reclaim_retry(gfp_t gfp_mask, unsigned order, + struct alloc_context *ac, int alloc_flags, + bool did_some_progress, unsigned long pages_reclaimed, + int no_progress_loops) +{ + struct zone *zone; + struct zoneref *z; + + /* + * Make sure we converge to OOM if we cannot make any progress + * several times in the row. + */ + if (no_progress_loops > MAX_RECLAIM_RETRIES) + return false; + + if (order > PAGE_ALLOC_COSTLY_ORDER) { + if (pages_reclaimed >= (1<<order)) + return false; + + if (did_some_progress) + return true; + } + + /* + * Keep reclaiming pages while there is a chance this will lead somewhere. + * If none of the target zones can satisfy our allocation request even + * if all reclaimable pages are considered then we are screwed and have + * to go OOM. + */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + + available = zone_reclaimable_pages(zone); + available -= DIV_ROUND_UP(no_progress_loops * available, + MAX_RECLAIM_RETRIES); + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + + /* + * Would the allocation succeed if we reclaimed the whole + * available?
+ */ + if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), + ac->high_zoneidx, alloc_flags, available)) { + /* Wait for some write requests to complete then retry */ + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50); + return true; + } + } + + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -3397,6 +3468,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; enum compact_result compact_result; + int no_progress_loops = 0; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3525,23 +3597,35 @@ retry: if (gfp_mask & __GFP_NORETRY) goto noretry; - /* Keep reclaiming pages as long as there is reasonable progress */ - pages_reclaimed += did_some_progress; - if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || - ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { - /* Wait for some write requests to complete then retry */ - wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50); - goto retry; + /* + * Do not retry costly high order allocations unless they are + * __GFP_REPEAT + */ + if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + goto noretry; + + if (did_some_progress) { + no_progress_loops = 0; + pages_reclaimed += did_some_progress; + } else { + no_progress_loops++; } + if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, + did_some_progress > 0, pages_reclaimed, + no_progress_loops)) + goto retry; + /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); if (page) goto got_pg; /* Retry as long as the OOM killer is making progress */ - if (did_some_progress) + if (did_some_progress) { + no_progress_loops = 0; goto retry; + } noretry: /* diff --git a/mm/vmscan.c b/mm/vmscan.c index a386454c015a..c4a2f4512fca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -191,7 +191,7 @@ static bool sane_reclaim(struct scan_control *sc) } #endif -static unsigned long zone_reclaimable_pages(struct zone *zone) +unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; @@ -2507,10 +2507,8 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. - * - * Returns true if a zone was reclaimable. 
*/ -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; @@ -2518,7 +2516,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_scanned; gfp_t orig_mask; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); - bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2583,17 +2580,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; - if (nr_soft_reclaimed) - reclaimable = true; /* need some check for avoid more shrink_zone() */ } - if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) - reclaimable = true; - - if (global_reclaim(sc) && - !reclaimable && zone_reclaimable(zone)) - reclaimable = true; + shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); } /* @@ -2601,8 +2591,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; - - return reclaimable; } /* @@ -2627,7 +2615,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, int initial_priority = sc->priority; unsigned long total_scanned = 0; unsigned long writeback_threshold; - bool zones_reclaimable; retry: delayacct_freepages_start(); @@ -2638,7 +2625,7 @@ retry: vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - zones_reclaimable = shrink_zones(zonelist, sc); + shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) @@ -2685,10 +2672,6 @@ retry: goto retry; } - /* Any of the zones still reclaimable? Don't OOM. */ - if (zones_reclaimable) - return 1; - return 0; } -- cgit v1.2.3 From 86a294a81f93d6f36d00ec3ff779d36d218f852d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:12 -0700 Subject: mm, oom, compaction: prevent from should_compact_retry looping for ever for costly orders "mm: consider compaction feedback also for costly allocation" has removed the upper bound for the reclaim/compaction retries based on the number of reclaimed pages for costly orders. While this is desirable the patch did miss a mis interaction between reclaim, compaction and the retry logic. 
The direct reclaim tries to get zones over min watermark while compaction backs off and returns COMPACT_SKIPPED when all zones are below low watermark + 1<<order. Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 ++++ include/linux/mmzone.h | 3 +++ mm/compaction.c | 42 +++++++++++++++++++++++++++++++++++++++--- mm/page_alloc.c | 23 +++++++++++++---------- 4 files changed, 59 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 8d8c916fe67a..a58c852a268f 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -142,6 +142,10 @@ static inline bool compaction_withdrawn(enum compact_result result) return false; } + +bool compaction_zonelist_suitable(struct alloc_context *ac, int order, + int alloc_flags); + extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c60db2096fd8..8dd0333b01dc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -739,6 +739,9 @@ static inline bool is_dev_zone(const struct zone *zone) extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); +bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, unsigned int alloc_flags, + long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags); diff --git a/mm/compaction.c b/mm/compaction.c index 4af1577adb5c..d8a20fcf8678 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1318,7 +1318,8 @@ static enum compact_result compact_finished(struct zone *zone, */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx) + int classzone_idx, + unsigned long wmark_target) { int fragindex; unsigned long watermark; @@ -1341,7 +1342,8 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * allocated and for a short time, the footprint is higher */ watermark += (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) + if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + alloc_flags, wmark_target)) return COMPACT_SKIPPED; /* @@ -1368,7 +1370,8 @@ enum compact_result compaction_suitable(struct zone *zone, int order, { enum compact_result ret; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + zone_page_state(zone, NR_FREE_PAGES)); trace_mm_compaction_suitable(zone, order, ret); if (ret == COMPACT_NOT_SUITABLE_ZONE) ret = COMPACT_SKIPPED; @@ -1376,6 +1379,39 @@ enum compact_result compaction_suitable(struct zone *zone, int order, return ret; } +bool compaction_zonelist_suitable(struct alloc_context *ac, int order, + int alloc_flags) +{ + struct zone *zone; + struct zoneref *z; + + /* + * Make sure at least one zone would pass __compaction_suitable if we continue + * retrying the reclaim.
+ */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + enum compact_result compact_result; + + /* + * Do not consider all the reclaimable memory because we do not + * want to trash just for a single high order allocation which + * is even not guaranteed to appear even if __compaction_suitable + * is happy about the watermark check. + */ + available = zone_reclaimable_pages(zone) / order; + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + compact_result = __compaction_suitable(zone, order, alloc_flags, + ac_classzone_idx(ac), available); + if (compact_result != COMPACT_SKIPPED && + compact_result != COMPACT_NOT_SUITABLE_ZONE) + return true; + } + + return false; +} + static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) { enum compact_result ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dea406a62e3d..089f760ce64a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2750,10 +2750,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * one free page of a suitable size. Checking now avoids taking the zone lock * to check in the allocation paths if no pages are free. */ -static bool __zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, - unsigned int alloc_flags, - long free_pages) +bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, unsigned int alloc_flags, + long free_pages) { long min = mark; int o; @@ -3256,8 +3255,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(unsigned int order, enum compact_result compact_result, - enum migrate_mode *migrate_mode, +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, + enum compact_result compact_result, enum migrate_mode *migrate_mode, int compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; @@ -3281,9 +3280,11 @@ should_compact_retry(unsigned int order, enum compact_result compact_result, /* * make sure the compaction wasn't deferred or didn't bail out early * due to locks contention before we declare that we should give up. + * But do not retry if the given zonelist is not suitable for + * compaction. 
*/ if (compaction_withdrawn(compact_result)) - return true; + return compaction_zonelist_suitable(ac, order, alloc_flags); /* * !costly requests are much more important than __GFP_REPEAT @@ -3311,7 +3312,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(unsigned int order, enum compact_result compact_result, +should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, + enum compact_result compact_result, enum migrate_mode *migrate_mode, int compaction_retries) { @@ -3706,8 +3708,9 @@ retry: * of free memory (see __compaction_suitable) */ if (did_some_progress > 0 && - should_compact_retry(order, compact_result, - &migration_mode, compaction_retries)) + should_compact_retry(ac, order, alloc_flags, + compact_result, &migration_mode, + compaction_retries)) goto retry; /* Reclaim has failed us, start killing things */ -- cgit v1.2.3 From bb8a4b7fd1266ef888b3a80aa5f266062b224ef4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:18 -0700 Subject: mm, oom_reaper: hide oom reaped tasks from OOM killer more carefully Commit 36324a990cf5 ("oom: clear TIF_MEMDIE after oom_reaper managed to unmap the address space") not only clears TIF_MEMDIE for oom reaped task but also set OOM_SCORE_ADJ_MIN for the target task to hide it from the oom killer. This works in simple cases but it is not sufficient for (unlikely) cases where the mm is shared between independent processes (as they do not share signal struct). If the mm had only small amount of memory which could be reaped then another task sharing the mm could be selected and that wouldn't help to move out from the oom situation. Introduce MMF_OOM_REAPED mm flag which is checked in oom_badness (same as OOM_SCORE_ADJ_MIN) and task is skipped if the flag is set. Set the flag after __oom_reap_task is done with a task. This will force the select_bad_process() to ignore all already oom reaped tasks as well as no such task is sacrificed for its parent. Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/oom_kill.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 31bd0d97d178..40eabf176ce2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -521,6 +521,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ +#define MMF_OOM_REAPED 21 /* mm has been already reaped */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 415f7eb913fa..c0376efa79ec 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; + /* + * Do not even consider tasks which are explicitly marked oom + * unkillable or have been already oom reaped. + */ adj = (long)p->signal->oom_score_adj; - if (adj == OOM_SCORE_ADJ_MIN) { + if (adj == OOM_SCORE_ADJ_MIN || + test_bit(MMF_OOM_REAPED, &p->mm->flags)) { task_unlock(p); return 0; } @@ -513,7 +518,7 @@ static bool __oom_reap_task(struct task_struct *tsk) * This task can be safely ignored because we cannot do much more * to release its memory. 
*/ - tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; + set_bit(MMF_OOM_REAPED, &mm->flags); out: mmput(mm); return ret; -- cgit v1.2.3 From ec8d7c14ea14922fe21945b458a75e39f11dd832 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:21 -0700 Subject: mm, oom_reaper: do not mmput synchronously from the oom reaper context Tetsuo has properly noted that mmput slow path might get blocked waiting for another party (e.g. exit_aio waits for an IO). If that happens the oom_reaper would be put out of the way and will not be able to process next oom victim. We should strive for making this context as reliable and independent on other subsystems as much as possible. Introduce mmput_async which will perform the slow path from an async (WQ) context. This will delay the operation but that shouldn't be a problem because the oom_reaper has reclaimed the victim's address space for most cases as much as possible and the remaining context shouldn't bind too much memory anymore. The only exception is when mmap_sem trylock has failed which shouldn't happen too often. The issue is only theoretical but not impossible. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 5 +++++ kernel/fork.c | 50 +++++++++++++++++++++++++++++++++--------------- mm/oom_kill.c | 8 ++++++-- 4 files changed, 48 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1fda9c99ef95..d553855503e6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -513,6 +514,7 @@ struct mm_struct { #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif + struct work_struct async_put_work; }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 40eabf176ce2..479e3cade7e9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2730,6 +2730,11 @@ static inline void mmdrop(struct mm_struct * mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); +/* same as above but performs the slow path from the async kontext. Can + * be called from the atomic context as well + */ +extern void mmput_async(struct mm_struct *); + /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); /* diff --git a/kernel/fork.c b/kernel/fork.c index 3e8451527cbe..8fbed7194af1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +static inline void __mmput(struct mm_struct *mm) +{ + VM_BUG_ON(atomic_read(&mm->mm_users)); + + uprobe_clear_state(mm); + exit_aio(mm); + ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ + exit_mmap(mm); + set_mm_exe_file(mm, NULL); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (mm->binfmt) + module_put(mm->binfmt->module); + mmdrop(mm); +} + /* * Decrement the use count and release all resources for an mm. 
*/ @@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm) { might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) + __mmput(mm); +} +EXPORT_SYMBOL_GPL(mmput); + +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ if (atomic_dec_and_test(&mm->mm_users)) { - uprobe_clear_state(mm); - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); } } -EXPORT_SYMBOL_GPL(mmput); /** * set_mm_exe_file - change a reference to the mm's executable file diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c0376efa79ec..c0e37dd1422f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -446,7 +446,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); - static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; @@ -520,7 +519,12 @@ static bool __oom_reap_task(struct task_struct *tsk) */ set_bit(MMF_OOM_REAPED, &mm->flags); out: - mmput(mm); + /* + * Drop our reference but make sure the mmput slow path is called from a + * different context because we shouldn't risk we get stuck there and + * put the oom_reaper out of the way. + */ + mmput_async(mm); return ret; } -- cgit v1.2.3 From 98748bd722005be9de2662bd4f7e41ad8148bdbd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:24 -0700 Subject: oom: consider multi-threaded tasks in task_will_free_mem task_will_free_mem is a misnomer for a more complex PF_EXITING test for early break out from the oom killer because it is believed that such a task would release its memory shortly and so we do not have to select an oom victim and perform a disruptive action. Currently we make sure that the given task is not participating in the core dumping because it might get blocked for a long time - see commit d003f371b270 ("oom: don't assume that a coredumping thread will exit soon"). The check can still do better though. We shouldn't consider the task unless the whole thread group is going down. This is rather unlikely but not impossible. A single exiting thread would surely leave all the address space behind. If we are really unlucky it might get stuck on the exit path and keep its TIF_MEMDIE and so block the oom killer. 
Link: http://lkml.kernel.org/r/1460452756-15491-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: David Rientjes Cc: Tetsuo Handa Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 83b9c39bd8b7..d3f533f2f481 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -110,13 +110,24 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p); static inline bool task_will_free_mem(struct task_struct *task) { + struct signal_struct *sig = task->signal; + /* * A coredumping process may sleep for an extended period in exit_mm(), * so the oom killer cannot assume that the process will promptly exit * and release memory. */ - return (task->flags & PF_EXITING) && - !(task->signal->flags & SIGNAL_GROUP_COREDUMP); + if (sig->flags & SIGNAL_GROUP_COREDUMP) + return false; + + if (!(task->flags & PF_EXITING)) + return false; + + /* Make sure that the whole thread group is going down */ + if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT)) + return false; + + return true; } /* sysctls */ -- cgit v1.2.3 From f44666b04605d1c7fd94ab90b7ccf633e7eff228 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 20 May 2016 16:57:27 -0700 Subject: mm,oom: speed up select_bad_process() loop Since commit 3a5dda7a17cf ("oom: prevent unnecessary oom kills or kernel panics"), select_bad_process() is using for_each_process_thread(). Since oom_unkillable_task() scans all threads in the caller's thread group and oom_task_origin() scans signal_struct of the caller's thread group, we don't need to call oom_unkillable_task() and oom_task_origin() on each thread. Also, since !mm test will be done later at oom_badness(), we don't need to do !mm test on each thread. Therefore, we only need to do TIF_MEMDIE test on each thread. Although the original code was correct it was quite inefficient because each thread group was scanned num_threads times which can be a lot especially with processes with many threads. Even though the OOM is extremely cold path it is always good to be as effective as possible when we are inside rcu_read_lock() - aka unpreemptible context. If we track number of TIF_MEMDIE threads inside signal_struct, we don't need to do TIF_MEMDIE test on each thread. This will allow select_bad_process() to use for_each_process(). This patch adds a counter to signal_struct for tracking how many TIF_MEMDIE threads are in a given thread group, and check it at oom_scan_process_thread() so that select_bad_process() can use for_each_process() rather than for_each_process_thread(). 
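The underlying pattern, maintaining an aggregate counter at state transitions so that a hot scan can ask one O(1) question per group instead of testing every member, can be sketched in portable C11 (all names here are illustrative, not the kernel API):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for signal_struct: one counter per "thread group". */
    struct signal_state {
        atomic_int oom_victims;
    };

    /* Called where the kernel sets or clears TIF_MEMDIE on a thread. */
    static void mark_victim(struct signal_state *sig)
    {
        atomic_fetch_add(&sig->oom_victims, 1);
    }

    static void exit_victim(struct signal_state *sig)
    {
        atomic_fetch_sub(&sig->oom_victims, 1);
    }

    /* The victim scan tests the group once instead of every thread. */
    static bool group_has_victim(struct signal_state *sig)
    {
        return atomic_load(&sig->oom_victims) > 0;
    }

    int main(void)
    {
        struct signal_state sig = { 0 };

        mark_victim(&sig);
        printf("%d\n", group_has_victim(&sig));  /* 1 */
        exit_victim(&sig);
        printf("%d\n", group_has_victim(&sig));  /* 0 */
        return 0;
    }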
[mhocko@suse.com: do not blow the signal_struct size] Link: http://lkml.kernel.org/r/20160520075035.GF19172@dhcp22.suse.cz Link: http://lkml.kernel.org/r/201605182230.IDC73435.MVSOHLFOQFOJtF@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/oom_kill.c | 17 ++++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 479e3cade7e9..01fe1bb68754 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -669,6 +669,7 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; + atomic_t oom_victims; /* # of TIF_MEDIE threads in this thread group */ struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c0e37dd1422f..5bb2f7698ad7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -283,12 +283,8 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves. */ - if (test_tsk_thread_flag(task, TIF_MEMDIE)) { - if (!is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - } - if (!task->mm) - return OOM_SCAN_CONTINUE; + if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) + return OOM_SCAN_ABORT; /* * If task is allocating a lot of memory and has been marked to be @@ -307,12 +303,12 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, static struct task_struct *select_bad_process(struct oom_control *oc, unsigned int *ppoints, unsigned long totalpages) { - struct task_struct *g, *p; + struct task_struct *p; struct task_struct *chosen = NULL; unsigned long chosen_points = 0; rcu_read_lock(); - for_each_process_thread(g, p) { + for_each_process(p) { unsigned int points; switch (oom_scan_process_thread(oc, p, totalpages)) { @@ -331,9 +327,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc, points = oom_badness(p, NULL, oc->nodemask, totalpages); if (!points || points < chosen_points) continue; - /* Prefer thread group leaders for display purposes */ - if (points == chosen_points && thread_group_leader(chosen)) - continue; chosen = p; chosen_points = points; @@ -673,6 +666,7 @@ void mark_oom_victim(struct task_struct *tsk) /* OOM killer might race with memcg OOM */ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_inc(&tsk->signal->oom_victims); /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -690,6 +684,7 @@ void exit_oom_victim(struct task_struct *tsk) { if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_dec(&tsk->signal->oom_victims); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); -- cgit v1.2.3 From 80c4bd7a5e4368b680e0aeb57050a1b06eb573d8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 20 May 2016 16:57:38 -0700 Subject: mm/vmalloc: keep a separate lazy-free list When mixing lots of vmallocs and set_memory_*() (which calls vm_unmap_aliases()) I encountered situations where the performance degraded severely due to the walking of the entire vmap_area list each invocation. One simple improvement is to add the lazily freed vmap_area to a separate lockless free list, such that we then avoid having to walk the full list on each purge. 
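The llist primitives used for this are a textbook multi-producer, single-consumer lock-free stack: producers push with a compare-and-swap, the consumer detaches the entire list with one exchange. A self-contained C11 model of the same idea (user-space, illustrative only):

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    struct lnode {
        struct lnode *next;
    };

    /* Any number of threads may push concurrently; this mirrors what
     * llist_add() does for vmap_purge_list. */
    static void push(struct lnode *_Atomic *head, struct lnode *n)
    {
        struct lnode *old = atomic_load(head);

        do {
            n->next = old;
        } while (!atomic_compare_exchange_weak(head, &old, n));
    }

    /* The purger detaches everything at once, as llist_del_all() does,
     * so it only ever walks the nodes that were actually queued. */
    static struct lnode *take_all(struct lnode *_Atomic *head)
    {
        return atomic_exchange(head, NULL);
    }

    int main(void)
    {
        struct lnode *_Atomic head = NULL;
        struct lnode a, b, *n;
        int count = 0;

        push(&head, &a);
        push(&head, &b);
        for (n = take_all(&head); n; n = n->next)
            count++;
        printf("purged %d nodes\n", count);  /* purged 2 nodes */
        return 0;
    }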
Signed-off-by: Chris Wilson Reviewed-by: Roman Pen Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Cc: Daniel Vetter Cc: David Rientjes Cc: Joonsoo Kim Cc: Roman Pen Cc: Mel Gorman Cc: Toshi Kani Cc: Shawn Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 ++- mm/vmalloc.c | 39 +++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index d1f1d338af20..957adb741b6f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -4,6 +4,7 @@ #include #include #include +#include #include /* pgprot_t */ #include @@ -44,7 +45,7 @@ struct vmap_area { unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ - struct list_head purge_list; /* "lazy purge" list */ + struct llist_node purge_list; /* "lazy purge" list */ struct vm_struct *vm; struct rcu_head rcu_head; }; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ae7d20b447ff..6e3291882739 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -274,13 +274,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ -#define VM_LAZY_FREE 0x01 -#define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); +static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -601,7 +600,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { static DEFINE_SPINLOCK(purge_lock); - LIST_HEAD(valist); + struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; int nr = 0; @@ -620,20 +619,14 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (sync) purge_fragmented_blocks_allcpus(); - rcu_read_lock(); - list_for_each_entry_rcu(va, &vmap_area_list, list) { - if (va->flags & VM_LAZY_FREE) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - list_add_tail(&va->purge_list, &valist); - va->flags |= VM_LAZY_FREEING; - va->flags &= ~VM_LAZY_FREE; - } + valist = llist_del_all(&vmap_purge_list); + llist_for_each_entry(va, valist, purge_list) { + if (va->va_start < *start) + *start = va->va_start; + if (va->va_end > *end) + *end = va->va_end; + nr += (va->va_end - va->va_start) >> PAGE_SHIFT; } - rcu_read_unlock(); if (nr) atomic_sub(nr, &vmap_lazy_nr); @@ -643,7 +636,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (nr) { spin_lock(&vmap_area_lock); - list_for_each_entry_safe(va, n_va, &valist, purge_list) + llist_for_each_entry_safe(va, n_va, valist, purge_list) __free_vmap_area(va); spin_unlock(&vmap_area_lock); } @@ -678,9 +671,15 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - va->flags |= VM_LAZY_FREE; - atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); - if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) + int nr_lazy; + + nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, + &vmap_lazy_nr); + + /* After this point, we may free va at any time */ + llist_add(&va->purge_list, &vmap_purge_list); + + if (unlikely(nr_lazy > lazy_max_pages())) try_purge_vmap_area_lazy(); } -- cgit v1.2.3 From 
b8ca9e3a612eaf3e54c6fa136c62246a1a9aece7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 May 2016 16:57:53 -0700 Subject: mm: tighten fault_in_pages_writeable() copy_page_to_iter_iovec() is currently the only user of fault_in_pages_writeable(), and it definitely can use fragments from high order pages. Make sure fault_in_pages_writeable() is only touching two adjacent pages at most, as claimed. Signed-off-by: Eric Dumazet Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fe1513ffb7bf..97354102794d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -518,33 +518,27 @@ void page_endio(struct page *page, int rw, int err); extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter); /* - * Fault a userspace page into pagetables. Return non-zero on a fault. - * - * This assumes that two userspace pages are always sufficient. + * Fault one or two userspace pages into pagetables. + * Return -EINVAL if more than two pages would be needed. + * Return non-zero on a fault. */ static inline int fault_in_pages_writeable(char __user *uaddr, int size) { - int ret; + int span, ret; if (unlikely(size == 0)) return 0; + span = offset_in_page(uaddr) + size; + if (span > 2 * PAGE_SIZE) + return -EINVAL; /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. */ ret = __put_user(0, uaddr); - if (ret == 0) { - char __user *end = uaddr + size - 1; - - /* - * If the page was already mapped, this will get a cache miss - * for sure, so try to avoid doing it. - */ - if (((unsigned long)uaddr & PAGE_MASK) != - ((unsigned long)end & PAGE_MASK)) - ret = __put_user(0, end); - } + if (ret == 0 && span > PAGE_SIZE) + ret = __put_user(0, uaddr + size - 1); return ret; } -- cgit v1.2.3 From 7fab358d90e6ba9d9cb702bee0c8a5f5c13bb6df Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Fri, 20 May 2016 16:57:59 -0700 Subject: include/linux/hugetlb*.h: clean up code Macro HUGETLBFS_SB is clear enough, so one statement is clearer than a three-line statement. Remove redundant return statements from void functions, which at least saves lines.
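Looping back to the fault_in_pages_writeable() change above, a worked example of the new span check, assuming PAGE_SIZE == 4096 (the wrapper name below is invented):

/*
 * offset_in_page(uaddr) = 4088, size = 16:
 *   span = 4088 + 16 = 4104 > PAGE_SIZE, so both uaddr and
 *   uaddr + 15 are probed -- exactly two adjacent pages.
 * offset_in_page(uaddr) = 0, size = 12288 (three pages):
 *   span = 12288 > 2 * PAGE_SIZE, so the helper now returns
 *   -EINVAL instead of silently skipping the middle page.
 */
static int prefault_reply_buf(char __user *ubuf, int len)
{
	/* call before taking locks that must not be held across a fault */
	return fault_in_pages_writeable(ubuf, len);
}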
Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 +--- include/linux/hugetlb_cgroup.h | 4 ---- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e44c57876e89..7ef4b635015d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -353,9 +353,7 @@ extern unsigned int default_hstate_idx; static inline struct hstate *hstate_inode(struct inode *i) { - struct hugetlbfs_sb_info *hsb; - hsb = HUGETLBFS_SB(i->i_sb); - return hsb->hstate; + return HUGETLBFS_SB(i->i_sb)->hstate; } static inline struct hstate *hstate_file(struct file *f) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 24154c26d469..063962f6dfc6 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -93,20 +93,17 @@ hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) { - return; } static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page) { - return; } static inline void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { - return; } static inline void hugetlb_cgroup_file_init(void) @@ -116,7 +113,6 @@ static inline void hugetlb_cgroup_file_init(void) static inline void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) { - return; } #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ -- cgit v1.2.3 From d70c17d436b3fb9dbdae8c93bf908a6110b0cb4f Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Fri, 20 May 2016 16:58:01 -0700 Subject: include/linux/hugetlb.h: use bool instead of int for hugepage_migration_supported() It is used as a pure bool function throughout the kernel source.
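The hugetlb_cgroup stubs trimmed above follow the usual config-off pattern; a generic sketch (CONFIG_FOO and foo_uncharge are made-up names): when the feature is compiled out, an empty static inline keeps call sites unconditional, and a bare return; in such a void function adds nothing.

#ifdef CONFIG_FOO
void foo_uncharge(struct page *page);
#else
static inline void foo_uncharge(struct page *page)
{
}
#endif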
Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7ef4b635015d..c26d4638f665 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -452,12 +452,12 @@ static inline pgoff_t basepage_index(struct page *page) extern void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); -static inline int hugepage_migration_supported(struct hstate *h) +static inline bool hugepage_migration_supported(struct hstate *h) { #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION return huge_page_shift(h) == PMD_SHIFT; #else - return 0; + return false; #endif } @@ -519,7 +519,7 @@ static inline pgoff_t basepage_index(struct page *page) return page->index; } #define dissolve_free_huge_pages(s, e) do {} while (0) -#define hugepage_migration_supported(h) 0 +#define hugepage_migration_supported(h) false static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) -- cgit v1.2.3 From 0c9ad804f178eae02a34045bb0916fa0e31623d5 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Fri, 20 May 2016 16:58:04 -0700 Subject: mm: fix comments: if SPARSEMEM, pgdat doesn't have page_ext If SPARSEMEM, use page_ext in mem_section; if !SPARSEMEM, use page_ext in pgdat. Signed-off-by: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8dd0333b01dc..02069c23486d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1063,7 +1063,7 @@ struct mem_section { unsigned long *pageblock_flags; #ifdef CONFIG_PAGE_EXTENSION /* - * If !SPARSEMEM, pgdat doesn't have page_ext pointer. We use + * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; -- cgit v1.2.3 From d2a1a1f0a97a77d25cbada37161dc2ecdf01f93d Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 May 2016 16:58:16 -0700 Subject: mm: use unsigned long constant for page flags struct page->flags is unsigned long, so when shifting bits we should use UL suffix to match it. Found this problem after I added 64-bit CPU specific page flags and failed to compile the kernel: mm/page_alloc.c: In function '__free_one_page': mm/page_alloc.c:672:2: error: integer overflow in expression [-Werror=overflow] Link: http://lkml.kernel.org/r/1461971723-16187-1-git-send-email-yuzhao@google.com Signed-off-by: Yu Zhao Cc: "Kirill A .
Shutemov" Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Jerome Marchand Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a61e06e5fbce..e5a32445f930 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -479,7 +479,7 @@ static inline void ClearPageCompound(struct page *page) } #endif -#define PG_head_mask ((1L << PG_head)) +#define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); @@ -670,7 +670,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) } #ifdef CONFIG_MMU -#define __PG_MLOCKED (1 << PG_mlocked) +#define __PG_MLOCKED (1UL << PG_mlocked) #else #define __PG_MLOCKED 0 #endif @@ -680,11 +680,11 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * these flags set. It they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ - (1 << PG_lru | 1 << PG_locked | \ - 1 << PG_private | 1 << PG_private_2 | \ - 1 << PG_writeback | 1 << PG_reserved | \ - 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED) + (1UL << PG_lru | 1UL << PG_locked | \ + 1UL << PG_private | 1UL << PG_private_2 | \ + 1UL << PG_writeback | 1UL << PG_reserved | \ + 1UL << PG_slab | 1UL << PG_swapcache | 1UL << PG_active | \ + 1UL << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. @@ -695,10 +695,10 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ - (((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) + (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) #define PAGE_FLAGS_PRIVATE \ - (1 << PG_private | 1 << PG_private_2) + (1UL << PG_private | 1UL << PG_private_2) /** * page_has_private - Determine if page has private stuff * @page: The page to be checked -- cgit v1.2.3 From 5f527c2b3ea261bfccb7d12f9feade924cc4987c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 20 May 2016 16:58:24 -0700 Subject: mm: thp: microoptimize compound_mapcount() compound_mapcount() is only called after PageCompound() has already been checked by the caller, so there's no point to check it again. Gcc may optimize it away too because it's inline but this will remove the runtime check for sure and add it'll add an assert instead. Link: http://lkml.kernel.org/r/1462547040-1737-3-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Kirill A. 
Shutemov Cc: Alex Williamson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2b97be1147ec..65d18a45b8e8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -475,8 +475,7 @@ static inline atomic_t *compound_mapcount_ptr(struct page *page) static inline int compound_mapcount(struct page *page) { - if (!PageCompound(page)) - return 0; + VM_BUG_ON_PAGE(!PageCompound(page), page); page = compound_head(page); return atomic_read(compound_mapcount_ptr(page)) + 1; } -- cgit v1.2.3 From d2005e3f41d4f9299e2df6a967c8beb5086967a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 20 May 2016 16:58:36 -0700 Subject: userfaultfd: don't pin the user memory in userfaultfd_file_create() userfaultfd_file_create() increments mm->mm_users; this means that the memory won't be unmapped/freed if mm owner exits/execs, and UFFDIO_COPY after that can populate the orphaned mm more. Change userfaultfd_file_create() and userfaultfd_ctx_put() to use mm->mm_count to pin mm_struct. This means that atomic_inc_not_zero(mm->mm_users) is needed when we are going to actually play with this memory. Except handle_userfault() path doesn't need this, the caller must already have a reference. The patch adds the new trivial helper, mmget_not_zero(), it can have more users. Link: http://lkml.kernel.org/r/20160516172254.GA8595@redhat.com Signed-off-by: Oleg Nesterov Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 41 ++++++++++++++++++++++++++++------------- include/linux/sched.h | 7 ++++++- 2 files changed, 34 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 66cdb44616d5..2d97952e341a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -137,7 +137,7 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); - mmput(ctx->mm); + mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } } @@ -434,6 +434,9 @@ static int userfaultfd_release(struct inode *inode, struct file *file) ACCESS_ONCE(ctx->released) = true; + if (!mmget_not_zero(mm)) + goto wakeup; + /* * Flush page faults out of all CPUs. 
NOTE: all page faults * must be retried without returning VM_FAULT_SIGBUS if @@ -466,7 +469,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } up_write(&mm->mmap_sem); - + mmput(mm); +wakeup: /* * After no new page faults can wait on this fault_*wqh, flush * the last page faults that may have been already waiting on @@ -760,10 +764,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, start = uffdio_register.range.start; end = start + uffdio_register.range.len; + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + down_write(&mm->mmap_sem); vma = find_vma_prev(mm, start, &prev); - - ret = -ENOMEM; if (!vma) goto out_unlock; @@ -864,6 +870,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } while (vma && vma->vm_start < end); out_unlock: up_write(&mm->mmap_sem); + mmput(mm); if (!ret) { /* * Now that we scanned all vmas we can already tell @@ -902,10 +909,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, start = uffdio_unregister.start; end = start + uffdio_unregister.len; + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + down_write(&mm->mmap_sem); vma = find_vma_prev(mm, start, &prev); - - ret = -ENOMEM; if (!vma) goto out_unlock; @@ -998,6 +1007,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } while (vma && vma->vm_start < end); out_unlock: up_write(&mm->mmap_sem); + mmput(mm); out: return ret; } @@ -1067,9 +1077,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) goto out; - - ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len); + if (mmget_not_zero(ctx->mm)) { + ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len); + mmput(ctx->mm); + } if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; if (ret < 0) @@ -1110,8 +1122,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) goto out; - ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); + if (mmget_not_zero(ctx->mm)) { + ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + mmput(ctx->mm); + } if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; if (ret < 0) @@ -1289,12 +1304,12 @@ static struct file *userfaultfd_file_create(int flags) ctx->released = false; ctx->mm = current->mm; /* prevent the mm struct to be freed */ - atomic_inc(&ctx->mm->mm_users); + atomic_inc(&ctx->mm->mm_count); file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); if (IS_ERR(file)) { - mmput(ctx->mm); + mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } out: diff --git a/include/linux/sched.h b/include/linux/sched.h index 01fe1bb68754..6b3213d96da6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2723,12 +2723,17 @@ extern struct mm_struct * mm_alloc(void); /* mmdrop drops the mm and the page tables */ extern void __mmdrop(struct mm_struct *); -static inline void mmdrop(struct mm_struct * mm) +static inline void mmdrop(struct mm_struct *mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +static inline bool mmget_not_zero(struct mm_struct *mm) +{ + return atomic_inc_not_zero(&mm->mm_users); +} + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* same 
as above but performs the slow path from the async context. Can -- cgit v1.2.3 From 4b50bcc7eda4d3cc9e3f2a0aa60e590fedf728c5 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Fri, 20 May 2016 16:58:38 -0700 Subject: mm: use phys_addr_t for reserve_bootmem_region() arguments Since commit 92923ca3aace ("mm: meminit: only set page reserved in the memblock region") the reserved bit is set on reserved memblock regions. However, the start and end addresses are passed as unsigned long. This is only 32bit on i386, so it can end up marking the wrong pages reserved for ranges at 4GB and above. This was observed on a 32bit Xen dom0 which was booted with initial memory set to a value below 4G but allowing to balloon in memory (dom0_mem=1024M for example). This would define a reserved bootmem region for the additional memory (for example on an 8GB system there was a reserved region covering the 4GB-8GB range). But since the addresses were passed on as unsigned long, this was actually marking all pages from 0 to 4GB as reserved. Fixes: 92923ca3aacef63 ("mm: meminit: only set page reserved in the memblock region") Link: http://lkml.kernel.org/r/1463491221-10573-1-git-send-email-stefan.bader@canonical.com Signed-off-by: Stefan Bader Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 65d18a45b8e8..fbdb9d40847f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1763,7 +1763,7 @@ extern void free_highmem_page(struct page *page); extern void adjust_managed_page_count(struct page *page, long count); extern void mem_init_print_info(const char *str); -extern void reserve_bootmem_region(unsigned long start, unsigned long end); +extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); /* Free the reserved page into the buddy system, so it gets managed. */ static inline void __free_reserved_page(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3f4b69aaa23a..2dd1ba4e70cc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1205,7 +1205,7 @@ static inline void init_reserved_page(unsigned long pfn) * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator. */ -void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); -- cgit v1.2.3 From 5c0a85fad949212b3e059692deecdeed74ae7ec7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 20 May 2016 16:58:41 -0700 Subject: mm: make faultaround produce old ptes Currently, faultaround code produces young ptes. This can screw up vmscan behaviour[1], as it makes vmscan think that these pages are hot and not push them out on first round. During sparse file access faultaround gets more pages mapped and all of them are young. Under memory pressure, this makes vmscan swap out anon pages instead, or drop other page cache pages which otherwise stay resident. Modify faultaround to produce old ptes, so they can easily be reclaimed under memory pressure. This can to some extent defeat the purpose of faultaround on machines without a hardware accessed bit, as it will not help us with reducing the number of minor page faults.
We may want to disable faultaround on such machines altogether, but that's a subject for a separate patchset. Minchan: "I tested 512M mmap sequential word read test on non-HW access bit system (i.e., ARM) and confirmed it doesn't increase minor fault any more. old: 4096 fault_around minor fault: 131291 elapsed time: 6747645 usec new: 65536 fault_around minor fault: 131291 elapsed time: 6709263 usec 0.56% benefit" [1] https://lkml.kernel.org/r/1460992636-711-1-git-send-email-vinmenon@codeaurora.org Link: http://lkml.kernel.org/r/1463488366-47723-1-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Minchan Kim Tested-by: Minchan Kim Acked-by: Rik van Riel Cc: Mel Gorman Cc: Michal Hocko Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/filemap.c | 2 +- mm/memory.c | 23 ++++++++++++++++++----- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fbdb9d40847f..f223ac26b5d9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -596,7 +596,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) } void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon); + struct page *page, pte_t *pte, bool write, bool anon, bool old); #endif /* diff --git a/mm/filemap.c b/mm/filemap.c index 8f4859989f1b..b418405903bc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2191,7 +2191,7 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; - do_set_pte(vma, addr, page, pte, false, false); + do_set_pte(vma, addr, page, pte, false, false, true); unlock_page(page); goto next; unlock: diff --git a/mm/memory.c b/mm/memory.c index 007c72ad03f6..f29e5ab0342d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2876,7 +2876,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, * vm_ops->map_pages. */ void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon) + struct page *page, pte_t *pte, bool write, bool anon, bool old) { pte_t entry; @@ -2884,6 +2884,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (old) + entry = pte_mkold(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address, false); @@ -3021,9 +3023,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); - do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) goto unlock_out; + do_fault_around(vma, address, pte, pgoff, flags); + /* Check if the fault is handled by faultaround */ + if (!pte_same(*pte, orig_pte)) { + /* + * Faultaround produces an old pte, but the pte we've + * handled the fault for should be young.
+ */ + pte_t entry = pte_mkyoung(*pte); + if (ptep_set_access_flags(vma, address, pte, entry, 0)) + update_mmu_cache(vma, address, pte); + goto unlock_out; + } pte_unmap_unlock(pte, ptl); } @@ -3038,7 +3051,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, false, false); + do_set_pte(vma, address, fault_page, pte, false, false, false); unlock_page(fault_page); unlock_out: pte_unmap_unlock(pte, ptl); @@ -3090,7 +3103,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } goto uncharge_out; } - do_set_pte(vma, address, new_page, pte, true, true); + do_set_pte(vma, address, new_page, pte, true, true, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); @@ -3147,7 +3160,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, true, false); + do_set_pte(vma, address, fault_page, pte, true, false, false); pte_unmap_unlock(pte, ptl); if (set_page_dirty(fault_page)) -- cgit v1.2.3 From 0bb2fd13b69abfd88880f356903b5c7ca36d5eea Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 20 May 2016 16:58:59 -0700 Subject: mm: page_is_guard(): return false when page_ext arrays are not allocated yet When enabling the below kernel configs: CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM kernel bootup may fail due to the following oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] free_pcppages_bulk+0x2d2/0x8d0 PGD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: ffff88017c080040 ti: ffff88017c084000 task.ti: ffff88017c084000 RIP: 0010:[] [] free_pcppages_bulk+0x2d2/0x8d0 RSP: 0000:ffff88017c087c48 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 0000000000000980 RSI: 0000000000000080 RDI: 0000000000660401 RBP: ffff88017c087cd0 R08: 0000000000000401 R09: 0000000000000009 R10: ffff88017c080040 R11: 000000000000000a R12: 0000000000000400 R13: ffffea0019810000 R14: ffffea0019810040 R15: ffff88066cfe6080 FS: 0000000000000000(0000) GS:ffff88066cd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000002406000 CR4: 00000000000006e0 Call Trace: free_hot_cold_page+0x192/0x1d0 __free_pages+0x5c/0x90 __free_pages_boot_core+0x11a/0x14e deferred_free_range+0x50/0x62 deferred_init_memmap+0x220/0x3c3 kthread+0xf8/0x110 ret_from_fork+0x22/0x40 Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff RIP [] free_pcppages_bulk+0x2d2/0x8d0 RSP CR2: 0000000000000000 The problem is lookup_page_ext() returns NULL then page_is_guard() tried to access it in page freeing. page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension flag, but freeing page might reach here before the page_ext arrays are allocated when feeding a range of pages to the allocator for the first time during bootup or memory hotplug. 
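Before the one-line fix below, a sketch of the defensive pattern any page_ext consumer needs during that early-boot window; the wrapper name is invented, while lookup_page_ext() and the flag are from the patch context.

static inline bool page_is_marked(struct page *page)
{
	struct page_ext *page_ext = lookup_page_ext(page);

	/* page_ext arrays may not be allocated yet during early boot */
	if (unlikely(!page_ext))
		return false;

	return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
}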
When it returns NULL, page_is_guard() should just return false instead of checking PAGE_EXT_DEBUG_GUARD unconditionally. Link: http://lkml.kernel.org/r/1463610225-29060-1-git-send-email-yang.shi@linaro.org Signed-off-by: Yang Shi Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f223ac26b5d9..b530c99e8e81 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2386,6 +2386,9 @@ static inline bool page_is_guard(struct page *page) return false; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return false; + return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); } #else -- cgit v1.2.3 From 55834c59098d0c5a97b0f3247e55832b67facdcf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 20 May 2016 16:59:11 -0700 Subject: mm: kasan: initial memory quarantine implementation Quarantine isolates freed objects in a separate queue. The objects are returned to the allocator later, which helps to detect use-after-free errors. When the object is freed, its state changes from KASAN_STATE_ALLOC to KASAN_STATE_QUARANTINE. The object is poisoned and put into quarantine instead of being returned to the allocator, therefore every subsequent access to that object triggers a KASAN error, and the error handler is able to say where the object has been allocated and deallocated. When it's time for the object to leave quarantine, its state becomes KASAN_STATE_FREE and it's returned to the allocator. From now on the allocator may reuse it for another allocation. Before that happens, it's still possible to detect a use-after-free on that object (it retains the allocation/deallocation stacks). When the allocator reuses this object, the shadow is unpoisoned and the old allocation/deallocation stacks are wiped. Therefore a use of this object, even an incorrect one, won't trigger an ASan warning. Without the quarantine, it's not guaranteed that the objects aren't reused immediately; that's why the probability of catching a use-after-free is lower than with the quarantine in place. Freed objects are first added to per-cpu quarantine queues. When a cache is destroyed or memory shrinking is requested, the objects are moved into the global quarantine queue. Whenever a kmalloc call allows memory reclaiming, the oldest objects are popped out of the global queue until the total size of objects in quarantine is less than 3/4 of the maximum quarantine size (which is a fraction of installed physical memory). As long as an object remains in the quarantine, KASAN is able to report accesses to it, so the chance of reporting a use-after-free is increased. Once the object leaves quarantine, the allocator may reuse it, in which case the object is unpoisoned and KASAN can't detect incorrect accesses to it. Right now quarantine support is only enabled in the SLAB allocator. Unification of KASAN features in SLAB and SLUB will be done later. This patch is based on the "mm: kasan: quarantine" patch originally prepared by Dmitry Chernenkov. A number of improvements have been suggested by Andrey Ryabinin.
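A condensed view of the object lifecycle the patch introduces; the state and function names below are the patch's own, while the flow annotations are a summary, not text from the patch.

/*
 *   kmem_cache_free(cache, obj)
 *     -> kasan_slab_free(): poison the shadow, state ALLOC -> QUARANTINE,
 *        quarantine_put() onto a per-cpu qlist
 *   per-cpu qlist exceeds QUARANTINE_PERCPU_SIZE (1 << 20 bytes)
 *     -> the batch is spliced into the global queue under quarantine_lock
 *   allocation with __GFP_RECLAIM set
 *     -> quarantine_reduce(): pop the oldest entries until the global
 *        queue drops below 3/4 of its cap, state -> FREE, and
 *        ___cache_free() finally returns each object to SLAB
 */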
[glider@google.com: v9] Link: http://lkml.kernel.org/r/1462987130-144092-1-git-send-email-glider@google.com Signed-off-by: Alexander Potapenko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Steven Rostedt Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 13 ++- mm/kasan/Makefile | 1 + mm/kasan/kasan.c | 57 ++++++++-- mm/kasan/kasan.h | 21 +++- mm/kasan/quarantine.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/report.c | 1 + mm/mempool.c | 2 +- mm/slab.c | 12 ++- mm/slab.h | 2 + mm/slab_common.c | 2 + 10 files changed, 387 insertions(+), 15 deletions(-) create mode 100644 mm/kasan/quarantine.c (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 737371b56044..611927f5870d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -50,6 +50,8 @@ void kasan_free_pages(struct page *page, unsigned int order); void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags); +void kasan_cache_shrink(struct kmem_cache *cache); +void kasan_cache_destroy(struct kmem_cache *cache); void kasan_poison_slab(struct page *page); void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); @@ -63,7 +65,8 @@ void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, void kasan_krealloc(const void *object, size_t new_size, gfp_t flags); void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); -void kasan_slab_free(struct kmem_cache *s, void *object); +bool kasan_slab_free(struct kmem_cache *s, void *object); +void kasan_poison_slab_free(struct kmem_cache *s, void *object); struct kasan_cache { int alloc_meta_offset; @@ -88,6 +91,8 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags) {} +static inline void kasan_cache_shrink(struct kmem_cache *cache) {} +static inline void kasan_cache_destroy(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, @@ -105,7 +110,11 @@ static inline void kasan_krealloc(const void *object, size_t new_size, static inline void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) {} -static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} +static inline bool kasan_slab_free(struct kmem_cache *s, void *object) +{ + return false; +} +static inline void kasan_poison_slab_free(struct kmem_cache *s, void *object) {} static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 131daadf40e4..1548749a3d45 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -8,3 +8,4 @@ CFLAGS_REMOVE_kasan.o = -pg CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) obj-y := kasan.o report.o kasan_init.o +obj-$(CONFIG_SLAB) += quarantine.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 38f1dd79acdb..8df666bb23be 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -388,6 +388,16 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, } #endif +void kasan_cache_shrink(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + +void 
kasan_cache_destroy(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -482,7 +492,7 @@ void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) kasan_kmalloc(cache, object, cache->object_size, flags); } -void kasan_slab_free(struct kmem_cache *cache, void *object) +void kasan_poison_slab_free(struct kmem_cache *cache, void *object) { unsigned long size = cache->object_size; unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); @@ -491,18 +501,43 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return; + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); +} + +bool kasan_slab_free(struct kmem_cache *cache, void *object) +{ #ifdef CONFIG_SLAB - if (cache->flags & SLAB_KASAN) { - struct kasan_free_meta *free_info = - get_free_info(cache, object); + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + return false; + + if (likely(cache->flags & SLAB_KASAN)) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - alloc_info->state = KASAN_STATE_FREE; - set_track(&free_info->track, GFP_NOWAIT); + struct kasan_free_meta *free_info = + get_free_info(cache, object); + + switch (alloc_info->state) { + case KASAN_STATE_ALLOC: + alloc_info->state = KASAN_STATE_QUARANTINE; + quarantine_put(free_info, cache); + set_track(&free_info->track, GFP_NOWAIT); + kasan_poison_slab_free(cache, object); + return true; + case KASAN_STATE_QUARANTINE: + case KASAN_STATE_FREE: + pr_err("Double free"); + dump_stack(); + break; + default: + break; + } } + return false; +#else + kasan_poison_slab_free(cache, object); + return false; #endif - - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, @@ -511,6 +546,9 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, unsigned long redzone_start; unsigned long redzone_end; + if (flags & __GFP_RECLAIM) + quarantine_reduce(); + if (unlikely(object == NULL)) return; @@ -541,6 +579,9 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) unsigned long redzone_start; unsigned long redzone_end; + if (flags & __GFP_RECLAIM) + quarantine_reduce(); + if (unlikely(ptr == NULL)) return; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 30a2f0ba0e09..7f7ac51d7faf 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -62,6 +62,7 @@ struct kasan_global { enum kasan_state { KASAN_STATE_INIT, KASAN_STATE_ALLOC, + KASAN_STATE_QUARANTINE, KASAN_STATE_FREE }; @@ -79,9 +80,14 @@ struct kasan_alloc_meta { u32 reserved; }; +struct qlist_node { + struct qlist_node *next; +}; struct kasan_free_meta { - /* Allocator freelist pointer, unused by KASAN. */ - void **freelist; + /* This field is used while the object is in the quarantine. + * Otherwise it might be used for the allocator freelist. 
+ */ + struct qlist_node quarantine_link; struct kasan_track track; }; @@ -105,4 +111,15 @@ static inline bool kasan_report_enabled(void) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); +#ifdef CONFIG_SLAB +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); +void quarantine_reduce(void); +void quarantine_remove_cache(struct kmem_cache *cache); +#else +static inline void quarantine_put(struct kasan_free_meta *info, + struct kmem_cache *cache) { } +static inline void quarantine_reduce(void) { } +static inline void quarantine_remove_cache(struct kmem_cache *cache) { } +#endif + #endif diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c new file mode 100644 index 000000000000..4973505a9bdd --- /dev/null +++ b/mm/kasan/quarantine.c @@ -0,0 +1,291 @@ +/* + * KASAN quarantine. + * + * Author: Alexander Potapenko + * Copyright (C) 2016 Google, Inc. + * + * Based on code by Dmitry Chernenkov. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../slab.h" +#include "kasan.h" + +/* Data structure and operations for quarantine queues. */ + +/* + * Each queue is a singly linked list, which also stores the total size of + * objects inside of it. + */ +struct qlist_head { + struct qlist_node *head; + struct qlist_node *tail; + size_t bytes; +}; + +#define QLIST_INIT { NULL, NULL, 0 } + +static bool qlist_empty(struct qlist_head *q) +{ + return !q->head; +} + +static void qlist_init(struct qlist_head *q) +{ + q->head = q->tail = NULL; + q->bytes = 0; +} + +static void qlist_put(struct qlist_head *q, struct qlist_node *qlink, + size_t size) +{ + if (unlikely(qlist_empty(q))) + q->head = qlink; + else + q->tail->next = qlink; + q->tail = qlink; + qlink->next = NULL; + q->bytes += size; +} + +static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) +{ + if (unlikely(qlist_empty(from))) + return; + + if (qlist_empty(to)) { + *to = *from; + qlist_init(from); + return; + } + + to->tail->next = from->head; + to->tail = from->tail; + to->bytes += from->bytes; + + qlist_init(from); +} + +static void qlist_move(struct qlist_head *from, struct qlist_node *last, + struct qlist_head *to, size_t size) +{ + if (unlikely(last == from->tail)) { + qlist_move_all(from, to); + return; + } + if (qlist_empty(to)) + to->head = from->head; + else + to->tail->next = from->head; + to->tail = last; + from->head = last->next; + last->next = NULL; + from->bytes -= size; + to->bytes += size; +} + + +/* + * The object quarantine consists of per-cpu queues and a global queue, + * guarded by quarantine_lock. + */ +static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); + +static struct qlist_head global_quarantine; +static DEFINE_SPINLOCK(quarantine_lock); + +/* Maximum size of the global queue. */ +static unsigned long quarantine_size; + +/* + * The fraction of physical memory the quarantine is allowed to occupy. + * Quarantine doesn't support memory shrinker with SLAB allocator, so we keep + * the ratio low to avoid OOM.
+ */ +#define QUARANTINE_FRACTION 32 + +#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4) +#define QUARANTINE_PERCPU_SIZE (1 << 20) + +static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) +{ + return virt_to_head_page(qlink)->slab_cache; +} + +static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) +{ + struct kasan_free_meta *free_info = + container_of(qlink, struct kasan_free_meta, + quarantine_link); + + return ((void *)free_info) - cache->kasan_info.free_meta_offset; +} + +static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) +{ + void *object = qlink_to_object(qlink, cache); + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + unsigned long flags; + + local_irq_save(flags); + alloc_info->state = KASAN_STATE_FREE; + ___cache_free(cache, object, _THIS_IP_); + local_irq_restore(flags); +} + +static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) +{ + struct qlist_node *qlink; + + if (unlikely(qlist_empty(q))) + return; + + qlink = q->head; + while (qlink) { + struct kmem_cache *obj_cache = + cache ? cache : qlink_to_cache(qlink); + struct qlist_node *next = qlink->next; + + qlink_free(qlink, obj_cache); + qlink = next; + } + qlist_init(q); +} + +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) +{ + unsigned long flags; + struct qlist_head *q; + struct qlist_head temp = QLIST_INIT; + + local_irq_save(flags); + + q = this_cpu_ptr(&cpu_quarantine); + qlist_put(q, &info->quarantine_link, cache->size); + if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) + qlist_move_all(q, &temp); + + local_irq_restore(flags); + + if (unlikely(!qlist_empty(&temp))) { + spin_lock_irqsave(&quarantine_lock, flags); + qlist_move_all(&temp, &global_quarantine); + spin_unlock_irqrestore(&quarantine_lock, flags); + } +} + +void quarantine_reduce(void) +{ + size_t new_quarantine_size; + unsigned long flags; + struct qlist_head to_free = QLIST_INIT; + size_t size_to_free = 0; + struct qlist_node *last; + + if (likely(READ_ONCE(global_quarantine.bytes) <= + READ_ONCE(quarantine_size))) + return; + + spin_lock_irqsave(&quarantine_lock, flags); + + /* + * Update quarantine size in case of hotplug. Allocate a fraction of + * the installed memory to quarantine minus per-cpu queue limits. 
+ */ + new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + QUARANTINE_FRACTION; + new_quarantine_size -= QUARANTINE_PERCPU_SIZE * num_online_cpus(); + WRITE_ONCE(quarantine_size, new_quarantine_size); + + last = global_quarantine.head; + while (last) { + struct kmem_cache *cache = qlink_to_cache(last); + + size_to_free += cache->size; + if (!last->next || size_to_free > + global_quarantine.bytes - QUARANTINE_LOW_SIZE) + break; + last = last->next; + } + qlist_move(&global_quarantine, last, &to_free, size_to_free); + + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, NULL); +} + +static void qlist_move_cache(struct qlist_head *from, + struct qlist_head *to, + struct kmem_cache *cache) +{ + struct qlist_node *prev = NULL, *curr; + + if (unlikely(qlist_empty(from))) + return; + + curr = from->head; + while (curr) { + struct qlist_node *qlink = curr; + struct kmem_cache *obj_cache = qlink_to_cache(qlink); + + if (obj_cache == cache) { + if (unlikely(from->head == qlink)) { + from->head = curr->next; + prev = curr; + } else + prev->next = curr->next; + if (unlikely(from->tail == qlink)) + from->tail = curr->next; + from->bytes -= cache->size; + qlist_put(to, qlink, cache->size); + } else { + prev = curr; + } + curr = curr->next; + } +} + +static void per_cpu_remove_cache(void *arg) +{ + struct kmem_cache *cache = arg; + struct qlist_head to_free = QLIST_INIT; + struct qlist_head *q; + + q = this_cpu_ptr(&cpu_quarantine); + qlist_move_cache(q, &to_free, cache); + qlist_free_all(&to_free, cache); +} + +void quarantine_remove_cache(struct kmem_cache *cache) +{ + unsigned long flags; + struct qlist_head to_free = QLIST_INIT; + + on_each_cpu(per_cpu_remove_cache, cache, 1); + + spin_lock_irqsave(&quarantine_lock, flags); + qlist_move_cache(&global_quarantine, &to_free, cache); + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, cache); +} diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 60869a5a0124..b3c122ddd454 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -151,6 +151,7 @@ static void object_err(struct kmem_cache *cache, struct page *page, print_track(&alloc_info->track); break; case KASAN_STATE_FREE: + case KASAN_STATE_QUARANTINE: pr_err("Object freed, allocated with size %u bytes\n", alloc_info->alloc_size); free_info = get_free_info(cache, object); diff --git a/mm/mempool.c b/mm/mempool.c index 9b7a14a791cc..9e075f829d0d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -105,7 +105,7 @@ static inline void poison_element(mempool_t *pool, void *element) static void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab) - kasan_slab_free(pool->pool_data, element); + kasan_poison_slab_free(pool->pool_data, element); if (pool->alloc == mempool_kmalloc) kasan_kfree(element); if (pool->alloc == mempool_alloc_pages) diff --git a/mm/slab.c b/mm/slab.c index c11bf5007952..28864c022430 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3547,9 +3547,17 @@ free_done: static inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { - struct array_cache *ac = cpu_cache_get(cachep); + /* Put the object into the quarantine, don't touch it for now. 
*/ + if (kasan_slab_free(cachep, objp)) + return; + + ___cache_free(cachep, objp, caller); +} - kasan_slab_free(cachep, objp); +void ___cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); diff --git a/mm/slab.h b/mm/slab.h index 5969769fbee6..dedb1a920fb8 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -462,4 +462,6 @@ void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 3239bfd758e6..a65dad7fdcd1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -715,6 +715,7 @@ void kmem_cache_destroy(struct kmem_cache *s) get_online_cpus(); get_online_mems(); + kasan_cache_destroy(s); mutex_lock(&slab_mutex); s->refcount--; @@ -753,6 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); + kasan_cache_shrink(cachep); ret = __kmem_cache_shrink(cachep, false); put_online_mems(); put_online_cpus(); -- cgit v1.2.3 From 64f8ebaf115bcddc4aaa902f981c57ba6506bc42 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 20 May 2016 16:59:28 -0700 Subject: mm/kasan: add API to check memory regions Memory access coded in an assembly won't be seen by KASAN as a compiler can instrument only C code. Add kasan_check_[read,write]() API which is going to be used to check a certain memory range. Link: http://lkml.kernel.org/r/1462538722-1574-3-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- include/linux/kasan-checks.h | 12 ++++++++++++ mm/kasan/kasan.c | 12 ++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 include/linux/kasan-checks.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 374ffa2d81b7..8b92445561b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6242,7 +6242,7 @@ S: Maintained F: arch/*/include/asm/kasan.h F: arch/*/mm/kasan_init* F: Documentation/kasan.txt -F: include/linux/kasan.h +F: include/linux/kasan*.h F: lib/test_kasan.c F: mm/kasan/ F: scripts/Makefile.kasan diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h new file mode 100644 index 000000000000..b7f8aced7870 --- /dev/null +++ b/include/linux/kasan-checks.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_KASAN_CHECKS_H +#define _LINUX_KASAN_CHECKS_H + +#ifdef CONFIG_KASAN +void kasan_check_read(const void *p, unsigned int size); +void kasan_check_write(const void *p, unsigned int size); +#else +static inline void kasan_check_read(const void *p, unsigned int size) { } +static inline void kasan_check_write(const void *p, unsigned int size) { } +#endif + +#endif diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index e5beb40d97b1..18b6a2b8d183 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -299,6 +299,18 @@ static void check_memory_region(unsigned long addr, check_memory_region_inline(addr, size, write, ret_ip); } +void kasan_check_read(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, false, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_read); + +void kasan_check_write(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, true, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_write); + #undef memset void *memset(void *addr, int c, size_t len) { -- cgit v1.2.3 From d0d8da2dc49dfdfe1d788eaf4d55eb5d4964d926 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 20 May 2016 16:59:48 -0700 Subject: zsmalloc: require GFP in zs_malloc() Pass GFP flags to zs_malloc() instead of using a fixed mask supplied to zs_create_pool(), so we can be more flexible, but, more importantly, we need this to switch zram to per-cpu compression streams -- zram will try to allocate handle with preemption disabled in a fast path and switch to a slow path (using different gfp mask) if the fast one has failed. Apart from that, this also align zs_malloc() interface with zspool/zbud. 
[sergey.senozhatsky@gmail.com: pass GFP flags to zs_malloc() instead of using a fixed mask] Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 4 ++-- include/linux/zsmalloc.h | 4 ++-- mm/zsmalloc.c | 24 +++++++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 370c2f76016d..b09acdb753ee 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -514,7 +514,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) goto out_error; } - meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); + meta->mem_pool = zs_create_pool(pool_name); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); goto out_error; @@ -717,7 +717,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, src = uncmem; } - handle = zs_malloc(meta->mem_pool, clen); + handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM); if (!handle) { pr_err("Error allocating memory for compressed page: %u, size=%zu\n", index, clen); diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 34eb16098a33..57a8e98f2708 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -41,10 +41,10 @@ struct zs_pool_stats { struct zs_pool; -struct zs_pool *zs_create_pool(const char *name, gfp_t flags); +struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); -unsigned long zs_malloc(struct zs_pool *pool, size_t size); +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); void zs_free(struct zs_pool *pool, unsigned long obj); void *zs_map_object(struct zs_pool *pool, unsigned long handle, diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index ae288c9f7156..aba39a291523 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -247,7 +247,6 @@ struct zs_pool { struct size_class **size_class; struct kmem_cache *handle_cachep; - gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; struct zs_pool_stats stats; @@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool) kmem_cache_destroy(pool->handle_cachep); } -static unsigned long alloc_handle(struct zs_pool *pool) +static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - pool->flags & ~__GFP_HIGHMEM); + gfp & ~__GFP_HIGHMEM); } static void free_handle(struct zs_pool *pool, unsigned long handle) @@ -324,7 +323,12 @@ static void *zs_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { - return zs_create_pool(name, gfp); + /* + * Ignore global gfp flags: zs_malloc() may be invoked from + * different contexts and its caller must provide a valid + * gfp mask. + */ + return zs_create_pool(name); } static void zs_zpool_destroy(void *pool) @@ -335,7 +339,7 @@ static void zs_zpool_destroy(void *pool) static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { - *handle = zs_malloc(pool, size); + *handle = zs_malloc(pool, size, gfp); return *handle ? 
0 : -1; } static void zs_zpool_free(void *pool, unsigned long handle) @@ -1391,7 +1395,7 @@ static unsigned long obj_malloc(struct size_class *class, * otherwise 0. * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; @@ -1400,7 +1404,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - handle = alloc_handle(pool); + handle = alloc_handle(pool, gfp); if (!handle) return 0; @@ -1413,7 +1417,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (!first_page) { spin_unlock(&class->lock); - first_page = alloc_zspage(class, pool->flags); + first_page = alloc_zspage(class, gfp); if (unlikely(!first_page)) { free_handle(pool, handle); return 0; @@ -1878,7 +1882,7 @@ static int zs_register_shrinker(struct zs_pool *pool) * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ -struct zs_pool *zs_create_pool(const char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name) { int i; struct zs_pool *pool; @@ -1948,8 +1952,6 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags) prev_class = class; } - pool->flags = flags; - if (zs_pool_stat_create(pool, name)) goto err; -- cgit v1.2.3 From 5f56a5dfdb9bcb3bca03df59980d4d2f012cbb53 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:16 -0700 Subject: exit_thread: remove empty bodies Define HAVE_EXIT_THREAD for archs which want to do something in exit_thread. For others, let's define exit_thread as an empty inline. This is a cleanup before we change the prototype of exit_thread to accept a task parameter. [akpm@linux-foundation.org: fix mips] Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. 
Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 5 +++++ arch/alpha/kernel/process.c | 8 -------- arch/arc/kernel/process.c | 7 ------- arch/arm/Kconfig | 1 + arch/arm64/kernel/process.c | 7 ------- arch/avr32/Kconfig | 1 + arch/blackfin/include/asm/processor.h | 7 ------- arch/c6x/kernel/process.c | 4 ---- arch/cris/Kconfig | 1 + arch/cris/arch-v10/kernel/process.c | 9 --------- arch/frv/include/asm/processor.h | 7 ------- arch/h8300/include/asm/processor.h | 7 ------- arch/hexagon/kernel/process.c | 7 ------- arch/ia64/Kconfig | 1 + arch/m32r/kernel/process.c | 9 --------- arch/m68k/include/asm/processor.h | 7 ------- arch/metag/Kconfig | 1 + arch/metag/include/asm/processor.h | 2 -- arch/microblaze/include/asm/processor.h | 10 ---------- arch/mips/kernel/process.c | 4 ---- arch/mn10300/Kconfig | 1 + arch/nios2/include/asm/processor.h | 5 ----- arch/openrisc/include/asm/processor.h | 9 --------- arch/parisc/kernel/process.c | 7 ------- arch/powerpc/kernel/process.c | 4 ---- arch/s390/Kconfig | 1 + arch/score/kernel/process.c | 2 -- arch/sh/Kconfig | 1 + arch/sh/kernel/process_32.c | 7 ------- arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/um/kernel/process.c | 4 ---- arch/unicore32/kernel/process.c | 7 ------- arch/x86/Kconfig | 1 + arch/xtensa/Kconfig | 1 + include/linux/sched.h | 7 +++++++ 36 files changed, 24 insertions(+), 140 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 81869a5e7e17..0f298f9123dc 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -517,6 +517,11 @@ config HAVE_ARCH_MMAP_RND_BITS - ARCH_MMAP_RND_BITS_MIN - ARCH_MMAP_RND_BITS_MAX +config HAVE_EXIT_THREAD + bool + help + An architecture implements exit_thread. + config ARCH_MMAP_RND_BITS_MIN int diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 84d13263ce46..b483156698d5 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -210,14 +210,6 @@ start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp) } EXPORT_SYMBOL(start_thread); -/* - * Free current thread data structures etc.. - */ -void -exit_thread(void) -{ -} - void flush_thread(void) { diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index a3f750e76b68..b5db9e7fd649 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -183,13 +183,6 @@ void flush_thread(void) { } -/* - * Free any architecture-specific thread data structures, etc. 
- */ -void exit_thread(void) -{ -} - int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) { return 0; diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b99d25b4133e..956d3575426c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -50,6 +50,7 @@ config ARM select HAVE_DMA_CONTIGUOUS if MMU select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) && !CPU_ENDIAN_BE32 && MMU select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU + select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL) select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) select HAVE_FUNCTION_TRACER if (!XIP_KERNEL) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 48eea6866c67..6cd2612236dc 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -200,13 +200,6 @@ void show_regs(struct pt_regs * regs) __show_regs(regs); } -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ -} - static void tls_thread_flush(void) { asm ("msr tpidr_el0, xzr"); diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 18b88779e701..e43519a2ca89 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -4,6 +4,7 @@ config AVR32 # that we usually don't need on AVR32. select EXPERT select HAVE_CLK + select HAVE_EXIT_THREAD select HAVE_OPROFILE select HAVE_KPROBES select VIRT_TO_BUS diff --git a/arch/blackfin/include/asm/processor.h b/arch/blackfin/include/asm/processor.h index 7acd46653df3..0c265aba94ad 100644 --- a/arch/blackfin/include/asm/processor.h +++ b/arch/blackfin/include/asm/processor.h @@ -75,13 +75,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* - * Free current thread data structures etc.. - */ -static inline void exit_thread(void) -{ -} - /* * Return saved PC of a blocked thread. */ diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c index 3ae9f5a166a0..0ee7686a78f3 100644 --- a/arch/c6x/kernel/process.c +++ b/arch/c6x/kernel/process.c @@ -82,10 +82,6 @@ void flush_thread(void) { } -void exit_thread(void) -{ -} - /* * Do necessary setup to start up a newly executed thread. */ diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 99bda1ba3d2f..5c0ca8ae9293 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -59,6 +59,7 @@ config CRIS select GENERIC_IOMAP select MODULES_USE_ELF_RELA select CLONE_BACKWARDS2 + select HAVE_EXIT_THREAD if ETRAX_ARCH_V32 select OLD_SIGSUSPEND select OLD_SIGACTION select GPIOLIB diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 02b783457be0..96e5afef6b47 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -35,15 +35,6 @@ void default_idle(void) local_irq_enable(); } -/* - * Free current thread data structures etc.. - */ - -void exit_thread(void) -{ - /* Nothing needs to be done. */ -} - /* if the watchdog is enabled, we can simply disable interrupts and go * into an eternal loop, and the watchdog will reset the CPU after 0.1s * if on the other hand the watchdog wasn't enabled, we just enable it and wait diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index ae8d423e79d9..73f0a79ad8e6 100644 --- a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h @@ -96,13 +96,6 @@ extern asmlinkage void *restore_user_regs(const struct user_context *target, ... #define release_segments(mm) do { } while (0) #define forget_segments() do { } while (0) -/* - * Free current thread data structures etc.. 
- */ -static inline void exit_thread(void) -{ -} - /* * Return saved PC of a blocked thread. */ diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h index 54e3fd83c336..111df7397ac7 100644 --- a/arch/h8300/include/asm/processor.h +++ b/arch/h8300/include/asm/processor.h @@ -110,13 +110,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* - * Free current thread data structures etc.. - */ -static inline void exit_thread(void) -{ -} - /* * Return saved PC of a blocked thread. */ diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index a9ebd471823a..d9edfd3fc52a 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -136,13 +136,6 @@ void release_thread(struct task_struct *dead_task) { } -/* - * Free any architecture-specific thread data structures, etc. - */ -void exit_thread(void) -{ -} - /* * Some archs flush debug and FPU info here */ diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index b534ebab36ea..f80758cb7157 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -18,6 +18,7 @@ config IA64 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select HAVE_UNSTABLE_SCHED_CLOCK + select HAVE_EXIT_THREAD select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index e69221d581d5..a88b1f01e91f 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -101,15 +101,6 @@ void show_regs(struct pt_regs * regs) #endif } -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ - /* Nothing to do. */ - DPRINTK("pid = %d\n", current->pid); -} - void flush_thread(void) { DPRINTK("pid = %d\n", current->pid); diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index 20dda1d4b860..a6ce2ec8d693 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -153,13 +153,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* - * Free current thread data structures etc.. - */ -static inline void exit_thread(void) -{ -} - extern unsigned long thread_saved_pc(struct task_struct *tsk); unsigned long get_wchan(struct task_struct *p); diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index a0fa88da3e31..e47a08d72819 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -11,6 +11,7 @@ config METAG select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE + select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER select HAVE_KERNEL_BZIP2 diff --git a/arch/metag/include/asm/processor.h b/arch/metag/include/asm/processor.h index 0838ca699764..a0333ebcac35 100644 --- a/arch/metag/include/asm/processor.h +++ b/arch/metag/include/asm/processor.h @@ -134,8 +134,6 @@ static inline void release_thread(struct task_struct *dead_task) #define copy_segments(tsk, mm) do { } while (0) #define release_segments(mm) do { } while (0) -extern void exit_thread(void); - /* * Return saved PC of a blocked thread. */ diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 497a988d79c2..c38d0dd91134 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -70,11 +70,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Free all resources held by a thread. 
*/ -static inline void exit_thread(void) -{ -} - extern unsigned long thread_saved_pc(struct task_struct *t); extern unsigned long get_wchan(struct task_struct *p); @@ -127,11 +122,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Free current thread data structures etc. */ -static inline void exit_thread(void) -{ -} - /* Return saved (kernel) PC of a blocked thread. */ # define thread_saved_pc(tsk) \ ((tsk)->thread.regs ? (tsk)->thread.regs->r15 : 0) diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index a6b3dc54260a..411c971e3417 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -73,10 +73,6 @@ void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp) regs->regs[29] = sp; } -void exit_thread(void) -{ -} - int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { /* diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 06ddb5501ab1..9627e81a6cbb 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -1,5 +1,6 @@ config MN10300 def_bool y + select HAVE_EXIT_THREAD select HAVE_OPROFILE select HAVE_UID16 select GENERIC_IRQ_SHOW diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index c2ba45c159c7..1c953f0cadbf 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -75,11 +75,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Free current thread data structures etc.. */ -static inline void exit_thread(void) -{ -} - /* Return saved PC of a blocked thread. */ #define thread_saved_pc(tsk) ((tsk)->thread.kregs->ea) diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index 4d235e3d2534..70334c9f7d24 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -84,15 +84,6 @@ void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp); void release_thread(struct task_struct *); unsigned long get_wchan(struct task_struct *p); -/* - * Free current thread data structures etc.. - */ - -extern inline void exit_thread(void) -{ - /* Nothing needs to be done. */ -} - /* * Return saved PC of a blocked thread. For now, this is the "user" PC */ diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 809905a811ed..40639439d8b3 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -144,13 +144,6 @@ void machine_power_off(void) void (*pm_power_off)(void) = machine_power_off; EXPORT_SYMBOL(pm_power_off); -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ -} - void flush_thread(void) { /* Only needs to handle fpu stuff or perf monitors. 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ea8a28fd6f31..e2f12cbcade9 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1329,10 +1329,6 @@ void show_regs(struct pt_regs * regs) show_instructions(regs); } -void exit_thread(void) -{ -} - void flush_thread(void) { #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index de0fcc08dff5..e2c9aaaf64b2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -134,6 +134,7 @@ config S390 select HAVE_DMA_API_DEBUG select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c index a1519ad3d49d..aae9480706c2 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -56,8 +56,6 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) regs->regs[0] = sp; } -void exit_thread(void) {} - /* * When a process does an "exec", machine state like FPU and debug * registers need to be reset. This is a hook function for that. diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 7ed20fc3fc81..cb93af8f8017 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -71,6 +71,7 @@ config SUPERH32 config SUPERH64 def_bool ARCH = "sh64" + select HAVE_EXIT_THREAD select KALLSYMS config ARCH_DEFCONFIG diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 2885fc9d9dcd..ee12e9451874 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -76,13 +76,6 @@ void start_thread(struct pt_regs *regs, unsigned long new_pc, } EXPORT_SYMBOL(start_thread); -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ -} - void flush_thread(void) { struct task_struct *tsk = current; diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index db0a26cffa97..27b3a0ad40a0 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -20,6 +20,7 @@ config SPARC select HAVE_OPROFILE select HAVE_ARCH_KGDB if !SMP || SPARC64 select HAVE_ARCH_TRACEHOOK + select HAVE_EXIT_THREAD select SYSCTL_EXCEPTION_TRACE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select RTC_CLASS diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 81719302b056..174746225577 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -3,6 +3,7 @@ config TILE def_bool y + select HAVE_EXIT_THREAD select HAVE_PERF_EVENTS select USE_PMC if PERF_EVENTS select HAVE_DMA_API_DEBUG diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 48af59aae129..0b04711f1f18 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -103,10 +103,6 @@ void interrupt_end(void) tracehook_notify_resume(regs); } -void exit_thread(void) -{ -} - int get_current_pid(void) { return task_pid_nr(current); diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index b008e9961465..00299c927852 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -201,13 +201,6 @@ void show_regs(struct pt_regs *regs) __backtrace(); } -/* - * Free current thread data structures etc.. 
- */ -void exit_thread(void) -{ -} - void flush_thread(void) { struct thread_info *thread = current_thread_info(); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ace79d2da2c3..8ff5b3be95d4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -105,6 +105,7 @@ config X86 select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_FP_TEST diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 85257afe71c3..64336f666fb6 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -14,6 +14,7 @@ config XTENSA select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK select HAVE_DMA_API_DEBUG + select HAVE_EXIT_THREAD select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if !MMU select HAVE_HW_BREAKPOINT if PERF_EVENTS diff --git a/include/linux/sched.h b/include/linux/sched.h index 6b3213d96da6..167c0d4bf3fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2769,7 +2769,14 @@ static inline int copy_thread_tls( } #endif extern void flush_thread(void); + +#ifdef CONFIG_HAVE_EXIT_THREAD extern void exit_thread(void); +#else +static inline void exit_thread(void) +{ +} +#endif extern void exit_files(struct task_struct *); extern void __cleanup_sighand(struct sighand_struct *); -- cgit v1.2.3 From e64646946ed32902fd597fa6e514b1da84642de3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:20 -0700 Subject: exit_thread: accept a task parameter to be exited We need to call exit_thread from copy_process in a fail path. So make it accept task_struct as a parameter. [v2] * s390: exit_thread_runtime_instr doesn't make sense to be called for non-current tasks. * arm: fix the comment in vfp_thread_copy * change 'me' to 'tsk' for task_struct * now we can change only archs that actually have exit_thread [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. 
Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/process.c | 4 ++-- arch/arm/vfp/vfpmodule.c | 4 ---- arch/avr32/kernel/process.c | 4 ++-- arch/cris/arch-v32/kernel/process.c | 4 ++-- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/kernel/process.c | 14 +++++++------- arch/metag/kernel/process.c | 6 +++--- arch/mn10300/kernel/process.c | 4 ++-- arch/s390/kernel/process.c | 5 +++-- arch/sh/kernel/process_64.c | 5 ++--- arch/sparc/kernel/process_32.c | 12 ++++++------ arch/sparc/kernel/process_64.c | 4 ++-- arch/tile/kernel/process.c | 4 ++-- arch/x86/kernel/process.c | 5 ++--- arch/xtensa/kernel/process.c | 4 ++-- include/linux/sched.h | 4 ++-- kernel/exit.c | 2 +- 17 files changed, 42 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 4adfb46e3ee9..a647d6642f3e 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -193,9 +193,9 @@ EXPORT_SYMBOL_GPL(thread_notify_head); /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - thread_notify(THREAD_NOTIFY_EXIT, current_thread_info()); + thread_notify(THREAD_NOTIFY_EXIT, task_thread_info(tsk)); } void flush_thread(void) diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 2a61e4b04600..73085d3482ed 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -156,10 +156,6 @@ static void vfp_thread_copy(struct thread_info *thread) * - we could be preempted if tree preempt rcu is enabled, so * it is unsafe to use thread->cpu. * THREAD_NOTIFY_EXIT - * - the thread (v) will be running on the local CPU, so - * v === current_thread_info() - * - thread->cpu is the local CPU number at the time it is accessed, - * but may change at any time. * - we could be preempted if tree preempt rcu is enabled, so * it is unsafe to use thread->cpu. 
*/ diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 42a53e740a7e..68e5b9dac059 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -62,9 +62,9 @@ void machine_restart(char *cmd) /* * Free current thread data structures etc */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - ocd_disable(current); + ocd_disable(tsk); } void flush_thread(void) diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index c7ce784a393c..4d1afa9f9fd3 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -33,9 +33,9 @@ void default_idle(void) */ extern void deconfigure_bp(long pid); -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - deconfigure_bp(current->pid); + deconfigure_bp(tsk->pid); } /* diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 9cd607b06964..2436ad5f92c1 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -4542,8 +4542,8 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg /* - * called only from exit_thread(): task == current - * we come here only if current has a context attached (loaded or masked) + * called only from exit_thread() + * we come here only if the task has a context attached (loaded or masked) */ void pfm_exit_thread(struct task_struct *task) diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index b51514957620..aae6c4dc7ae7 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -570,22 +570,22 @@ flush_thread (void) } /* - * Clean up state associated with current thread. This is called when + * Clean up state associated with a thread. This is called when * the thread calls exit(). */ void -exit_thread (void) +exit_thread (struct task_struct *tsk) { - ia64_drop_fpu(current); + ia64_drop_fpu(tsk); #ifdef CONFIG_PERFMON /* if needed, stop monitoring and flush state to perfmon context */ - if (current->thread.pfm_context) - pfm_exit_thread(current); + if (tsk->thread.pfm_context) + pfm_exit_thread(tsk); /* free debug register resources */ - if (current->thread.flags & IA64_THREAD_DBG_VALID) - pfm_release_debug_registers(current); + if (tsk->thread.flags & IA64_THREAD_DBG_VALID) + pfm_release_debug_registers(tsk); #endif } diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c index 7f546183a0f0..35062796edf2 100644 --- a/arch/metag/kernel/process.c +++ b/arch/metag/kernel/process.c @@ -345,10 +345,10 @@ void flush_thread(void) /* * Free current thread data structures etc. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - clear_fpu(¤t->thread); - clear_dsp(¤t->thread); + clear_fpu(&tsk->thread); + clear_dsp(&tsk->thread); } /* TODO: figure out how to unwind the kernel stack here to figure out diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 74a96ccf7451..cbede4e88dee 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -103,9 +103,9 @@ void show_regs(struct pt_regs *regs) /* * free current thread data structures etc.. 
*/ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - exit_fpu(current); + exit_fpu(tsk); } void flush_thread(void) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 481d7a83efc6..bba4fa74b321 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -68,9 +68,10 @@ extern void kernel_thread_starter(void); /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - exit_thread_runtime_instr(); + if (tsk == current) + exit_thread_runtime_instr(); } void flush_thread(void) diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index e2062e643341..9d3e9916555d 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -288,7 +288,7 @@ void show_regs(struct pt_regs *regs) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { /* * See arch/sparc/kernel/process.c for the precedent for doing @@ -307,9 +307,8 @@ void exit_thread(void) * which it would get safely nulled. */ #ifdef CONFIG_SH_FPU - if (last_task_used_math == current) { + if (last_task_used_math == tsk) last_task_used_math = NULL; - } #endif } diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index c5113c7ce2fd..b7780a5bef11 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -184,21 +184,21 @@ unsigned long thread_saved_pc(struct task_struct *tsk) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #ifndef CONFIG_SMP - if(last_task_used_math == current) { + if (last_task_used_math == tsk) { #else - if (test_thread_flag(TIF_USEDFPU)) { + if (test_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU)) { #endif /* Keep process from leaving FPU in a bogon state. */ put_psr(get_psr() | PSR_EF); - fpsave(¤t->thread.float_regs[0], ¤t->thread.fsr, - ¤t->thread.fpqueue[0], ¤t->thread.fpqdepth); + fpsave(&tsk->thread.float_regs[0], &tsk->thread.fsr, + &tsk->thread.fpqueue[0], &tsk->thread.fpqdepth); #ifndef CONFIG_SMP last_task_used_math = NULL; #else - clear_thread_flag(TIF_USEDFPU); + clear_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU); #endif } } diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index c16ef1af1843..fa14402b33f9 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -417,9 +417,9 @@ unsigned long thread_saved_pc(struct task_struct *tsk) } /* Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - struct thread_info *t = current_thread_info(); + struct thread_info *t = task_thread_info(tsk); if (t->utraps) { if (t->utraps[0] < 2) diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index b5f30d376ce1..6b705ccc9cc1 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -541,7 +541,7 @@ void flush_thread(void) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #ifdef CONFIG_HARDWALL /* @@ -550,7 +550,7 @@ void exit_thread(void) * the last reference to a hardwall fd, it would already have * been released and deactivated at this point.) 
*/ - hardwall_deactivate_all(current); + hardwall_deactivate_all(tsk); #endif } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2915d54e9dd5..96becbbb52e0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -97,10 +97,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - struct task_struct *me = current; - struct thread_struct *t = &me->thread; + struct thread_struct *t = &tsk->thread; unsigned long *bp = t->io_bitmap_ptr; struct fpu *fpu = &t->fpu; diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 5bbfed81c97b..e0ded48561db 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -115,10 +115,10 @@ void arch_cpu_idle(void) /* * This is called when the thread calls exit(). */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #if XTENSA_HAVE_COPROCESSORS - coprocessor_release_all(current_thread_info()); + coprocessor_release_all(task_thread_info(tsk)); #endif } diff --git a/include/linux/sched.h b/include/linux/sched.h index 167c0d4bf3fa..02bdab4d6db7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2771,9 +2771,9 @@ static inline int copy_thread_tls( extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD -extern void exit_thread(void); +extern void exit_thread(struct task_struct *tsk); #else -static inline void exit_thread(void) +static inline void exit_thread(struct task_struct *tsk) { } #endif diff --git a/kernel/exit.c b/kernel/exit.c index fd90195667e1..75b34fe835b2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -746,7 +746,7 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - exit_thread(); + exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent -- cgit v1.2.3 From 2eeed7e98d6a1341b1574893a95ce5b8379140f2 Mon Sep 17 00:00:00 2001 From: René Nyffenegger Date: Fri, 20 May 2016 17:00:30 -0700 Subject: include/linux/syscalls.h: use pid_t instead of int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In include/linux/syscalls.h, the four functions sys_kill, sys_tgkill, sys_tkill and sys_rt_sigqueueinfo are declared with "int pid" and "int tgid". However, in kernel/signal.c, the corresponding definitions use the more appropriate "pid_t" (which is a typedef'd int). This patch changes "int" to "pid_t" in the declarations of sys_kill, sys_tgkill, sys_tkill and sys_rt_sigqueueinfo in <linux/syscalls.h> in order to harmonize the function declarations with their respective definitions. 
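[editor's note: a minimal illustration of why this is purely a consistency change. On Linux, pid_t is a typedef'd int, so the old int declarations and the pid_t definitions were already compatible; the hypothetical my_kill() below compiles with both spellings. All names are made up for illustration.]

#include <sys/types.h>			/* pid_t: a typedef'd int on Linux */

long my_kill(int pid, int sig);		/* old-style declaration */

long my_kill(pid_t pid, int sig)	/* definition, kernel/signal.c style */
{
	return (long)(pid + sig);	/* placeholder body */
}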
Link: http://lkml.kernel.org/r/57302FDA.7020205@renenyffenegger.ch Signed-off-by: René Nyffenegger Cc: Andy Lutomirski Cc: Josh Triplett Cc: Al Viro Cc: "Steven Rostedt (Red Hat)" Cc: Zach Brown Cc: Milosz Tanski Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d795472c54d8..d02239022bd0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -371,10 +371,10 @@ asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, size_t sigsetsize); asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t __user *uinfo); -asmlinkage long sys_kill(int pid, int sig); -asmlinkage long sys_tgkill(int tgid, int pid, int sig); -asmlinkage long sys_tkill(int pid, int sig); -asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo); +asmlinkage long sys_kill(pid_t pid, int sig); +asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig); +asmlinkage long sys_tkill(pid_t pid, int sig); +asmlinkage long sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo); asmlinkage long sys_sgetmask(void); asmlinkage long sys_ssetmask(int newmask); asmlinkage long sys_signal(int sig, __sighandler_t handler); -- cgit v1.2.3 From 42a0bb3f71383b457a7db362f1c69e7afb96732b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:33 -0700 Subject: printk/nmi: generic solution for safe printk in NMI printk() takes some locks and cannot be used in a safe way in NMI context. The chance of a deadlock is real, especially when printing stacks from all CPUs. This particular problem has been addressed on x86 by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). The patchset brings two big advantages. First, it makes the NMI backtraces safe on all architectures for free. Second, it makes all NMI messages almost safe on all architectures (the temporary buffer is limited, so we should still keep the number of messages in NMI context to a minimum). Note that there already are several messages printed in NMI context: WARN_ON(in_nmi()), BUG_ON(in_nmi()), anything being printed out from MCE handlers. These are not easy to avoid. This patch reuses most of the code and makes it generic. It is useful for all messages and architectures that support NMI. The alternative printk_func is set when entering and is reset when leaving NMI context. It queues IRQ work to copy the messages into the main ring buffer in a safe context. __printk_nmi_flush() copies all available messages and resets the buffer. We can then use simple cmpxchg operations to synchronize with writers. A spinlock is also used to synchronize with other flushers. We no longer use seq_buf because it depends on an external lock. It would be hard to make all supported operations safe for lockless use. It would be confusing and error-prone to make only some operations safe. The code is put into a separate printk/nmi.c, as suggested by Steven Rostedt. It needs a per-CPU buffer and is compiled only on architectures that call nmi_enter(). This is achieved by the new HAVE_NMI Kconfig flag. The exceptions are the MN10300 and Xtensa architectures; we need to clean up their NMI handling first. Let's do it separately. 
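[editor's note: a minimal userspace sketch of the cmpxchg-based append/flush protocol described above, assuming, as in the patch, at most one writer per buffer at a time; the memory barriers of the real code are elided and all names and sizes are illustrative, not the kernel's.]

#include <stdatomic.h>
#include <stdio.h>

#define BUF_SZ 128

static _Atomic int buf_len;		/* length of valid data in buf */
static char buf[BUF_SZ];

/* Writer: format at the current length, then publish the new length
 * with a compare-and-swap; retry if a flusher reset buf_len to 0
 * while we were formatting. */
static int buf_append(const char *msg)
{
	int len, add;

	do {
		len = atomic_load(&buf_len);
		add = snprintf(buf + len, BUF_SZ - len, "%s", msg);
		if (len + add >= BUF_SZ)
			return 0;	/* no space; message dropped */
	} while (!atomic_compare_exchange_strong(&buf_len, &len, len + add));

	return add;
}

/* Flusher: print everything up to the observed length, then try to
 * release the space; if a writer appended meanwhile, the cmpxchg
 * fails and the remainder is left for the next flush. */
static void buf_flush(void)
{
	int len = atomic_load(&buf_len);

	printf("%.*s", len, buf);
	atomic_compare_exchange_strong(&buf_len, &len, 0);
}

int main(void)
{
	buf_append("hello ");
	buf_append("world\n");
	buf_flush();
	return 0;
}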
The patch is heavily based on the draft from Peter Zijlstra, see https://lkml.org/lkml/2015/6/10/327 [arnd@arndb.de: printk-nmi: use %zu format string for size_t] [akpm@linux-foundation.org: min_t->min - all types are size_t here] Signed-off-by: Petr Mladek Suggested-by: Peter Zijlstra Suggested-by: Steven Rostedt Cc: Jan Kara Acked-by: Russell King [arm part] Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 4 + arch/arm/Kconfig | 1 + arch/arm/kernel/smp.c | 2 + arch/avr32/Kconfig | 1 + arch/blackfin/Kconfig | 1 + arch/cris/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/x86/kernel/apic/hw_nmi.c | 1 - include/linux/hardirq.h | 2 + include/linux/percpu.h | 3 - include/linux/printk.h | 12 ++- init/Kconfig | 5 + init/main.c | 1 + kernel/printk/Makefile | 1 + kernel/printk/internal.h | 44 +++++++++ kernel/printk/nmi.c | 219 ++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 19 +--- lib/nmi_backtrace.c | 89 +---------------- 24 files changed, 306 insertions(+), 107 deletions(-) create mode 100644 kernel/printk/internal.h create mode 100644 kernel/printk/nmi.c (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 0f298f9123dc..8f84fd268dee 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -187,7 +187,11 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config HAVE_NMI + bool + config HAVE_NMI_WATCHDOG + depends on HAVE_NMI bool # # An arch should select this if it provides all these things: diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 956d3575426c..90542db1220d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -67,6 +67,7 @@ config ARM select HAVE_KRETPROBES if (HAVE_KPROBES) select HAVE_MEMBLOCK select HAVE_MOD_ARCH_SPECIFIC + select HAVE_NMI select HAVE_OPROFILE if (HAVE_PERF_EVENTS) select HAVE_OPTPROBES if !THUMB2_KERNEL select HAVE_PERF_EVENTS diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index baee70267f29..df90bc59bfce 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -644,9 +644,11 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; case IPI_CPU_BACKTRACE: + printk_nmi_enter(); irq_enter(); nmi_cpu_backtrace(regs); irq_exit(); + printk_nmi_exit(); break; default: diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index e43519a2ca89..7e75d45e20cd 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -18,6 +18,7 @@ config AVR32 select GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA + select HAVE_NMI help AVR32 is a high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index a63c12259e77..28c63fea786d 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -40,6 +40,7 @@ config BLACKFIN select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select HAVE_DEBUG_STACKOVERFLOW + select HAVE_NMI config GENERIC_CSUM def_bool y diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 5c0ca8ae9293..deba2662b9f3 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -70,6 +70,7 @@ config CRIS select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32 select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32 select 
HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32 + select HAVE_NMI config HZ int diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 5663f411c225..8040fb1845b4 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -48,6 +48,7 @@ config MIPS select GENERIC_SCHED_CLOCK if !CAVIUM_OCTEON_SOC select GENERIC_CMOS_UPDATE select HAVE_MOD_ARCH_SPECIFIC + select HAVE_NMI select VIRT_TO_BUS select MODULES_USE_ELF_REL if MODULES select MODULES_USE_ELF_RELA if MODULES && 64BIT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f0403b58ae8b..01f7464d9fea 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -155,6 +155,7 @@ config PPC select NO_BOOTMEM select HAVE_GENERIC_RCU_GUP select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_NMI if PERF_EVENTS select EDAC_SUPPORT select EDAC_ATOMIC_SCRUB select ARCH_HAS_DMA_SET_COHERENT_MASK diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e2c9aaaf64b2..1c3c43d9d1b5 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -166,6 +166,7 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + select HAVE_NMI config SCHED_OMIT_FRAME_POINTER diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index cb93af8f8017..f6254341c065 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -44,6 +44,7 @@ config SUPERH select OLD_SIGSUSPEND select OLD_SIGACTION select HAVE_ARCH_AUDITSYSCALL + select HAVE_NMI help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 27b3a0ad40a0..1012f7ffcdf5 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -79,6 +79,7 @@ config SPARC64 select NO_BOOTMEM select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select HAVE_NMI config ARCH_DEFCONFIG string diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 174746225577..76989b878f3c 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -30,6 +30,7 @@ config TILE select HAVE_DEBUG_STACKOVERFLOW select ARCH_WANT_FRAME_POINTERS select HAVE_CONTEXT_TRACKING + select HAVE_NMI if USE_PMC select EDAC_SUPPORT select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ff5b3be95d4..0a7b885964ba 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -131,6 +131,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 045e424fb368..7788ce643bf4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,7 +18,6 @@ #include #include #include -#include #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index dfd59d6bc6f0..c683996110b1 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -61,6 +61,7 @@ extern void irq_exit(void); #define nmi_enter() \ do { \ + printk_nmi_enter(); \ lockdep_off(); \ ftrace_nmi_enter(); \ BUG_ON(in_nmi()); \ @@ -77,6 +78,7 @@ extern void irq_exit(void); preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ lockdep_on(); \ + printk_nmi_exit(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 4bc6dafb703e..56939d3f6e53 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -129,7 
+129,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) -/* To avoid include hell, as printk can not declare this, we declare it here */ -DECLARE_PER_CPU(printk_func_t, printk_func); - #endif /* __LINUX_PERCPU_H */ diff --git a/include/linux/printk.h b/include/linux/printk.h index 9ccbdf2c1453..51dd6b824fe2 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -122,7 +122,17 @@ static inline __printf(1, 2) __cold void early_printk(const char *s, ...) { } #endif -typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); +#ifdef CONFIG_PRINTK_NMI +extern void printk_nmi_init(void); +extern void printk_nmi_enter(void); +extern void printk_nmi_exit(void); +extern void printk_nmi_flush(void); +#else +static inline void printk_nmi_init(void) { } +static inline void printk_nmi_enter(void) { } +static inline void printk_nmi_exit(void) { } +static inline void printk_nmi_flush(void) { } +#endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK asmlinkage __printf(5, 0) diff --git a/init/Kconfig b/init/Kconfig index 79a91a2c0444..bccc1d607be5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1454,6 +1454,11 @@ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. +config PRINTK_NMI + def_bool y + depends on PRINTK + depends on HAVE_NMI + config BUG bool "BUG() support" if EXPERT default y diff --git a/init/main.c b/init/main.c index 2075fafaad59..fa9b2bdde183 100644 --- a/init/main.c +++ b/init/main.c @@ -569,6 +569,7 @@ asmlinkage __visible void __init start_kernel(void) timekeeping_init(); time_init(); sched_clock_postinit(); + printk_nmi_init(); perf_event_init(); profile_init(); call_function_init(); diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 85405bdcf2b3..abb0042a427b 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,2 +1,3 @@ obj-y = printk.o +obj-$(CONFIG_PRINTK_NMI) += nmi.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h new file mode 100644 index 000000000000..2de99faedfc1 --- /dev/null +++ b/kernel/printk/internal.h @@ -0,0 +1,44 @@ +/* + * internal.h - printk internal definitions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include + +typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); + +int __printf(1, 0) vprintk_default(const char *fmt, va_list args); + +#ifdef CONFIG_PRINTK_NMI + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it temporary stores the strings into a per-CPU buffer. + * The alternative implementation is chosen transparently + * via per-CPU variable. 
+ */ +DECLARE_PER_CPU(printk_func_t, printk_func); +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return this_cpu_read(printk_func)(fmt, args); +} + +#else /* CONFIG_PRINTK_NMI */ + +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return vprintk_default(fmt, args); +} + +#endif /* CONFIG_PRINTK_NMI */ diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c new file mode 100644 index 000000000000..303cf0d15e57 --- /dev/null +++ b/kernel/printk/nmi.c @@ -0,0 +1,219 @@ +/* + * nmi.c - Safe printk in NMI context + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it uses an alternative implementation that temporary stores + * the strings into a per-CPU buffer. The content of the buffer + * is later flushed into the main ring buffer via IRQ work. + * + * The alternative implementation is chosen transparently + * via @printk_func per-CPU variable. + * + * The implementation allows to flush the strings also from another CPU. + * There are situations when we want to make sure that all buffers + * were handled or when IRQs are blocked. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; +static int printk_nmi_irq_ready; + +#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) + +struct nmi_seq_buf { + atomic_t len; /* length of written data */ + struct irq_work work; /* IRQ work that flushes the buffer */ + unsigned char buffer[NMI_LOG_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + int add = 0; + size_t len; + +again: + len = atomic_read(&s->len); + + if (len >= sizeof(s->buffer)) + return 0; + + /* + * Make sure that all old data have been read before the buffer was + * reseted. This is not needed when we just append data. + */ + if (!len) + smp_rmb(); + + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + + /* + * Do it once again if the buffer has been flushed in the meantime. + * Note that atomic_cmpxchg() is an implicit memory barrier that + * makes sure that the data were written before updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, len + add) != len) + goto again; + + /* Get flushed in a more safe context. */ + if (add && printk_nmi_irq_ready) { + /* Make sure that IRQ work is really initialized. 
*/ + smp_rmb(); + irq_work_queue(&s->work); + } + + return add; +} + +/* + * printk one line from the temporary buffer from @start index until + * and including the @end index. + */ +static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + +/* + * Flush data from the associated per_CPU buffer. The function + * can be called either via IRQ work or independently. + */ +static void __printk_nmi_flush(struct irq_work *work) +{ + static raw_spinlock_t read_lock = + __RAW_SPIN_LOCK_INITIALIZER(read_lock); + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + unsigned long flags; + size_t len, size; + int i, last_i; + + /* + * The lock has two functions. First, one reader has to flush all + * available message to make the lockless synchronization with + * writers easier. Second, we do not want to mix messages from + * different CPUs. This is especially important when printing + * a backtrace. + */ + raw_spin_lock_irqsave(&read_lock, flags); + + i = 0; +more: + len = atomic_read(&s->len); + + /* + * This is just a paranoid check that nobody has manipulated + * the buffer an unexpected way. If we printed something then + * @len must only increase. + */ + if (i && i >= len) + pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", + i, len); + + if (!len) + goto out; /* Someone else has already flushed the buffer. */ + + /* Make sure that data has been written up to the @len */ + smp_rmb(); + + size = min(len, sizeof(s->buffer)); + last_i = i; + + /* Print line by line. */ + for (; i < size; i++) { + if (s->buffer[i] == '\n') { + print_nmi_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < size) { + print_nmi_seq_line(s, last_i, size - 1); + pr_cont("\n"); + } + + /* + * Check that nothing has got added in the meantime and truncate + * the buffer. Note that atomic_cmpxchg() is an implicit memory + * barrier that makes sure that the data were copied before + * updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, 0) != len) + goto more; + +out: + raw_spin_unlock_irqrestore(&read_lock, flags); +} + +/** + * printk_nmi_flush - flush all per-cpu nmi buffers. + * + * The buffers are flushed automatically via IRQ work. This function + * is useful only when someone wants to be sure that all buffers have + * been flushed at some point. + */ +void printk_nmi_flush(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); +} + +void __init printk_nmi_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + + init_irq_work(&s->work, __printk_nmi_flush); + } + + /* Make sure that IRQ works are initialized before enabling. */ + smp_wmb(); + printk_nmi_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. 
*/ + printk_nmi_flush(); +} + +void printk_nmi_enter(void) +{ + this_cpu_write(printk_func, vprintk_nmi); +} + +void printk_nmi_exit(void) +{ + this_cpu_write(printk_func, vprintk_default); +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bfbf284e4218..71eba0607034 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #include "console_cmdline.h" #include "braille.h" +#include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ @@ -1807,14 +1808,6 @@ int vprintk_default(const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(vprintk_default); -/* - * This allows printk to be diverted to another function per cpu. - * This is useful for calling printk functions from within NMI - * without worrying about race conditions that can lock up the - * box. - */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; - /** * printk - print a kernel message * @fmt: format string @@ -1838,21 +1831,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; */ asmlinkage __visible int printk(const char *fmt, ...) { - printk_func_t vprintk_func; va_list args; int r; va_start(args, fmt); - - /* - * If a caller overrides the per_cpu printk_func, then it needs - * to disable preemption when calling printk(). Otherwise - * the printk_func should be set to the default. No need to - * disable preemption here. - */ - vprintk_func = this_cpu_read(printk_func); r = vprintk_func(fmt, args); - va_end(args); return r; diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 6019c53c669e..26caf51cc238 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -16,33 +16,14 @@ #include #include #include -#include #ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) -{ - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); -} - /* * When raise() is called it will be is passed a pointer to the * backtrace_mask. Architectures that call nmi_cpu_backtrace() @@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end) void nmi_trigger_all_cpu_backtrace(bool include_self, void (*raise)(cpumask_t *mask)) { - struct nmi_seq_buf *s; - int i, cpu, this_cpu = get_cpu(); + int i, this_cpu = get_cpu(); if (test_and_set_bit(0, &backtrace_flag)) { /* @@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, if (!include_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - if (!cpumask_empty(to_cpumask(backtrace_mask))) { pr_info("Sending NMI to %s CPUs:\n", (include_self ? 
"all" : "other")); @@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, } /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. + * Force flush any remote buffers that might be stuck in IRQ context + * and therefore could not run their irq_work. */ - for_each_cpu(cpu, &printtrace_mask) { - int len, last_i = 0; + printk_nmi_flush(); - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); + clear_bit_unlock(0, &backtrace_flag); put_cpu(); } -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. - */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; -} - bool nmi_cpu_backtrace(struct pt_regs *regs) { int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); pr_warn("NMI backtrace for cpu %d\n", cpu); if (regs) show_regs(regs); else dump_stack(); - this_cpu_write(printk_func, printk_func_save); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } -- cgit v1.2.3 From cf9b1106c81c45cde02208fca49d3f3e4ab6ee74 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:42 -0700 Subject: printk/nmi: flush NMI messages on the system panic In NMI context, printk() messages are stored into per-CPU buffers to avoid a possible deadlock. They are normally flushed to the main ring buffer via an IRQ work. But the work is never called when the system calls panic() in the very same NMI handler. This patch tries to flush NMI buffers before the crash dump is generated. In this case it does not risk a double release and bails out when the logbuf_lock is already taken. The aim is to get the messages into the main ring buffer when possible. It makes them better accessible in the vmcore. Then the patch tries to flush the buffers second time when other CPUs are down. It might be more aggressive and reset logbuf_lock. The aim is to get the messages available for the consequent kmsg_dump() and console_flush_on_panic() calls. The patch causes vprintk_emit() to be called even in NMI context again. But it is done via printk_deferred() so that the console handling is skipped. Consoles use internal locks and we could not prevent a deadlock easily. They are explicitly called later when the crash dump is not generated, see console_flush_on_panic(). 
Signed-off-by: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Daniel Thompson Cc: David Miller Cc: Ingo Molnar Cc: Jan Kara Cc: Jiri Kosina Cc: Martin Schwidefsky Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 2 ++ kernel/kexec_core.c | 1 + kernel/panic.c | 6 +++++- kernel/printk/internal.h | 2 ++ kernel/printk/nmi.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/printk/printk.c | 2 +- 6 files changed, 49 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 51dd6b824fe2..f4da695fd615 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -127,11 +127,13 @@ extern void printk_nmi_init(void); extern void printk_nmi_enter(void); extern void printk_nmi_exit(void); extern void printk_nmi_flush(void); +extern void printk_nmi_flush_on_panic(void); #else static inline void printk_nmi_init(void) { } static inline void printk_nmi_enter(void) { } static inline void printk_nmi_exit(void) { } static inline void printk_nmi_flush(void) { } +static inline void printk_nmi_flush_on_panic(void) { } #endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1c03dfb4abfd..d5d408252992 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -893,6 +893,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ + printk_nmi_flush_on_panic(); __crash_kexec(regs); /* diff --git a/kernel/panic.c b/kernel/panic.c index 535c96510a44..8aa74497cc5a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -160,8 +160,10 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (!crash_kexec_post_notifiers) + if (!crash_kexec_post_notifiers) { + printk_nmi_flush_on_panic(); __crash_kexec(NULL); + } /* * Note smp_send_stop is the usual smp shutdown function, which @@ -176,6 +178,8 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + /* Call flush even twice. It tries harder with a single online CPU */ + printk_nmi_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 341bedccc065..7fd2838fa417 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -22,6 +22,8 @@ int __printf(1, 0) vprintk_default(const char *fmt, va_list args); #ifdef CONFIG_PRINTK_NMI +extern raw_spinlock_t logbuf_lock; + /* * printk() could not take logbuf_lock in NMI context. Instead, * it temporary stores the strings into a per-CPU buffer. diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index bf08557d7e3d..b69eb8a2876f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -106,7 +107,16 @@ static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) { const char *buf = s->buffer + start; - printk("%.*s", (end - start) + 1, buf); + /* + * The buffers are flushed in NMI only on panic. The messages must + * go only into the ring buffer at this stage. Consoles will get + * explicitly called later when a crashdump is not generated. 
+ */ + if (in_nmi()) + printk_deferred("%.*s", (end - start) + 1, buf); + else + printk("%.*s", (end - start) + 1, buf); + } /* @@ -194,6 +204,33 @@ void printk_nmi_flush(void) __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); } +/** + * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system + * goes down. + * + * Similar to printk_nmi_flush() but it can be called even in NMI context when + * the system goes down. It does the best effort to get NMI messages into + * the main ring buffer. + * + * Note that it could try harder when there is only one CPU online. + */ +void printk_nmi_flush_on_panic(void) +{ + /* + * Make sure that we could access the main ring buffer. + * Do not risk a double release when more CPUs are up. + */ + if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (num_online_cpus() > 1) + return; + + debug_locks_off(); + raw_spin_lock_init(&logbuf_lock); + } + + printk_nmi_flush(); +} + void __init printk_nmi_init(void) { int cpu; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e38579d730f4..60cdf6386763 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -245,7 +245,7 @@ __packed __aligned(4) * within the scheduler's rq lock. It must be released before calling * console_unlock() or anything else that might wake up a process. */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); +DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); -- cgit v1.2.3 From 8da4b8c48e7b43cb16d05e1dbb34ad9f73ab7efd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:00 -0700 Subject: lib/uuid.c: move generate_random_uuid() to uuid.c Let's gather the UUID related functions under one hood. Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 21 +-------------------- fs/btrfs/volumes.c | 2 +- fs/ext4/ioctl.c | 2 +- fs/f2fs/file.c | 2 +- fs/reiserfs/objectid.c | 2 +- fs/ubifs/sb.c | 2 +- include/linux/random.h | 1 - include/linux/uuid.h | 2 ++ lib/uuid.c | 20 ++++++++++++++++++++ 9 files changed, 28 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/random.c b/drivers/char/random.c index b583e5336630..0158d3bff7e5 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -260,6 +260,7 @@ #include #include #include +#include #include #include @@ -1621,26 +1622,6 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count, return urandom_read(NULL, buf, count, NULL); } -/*************************************************************** - * Random UUID interface - * - * Used here for a Boot ID, but can be useful for other kernel - * drivers. 
- ***************************************************************/ - -/* - * Generate random UUID - */ -void generate_random_uuid(unsigned char uuid_out[16]) -{ - get_random_bytes(uuid_out, 16); - /* Set UUID version to 4 --- truly random generation */ - uuid_out[6] = (uuid_out[6] & 0x0F) | 0x40; - /* Set the UUID variant to DCE */ - uuid_out[8] = (uuid_out[8] & 0x3F) | 0x80; -} -EXPORT_SYMBOL(generate_random_uuid); - /******************************************************************** * * Sysctl interface diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bd0f45fb38c4..bfb80da3e6eb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,13 +20,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include "ctree.h" #include "extent_map.h" diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index eae5917c534e..7497f50cb293 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include "ext4_jbd2.h" #include "ext4.h" diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index eb9d027e5981..c6b14951bef3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 99a5d5dae46a..415d66ca87d1 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -3,8 +3,8 @@ */ #include -#include #include +#include #include "reiserfs.h" /* find where objectid map starts */ diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index f4fbc7b6b794..3cbb904a6d7d 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -28,8 +28,8 @@ #include "ubifs.h" #include -#include #include +#include /* * Default journal size in logical eraseblocks as a percent of total diff --git a/include/linux/random.h b/include/linux/random.h index 9c29122037f9..e47e533742b5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -26,7 +26,6 @@ extern void get_random_bytes(void *buf, int nbytes); extern int add_random_ready_callback(struct random_ready_callback *rdy); extern void del_random_ready_callback(struct random_ready_callback *rdy); extern void get_random_bytes_arch(void *buf, int nbytes); -void generate_random_uuid(unsigned char uuid_out[16]); extern int random_int_secret_init(void); #ifndef MODULE diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 6df2509033d7..91c2b6d9cbb7 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -33,6 +33,8 @@ static inline int uuid_be_cmp(const uuid_be u1, const uuid_be u2) return memcmp(&u1, &u2, sizeof(uuid_be)); } +void generate_random_uuid(unsigned char uuid[16]); + extern void uuid_le_gen(uuid_le *u); extern void uuid_be_gen(uuid_be *u); diff --git a/lib/uuid.c b/lib/uuid.c index 398821e4dce1..6c81c0b0467e 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -23,6 +23,26 @@ #include #include +/*************************************************************** + * Random UUID interface + * + * Used here for a Boot ID, but can be useful for other kernel + * drivers. 
+ ***************************************************************/ + +/* + * Generate random UUID + */ +void generate_random_uuid(unsigned char uuid[16]) +{ + get_random_bytes(uuid, 16); + /* Set UUID version to 4 --- truly random generation */ + uuid[6] = (uuid[6] & 0x0F) | 0x40; + /* Set the UUID variant to DCE */ + uuid[8] = (uuid[8] & 0x3F) | 0x80; +} +EXPORT_SYMBOL(generate_random_uuid); + static void __uuid_gen_common(__u8 b[16]) { prandom_bytes(b, 16); -- cgit v1.2.3 From 2b1b0d66704a8cafe83be7114ec4c15ab3a314ad Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:04 -0700 Subject: lib/uuid.c: introduce a few more generic helpers There are new helpers in this patch: uuid_is_valid() checks whether a UUID string is valid; uuid_be_to_bin() converts a string to binary (big endian); uuid_le_to_bin() converts a string to binary (little endian). They will be used by the following patches in this series. This also moves the indices arrays to lib/uuid.c to be shared across modules. [andriy.shevchenko@linux.intel.com: fix typo] Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/uuid.h | 13 +++++++++++ lib/uuid.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/vsprintf.c | 9 ++++---- 3 files changed, 82 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 91c2b6d9cbb7..e0b95e728a77 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -22,6 +22,11 @@ #include +/* + * The length of a UUID string ("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee") + * not including trailing NUL. + */ +#define UUID_STRING_LEN 36 static inline int uuid_le_cmp(const uuid_le u1, const uuid_le u2) { @@ -38,4 +43,12 @@ void generate_random_uuid(unsigned char uuid[16]); extern void uuid_le_gen(uuid_le *u); extern void uuid_be_gen(uuid_be *u); +bool __must_check uuid_is_valid(const char *uuid); + +extern const u8 uuid_le_index[16]; +extern const u8 uuid_be_index[16]; + +int uuid_le_to_bin(const char *uuid, uuid_le *u); +int uuid_be_to_bin(const char *uuid, uuid_be *u); + #endif diff --git a/lib/uuid.c b/lib/uuid.c index 6c81c0b0467e..82787f652fbc 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -19,10 +19,17 @@ */ #include +#include +#include #include #include #include +const u8 uuid_le_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_le_index); +const u8 uuid_be_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_be_index); + /*************************************************************** * Random UUID interface * @@ -65,3 +72,61 @@ void uuid_be_gen(uuid_be *bu) bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_be_gen); + +/** + * uuid_is_valid - checks if UUID string valid + * @uuid: UUID string to check + * + * Description: + * It checks if the UUID string is following the format: + * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + * where x is a hex digit. + * + * Return: true if input is valid UUID string.
+ */ +bool uuid_is_valid(const char *uuid) +{ + unsigned int i; + + for (i = 0; i < UUID_STRING_LEN; i++) { + if (i == 8 || i == 13 || i == 18 || i == 23) { + if (uuid[i] != '-') + return false; + } else if (!isxdigit(uuid[i])) { + return false; + } + } + + return true; +} +EXPORT_SYMBOL(uuid_is_valid); + +static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16]) +{ + static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; + unsigned int i; + + if (!uuid_is_valid(uuid)) + return -EINVAL; + + for (i = 0; i < 16; i++) { + int hi = hex_to_bin(uuid[si[i] + 0]); + int lo = hex_to_bin(uuid[si[i] + 1]); + + b[ei[i]] = (hi << 4) | lo; + } + + return 0; +} + +int uuid_le_to_bin(const char *uuid, uuid_le *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_le_index); +} +EXPORT_SYMBOL(uuid_le_to_bin); + +int uuid_be_to_bin(const char *uuid, uuid_be *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_be_index); +} +EXPORT_SYMBOL(uuid_be_to_bin); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index be0e7cf11e48..0967771d8f7f 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #ifdef CONFIG_BLOCK #include @@ -1304,19 +1305,17 @@ static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { - char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; + char uuid[UUID_STRING_LEN + 1]; char *p = uuid; int i; - static const u8 be[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; - static const u8 le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; - const u8 *index = be; + const u8 *index = uuid_be_index; bool uc = false; switch (*(++fmt)) { case 'L': uc = true; /* fall-through */ case 'l': - index = le; + index = uuid_le_index; break; case 'B': uc = true; -- cgit v1.2.3 From e3a93bce69ad3e2c38927abe311b8cb4f17abbaf Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:07 -0700 Subject: lib/uuid.c: remove FSF address There is no point in keeping an address in the file since it's subject to change. While here, update Intel Copyright years. Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/uuid.h | 6 +----- include/uapi/linux/uuid.h | 4 ---- lib/uuid.c | 6 +----- 3 files changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uuid.h b/include/linux/uuid.h index e0b95e728a77..2d095fc60204 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -1,7 +1,7 @@ /* * UUID/GUID definition * - * Copyright (C) 2010, Intel Corp. + * Copyright (C) 2010, 2016 Intel Corp. * Huang Ying * * This program is free software; you can redistribute it and/or @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details.
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _LINUX_UUID_H_ #define _LINUX_UUID_H_ diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h index 786f0773cc33..3738e5fb6a4d 100644 --- a/include/uapi/linux/uuid.h +++ b/include/uapi/linux/uuid.h @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _UAPI_LINUX_UUID_H_ diff --git a/lib/uuid.c b/lib/uuid.c index 82787f652fbc..e116ae5fa00f 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -1,7 +1,7 @@ /* * Unified UUID/GUID definition * - * Copyright (C) 2009, Intel Corp. + * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying * * This program is free software; you can redistribute it and/or @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include -- cgit v1.2.3 From ba7e34b1bbd2722685bbc75d168672d5154d8614 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:18 -0700 Subject: include/linux/efi.h: redefine type, constant, macro from generic code Generic UUID library defines structure type, macro to define UUID, and the length of the UUID string. This patch removes duplicate data structure definition, UUID string length constant as well as macro for UUID handling. 
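To illustrate what the new typedef buys (a sketch, not from the patch; the GUID value and function names are made up), the generic UUID helpers now apply to EFI GUIDs directly:

#include <linux/efi.h>
#include <linux/uuid.h>

/* Made-up GUID, illustration only. */
static const efi_guid_t demo_guid =
	EFI_GUID(0x12345678, 0x9abc, 0xdef0,
		 0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0);

static bool is_demo_guid(const efi_guid_t *guid)
{
	/* efi_guid_t is now uuid_le, so uuid_le_cmp() works without casts. */
	return uuid_le_cmp(*guid, demo_guid) == 0;
}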
Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/efi.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index df7acb51f3cc..c2db3ca22217 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -44,17 +45,10 @@ typedef u16 efi_char16_t; /* UNICODE character */ typedef u64 efi_physical_addr_t; typedef void *efi_handle_t; - -typedef struct { - u8 b[16]; -} efi_guid_t; +typedef uuid_le efi_guid_t; #define EFI_GUID(a,b,c,d0,d1,d2,d3,d4,d5,d6,d7) \ -((efi_guid_t) \ -{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ - (b) & 0xff, ((b) >> 8) & 0xff, \ - (c) & 0xff, ((c) >> 8) & 0xff, \ - (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) /* * Generic EFI table header @@ -1117,7 +1111,7 @@ extern int efi_status_to_err(efi_status_t status); * Length of a GUID string (strlen("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee")) * not including trailing NUL */ -#define EFI_VARIABLE_GUID_LEN 36 +#define EFI_VARIABLE_GUID_LEN UUID_STRING_LEN /* * The type of search to perform when calling boottime->locate_handle -- cgit v1.2.3 From 63579785752ba7d0e842078ec6b2875367046f06 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:24 -0700 Subject: include/linux/genhd.h: move to use generic UUID library UUID library provides uuid_be type and uuid_be_to_bin() function. This substitutes open coded variant by generic library calls. Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genhd.h | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5c706765404a..359a8e4bd44d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_BLOCK @@ -93,7 +94,7 @@ struct disk_stats { * Enough for the string representation of any kind of UUID plus NULL. * EFI UUID is 36 characters. MSDOS UUID is 11 characters. 
*/ -#define PARTITION_META_INFO_UUIDLTH 37 +#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1) struct partition_meta_info { char uuid[PARTITION_META_INFO_UUIDLTH]; @@ -228,27 +229,9 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part) return NULL; } -static inline void part_pack_uuid(const u8 *uuid_str, u8 *to) -{ - int i; - for (i = 0; i < 16; ++i) { - *to++ = (hex_to_bin(*uuid_str) << 4) | - (hex_to_bin(*(uuid_str + 1))); - uuid_str += 2; - switch (i) { - case 3: - case 5: - case 7: - case 9: - uuid_str++; - continue; - } - } -} - static inline int blk_part_pack_uuid(const u8 *uuid_str, u8 *to) { - part_pack_uuid(uuid_str, to); + uuid_be_to_bin(uuid_str, (uuid_be *)to); return 0; } -- cgit v1.2.3 From e9256efcc8e390fa4fcf796a0c0b47d642d77d32 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:01:33 -0700 Subject: radix-tree: introduce radix_tree_empty Commit e61452365372 ("radix_tree: add support for multi-order entries") left the impression that the support for multiorder radix tree entries was functional. As soon as Ross tried to use it, it became apparent that my testing was completely inadequate, and it didn't even work a little bit for orders that were not a multiple of shift. This series of patches is the result of about 6 weeks of redesign, reimplementation, testing, arguing and hair-pulling. The great news is that the test-suite is now far better than it was. That's reflected in the diffstat for the test-suite alone: 12 files changed, 436 insertions(+), 28 deletions(-) The highlight for users of the tree is that the restriction on the order of inserted entries being >= RADIX_TREE_MAP_SHIFT is now gone; the radix tree now supports any order between 0 and 64. For those who are interested in how the tree works, patch 9 is probably the most interesting one as it introduces the new machinery for handling sibling entries. I've tried to be fair in attributing authorship to the person who contributed the majority of the code in each patch; Ross has been an invaluable partner in the development of this support and it's fair to say that each of us has code in every commit. I should also express my appreciation of the 0day testing. It prompted me that I was bloating the tinyconfig in an unacceptable way, and it bisected to a commit which contained a rather nasty memory-corruption bug. This patch (of 29): The irqdomain code was checking for 0 or 1 entries, not 0 entries like the comment said they were. Introduce a new helper that will actually check for an empty tree. 
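A short usage sketch (the teardown function is hypothetical; the irqdomain hunk below is the real user):

#include <linux/bug.h>
#include <linux/radix-tree.h>

static void demo_teardown(struct radix_tree_root *root)
{
	/* Shout if entries were leaked, without peeking at internals
	 * such as ->rnode or the (unreliable) ->height. */
	WARN_ON(!radix_tree_empty(root));
}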
Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 5 +++++ kernel/irq/irqdomain.c | 7 +------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 51a97ac8bfbf..83f708e5db59 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -136,6 +136,11 @@ do { \ (root)->rnode = NULL; \ } while (0) +static inline bool radix_tree_empty(struct radix_tree_root *root) +{ + return root->rnode == NULL; +} + /** * Radix-tree synchronization * diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d65f6f31a5b3..8798b6c9e945 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -139,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_tree.height); + WARN_ON(!radix_tree_empty(&domain->revmap_tree)); list_del(&domain->link); -- cgit v1.2.3 From 97d778b2de9213c7a7483dad0f533c1af9f0810f Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 20 May 2016 17:01:42 -0700 Subject: radix tree test suite: allow testing other fan-out values The defines in regression2.c are already in radix-tree.h and duplicating them in the test case makes experimenting with other values for the fan-out harder than necessary. Allow the user of the radix tree to decide what the fan-out should be rather than fixing it to 8 for non-kernel uses. Signed-off-by: Ross Zwisler Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 4 +--- tools/testing/radix-tree/linux/kernel.h | 2 ++ tools/testing/radix-tree/regression2.c | 7 ------- 3 files changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 83f708e5db59..5ce5a1e0ecc5 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -70,10 +70,8 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_MAX_TAGS 3 -#ifdef __KERNEL__ +#ifndef RADIX_TREE_MAP_SHIFT #define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ #endif #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 76a88f35fdc4..31fe2c77d7ae 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -12,6 +12,8 @@ #define CONFIG_SHMEM #define CONFIG_SWAP +#define RADIX_TREE_MAP_SHIFT 3 + #ifndef NULL #define NULL 0 #endif diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c index 5d2fa28cdca3..63bf347aaf33 100644 --- a/tools/testing/radix-tree/regression2.c +++ b/tools/testing/radix-tree/regression2.c @@ -51,13 +51,6 @@ #include "regression.h" -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 
4 : 6) -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define PAGECACHE_TAG_DIRTY 0 #define PAGECACHE_TAG_WRITEBACK 1 #define PAGECACHE_TAG_TOWRITE 2 -- cgit v1.2.3 From 6c4bd68a2962c03423a226d949caf64216d013cc Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 20 May 2016 17:01:51 -0700 Subject: radix-tree: remove unused looping macros radix_tree_for_each_chunk() and radix_tree_for_each_chunk_slot() have never been used in the kernel since their introduction in 2012, so remove them. Signed-off-by: Ross Zwisler Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 5ce5a1e0ecc5..e1512a607709 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -478,34 +478,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) return NULL; } -/** - * radix_tree_for_each_chunk - iterate over chunks - * - * @slot: the void** variable for pointer to chunk first slot - * @root: the struct radix_tree_root pointer - * @iter: the struct radix_tree_iter pointer - * @start: iteration starting index - * @flags: RADIX_TREE_ITER_* and tag index - * - * Locks can be released and reacquired between iterations. - */ -#define radix_tree_for_each_chunk(slot, root, iter, start, flags) \ - for (slot = radix_tree_iter_init(iter, start) ; \ - (slot = radix_tree_next_chunk(root, iter, flags)) ;) - -/** - * radix_tree_for_each_chunk_slot - iterate over slots in one chunk - * - * @slot: the void** variable, at the beginning points to chunk first slot - * @iter: the struct radix_tree_iter pointer - * @flags: RADIX_TREE_ITER_*, should be constant - * - * This macro is designed to be nested inside radix_tree_for_each_chunk(). - * @slot points to the radix tree slot, @iter->index contains its index. - */ -#define radix_tree_for_each_chunk_slot(slot, iter, flags) \ - for (; slot ; slot = radix_tree_next_slot(slot, iter, flags)) - /** * radix_tree_for_each_slot - iterate over non-empty slots * -- cgit v1.2.3 From 21ef533931f73a8e963a6107aa5ec51b192f28be Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 20 May 2016 17:02:26 -0700 Subject: radix-tree: add support for multi-order iterating This enables the macros radix_tree_for_each_slot() and friends to be used with multi-order entries. The way that this works is that we treat all entries in a given slots[] array as a single chunk. If the index given to radix_tree_next_chunk() happens to point us to a sibling entry, we will back up iter->index so that it points to the canonical entry, and that will be the place where we start our iteration. As we're processing a chunk in radix_tree_next_slot(), we process canonical entries, skip over sibling entries, and restart the chunk lookup if we find a non-sibling indirect pointer. This drops back to the radix_tree_next_chunk() code, which will re-walk the tree and look for another chunk. This allows us to properly handle multi-order entries mixed with other entries that are at various heights in the radix tree. 
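From the caller's side nothing changes. A sketch (function name made up), assuming a tree where index 0 holds an order-2 (four-slot) entry and index 4 a normal one: the loop body then runs exactly twice, at iter.index == 0 and iter.index == 4, with the sibling slots skipped transparently:

#include <linux/printk.h>
#include <linux/radix-tree.h>

static void dump_entries(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	void **slot;

	/* Multi-order entries are visited once, at their canonical index. */
	radix_tree_for_each_slot(slot, root, &iter, 0)
		pr_info("index %lu -> entry %p\n", iter.index, *slot);
}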
Signed-off-by: Ross Zwisler Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 69 +++++++++++++++++++++++---- lib/radix-tree.c | 66 ++++++++++++++----------- tools/testing/radix-tree/generated/autoconf.h | 3 ++ tools/testing/radix-tree/linux/kernel.h | 5 +- 4 files changed, 102 insertions(+), 41 deletions(-) create mode 100644 tools/testing/radix-tree/generated/autoconf.h (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index e1512a607709..8558d52e1c7b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -330,8 +330,9 @@ static inline void radix_tree_preload_end(void) * struct radix_tree_iter - radix tree iterator state * * @index: index of current slot - * @next_index: next-to-last index for this chunk + * @next_index: one beyond the last index for this chunk * @tags: bit-mask for tag-iterating + * @shift: shift for the node that holds our slots * * This radix tree iterator works in terms of "chunks" of slots. A chunk is a * subinterval of slots contained within one radix tree leaf node. It is @@ -344,8 +345,20 @@ struct radix_tree_iter { unsigned long index; unsigned long next_index; unsigned long tags; +#ifdef CONFIG_RADIX_TREE_MULTIORDER + unsigned int shift; +#endif }; +static inline unsigned int iter_shift(struct radix_tree_iter *iter) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + return iter->shift; +#else + return 0; +#endif +} + #define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ #define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ #define RADIX_TREE_ITER_CONTIG 0x0200 /* stop at first hole */ @@ -405,6 +418,12 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter) return NULL; } +static inline unsigned long +__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) +{ + return iter->index + (slots << iter_shift(iter)); +} + /** * radix_tree_iter_next - resume iterating when the chunk may be invalid * @iter: iterator state @@ -416,7 +435,7 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter) static inline __must_check void **radix_tree_iter_next(struct radix_tree_iter *iter) { - iter->next_index = iter->index + 1; + iter->next_index = __radix_tree_iter_add(iter, 1); iter->tags = 0; return NULL; } @@ -430,7 +449,12 @@ void **radix_tree_iter_next(struct radix_tree_iter *iter) static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { - return iter->next_index - iter->index; + return (iter->next_index - iter->index) >> iter_shift(iter); +} + +static inline void *indirect_to_ptr(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); } /** @@ -448,24 +472,51 @@ static __always_inline void ** radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) { if (flags & RADIX_TREE_ITER_TAGGED) { + void *canon = slot; + iter->tags >>= 1; + if (unlikely(!iter->tags)) + return NULL; + while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && + radix_tree_is_indirect_ptr(slot[1])) { + if (indirect_to_ptr(slot[1]) == canon) { + iter->tags >>= 1; + iter->index = __radix_tree_iter_add(iter, 1); + slot++; + continue; + } + iter->next_index = __radix_tree_iter_add(iter, 1); + return NULL; + } if (likely(iter->tags & 1ul)) { - iter->index++; + iter->index = __radix_tree_iter_add(iter, 1); return slot + 1; } - if (!(flags & RADIX_TREE_ITER_CONTIG) && likely(iter->tags)) { + 
if (!(flags & RADIX_TREE_ITER_CONTIG)) { unsigned offset = __ffs(iter->tags); iter->tags >>= offset; - iter->index += offset + 1; + iter->index = __radix_tree_iter_add(iter, offset + 1); return slot + offset + 1; } } else { - long size = radix_tree_chunk_size(iter); + long count = radix_tree_chunk_size(iter); + void *canon = slot; - while (--size > 0) { + while (--count > 0) { slot++; - iter->index++; + iter->index = __radix_tree_iter_add(iter, 1); + + if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && + radix_tree_is_indirect_ptr(*slot)) { + if (indirect_to_ptr(*slot) == canon) + continue; + else { + iter->next_index = iter->index; + break; + } + } + if (likely(*slot)) return slot; if (flags & RADIX_TREE_ITER_CONTIG) { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index ff460423ff4b..a4da86e40def 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -75,11 +75,6 @@ static inline void *ptr_to_indirect(void *ptr) return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); } -static inline void *indirect_to_ptr(void *ptr) -{ - return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); -} - #define RADIX_TREE_RETRY ptr_to_indirect(NULL) #ifdef CONFIG_RADIX_TREE_MULTIORDER @@ -885,6 +880,14 @@ int radix_tree_tag_get(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_get); +static inline void __set_iter_shift(struct radix_tree_iter *iter, + unsigned int shift) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + iter->shift = shift; +#endif +} + /** * radix_tree_next_chunk - find next chunk of slots for iteration * @@ -898,7 +901,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, { unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK; struct radix_tree_node *rnode, *node; - unsigned long index, offset, height; + unsigned long index, offset, maxindex; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; @@ -916,33 +919,39 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if (!index && iter->index) return NULL; - rnode = rcu_dereference_raw(root->rnode); + restart: + shift = radix_tree_load_root(root, &rnode, &maxindex); + if (index > maxindex) + return NULL; + if (radix_tree_is_indirect_ptr(rnode)) { rnode = indirect_to_ptr(rnode); - } else if (rnode && !index) { + } else if (rnode) { /* Single-slot tree */ - iter->index = 0; - iter->next_index = 1; + iter->index = index; + iter->next_index = maxindex + 1; iter->tags = 1; + __set_iter_shift(iter, shift); return (void **)&root->rnode; } else return NULL; -restart: - height = rnode->path & RADIX_TREE_HEIGHT_MASK; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + shift -= RADIX_TREE_MAP_SHIFT; offset = index >> shift; - /* Index outside of the tree */ - if (offset >= RADIX_TREE_MAP_SIZE) - return NULL; - node = rnode; while (1) { struct radix_tree_node *slot; + unsigned new_off = radix_tree_descend(node, &slot, offset); + + if (new_off < offset) { + offset = new_off; + index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1); + index |= offset << shift; + } + if ((flags & RADIX_TREE_ITER_TAGGED) ? 
- !test_bit(offset, node->tags[tag]) : - !node->slots[offset]) { + !tag_get(node, tag, offset) : !slot) { /* Hole detected */ if (flags & RADIX_TREE_ITER_CONTIG) return NULL; @@ -954,7 +963,10 @@ restart: offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { - if (node->slots[offset]) + void *slot = node->slots[offset]; + if (is_sibling_entry(node, slot)) + continue; + if (slot) break; } index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1); @@ -964,25 +976,23 @@ restart: return NULL; if (offset == RADIX_TREE_MAP_SIZE) goto restart; + slot = rcu_dereference_raw(node->slots[offset]); } - /* This is leaf-node */ - if (!shift) - break; - - slot = rcu_dereference_raw(node->slots[offset]); - if (slot == NULL) + if ((slot == NULL) || (slot == RADIX_TREE_RETRY)) goto restart; if (!radix_tree_is_indirect_ptr(slot)) break; + node = indirect_to_ptr(slot); shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; } /* Update the iterator state */ - iter->index = index; - iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1; + iter->index = index & ~((1 << shift) - 1); + iter->next_index = (index | ((RADIX_TREE_MAP_SIZE << shift) - 1)) + 1; + __set_iter_shift(iter, shift); /* Construct iter->tags bit-mask from node->tags[tag] array */ if (flags & RADIX_TREE_ITER_TAGGED) { diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h new file mode 100644 index 000000000000..ad18cf5a2a3a --- /dev/null +++ b/tools/testing/radix-tree/generated/autoconf.h @@ -0,0 +1,3 @@ +#define CONFIG_RADIX_TREE_MULTIORDER 1 +#define CONFIG_SHMEM 1 +#define CONFIG_SWAP 1 diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 8ea0ed450810..be98a47b4e1b 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -8,10 +8,7 @@ #include #include "../../include/linux/compiler.h" - -#define CONFIG_RADIX_TREE_MULTIORDER -#define CONFIG_SHMEM -#define CONFIG_SWAP +#include "../../../include/linux/kconfig.h" #define RADIX_TREE_MAP_SHIFT 3 -- cgit v1.2.3 From 0c7fa0a8418cbe0e8963fe36db9575d03b8589f7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:07 -0700 Subject: radix-tree: split node->path into offset and height Neither piece of information we're storing in node->path can be larger than 64, so store each in its own unsigned char instead of shifting and masking to store them both in an unsigned int. 
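The effect on readers of these fields, as a before/after sketch (illustrative helper names; the two halves obviously compile against the pre- and post-patch header respectively, not both at once):

/* Before: both values unpacked from node->path. */
static unsigned int node_height_old(const struct radix_tree_node *node)
{
	return node->path & RADIX_TREE_HEIGHT_MASK;
}

static unsigned int node_offset_old(const struct radix_tree_node *node)
{
	return node->path >> RADIX_TREE_HEIGHT_SHIFT;
}

/* After: plain byte-sized fields, no shifting or masking. */
static unsigned int node_height_new(const struct radix_tree_node *node)
{
	return node->height;
}

static unsigned int node_offset_new(const struct radix_tree_node *node)
{
	return node->offset;
}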
Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 7 ++----- lib/radix-tree.c | 38 +++++++++++++++++--------------------- 2 files changed, 19 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 8558d52e1c7b..2d2ad9d685a3 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -84,16 +84,13 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) -/* Height component in node->path */ -#define RADIX_TREE_HEIGHT_SHIFT (RADIX_TREE_MAX_PATH + 1) -#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1) - /* Internally used bits of node->count */ #define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) #define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) struct radix_tree_node { - unsigned int path; /* Offset in parent & height from the bottom */ + unsigned char height; /* From the bottom */ + unsigned char offset; /* Slot offset in parent */ unsigned int count; union { struct { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 75944e42e4a0..dd04b51e5fbb 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -218,15 +218,15 @@ radix_tree_find_next_bit(const unsigned long *addr, } #ifndef __KERNEL__ -static void dump_node(struct radix_tree_node *node, unsigned offset, +static void dump_node(struct radix_tree_node *node, unsigned shift, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n", - node, offset, + pr_debug("radix node: %p offset %d tags %lx %lx %lx height %d count %d parent %p\n", + node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->path, node->count, node->parent); + node->height, node->count, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << shift); @@ -243,7 +243,7 @@ static void dump_node(struct radix_tree_node *node, unsigned offset, pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { - dump_node(indirect_to_ptr(entry), i, + dump_node(indirect_to_ptr(entry), shift - RADIX_TREE_MAP_SHIFT, first); } } @@ -257,7 +257,7 @@ static void radix_tree_dump(struct radix_tree_root *root) root->gfp_mask >> __GFP_BITS_SHIFT); if (!radix_tree_is_indirect_ptr(root->rnode)) return; - dump_node(indirect_to_ptr(root->rnode), 0, + dump_node(indirect_to_ptr(root->rnode), (root->height - 1) * RADIX_TREE_MAP_SHIFT, 0); } #endif @@ -421,7 +421,7 @@ static inline unsigned long radix_tree_maxindex(unsigned int height) static inline unsigned long node_maxindex(struct radix_tree_node *node) { - return radix_tree_maxindex(node->path & RADIX_TREE_HEIGHT_MASK); + return radix_tree_maxindex(node->height); } static unsigned radix_tree_load_root(struct radix_tree_root *root, @@ -434,8 +434,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, if (likely(radix_tree_is_indirect_ptr(node))) { node = indirect_to_ptr(node); *maxindex = node_maxindex(node); - return (node->path & RADIX_TREE_HEIGHT_MASK) * - RADIX_TREE_MAP_SHIFT; + return node->height * RADIX_TREE_MAP_SHIFT; } *maxindex = 0; @@ -476,9 +475,10 @@ static int radix_tree_extend(struct radix_tree_root *root, } /* Increase the height. 
*/ - newheight = root->height+1; - BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK); - node->path = newheight; + newheight = root->height + 1; + BUG_ON(newheight > BITS_PER_LONG); + node->height = newheight; + node->offset = 0; node->count = 1; node->parent = NULL; slot = root->rnode; @@ -546,13 +546,13 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, slot = radix_tree_node_alloc(root); if (!slot) return -ENOMEM; - slot->path = height; + slot->height = height; + slot->offset = offset; slot->parent = node; if (node) { rcu_assign_pointer(node->slots[offset], ptr_to_indirect(slot)); node->count++; - slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT; } else rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); @@ -1319,11 +1319,10 @@ struct locate_info { static unsigned long __locate(struct radix_tree_node *slot, void *item, unsigned long index, struct locate_info *info) { - unsigned int shift, height; + unsigned int shift; unsigned long i; - height = slot->path & RADIX_TREE_HEIGHT_MASK; - shift = height * RADIX_TREE_MAP_SHIFT; + shift = slot->height * RADIX_TREE_MAP_SHIFT; do { shift -= RADIX_TREE_MAP_SHIFT; @@ -1508,10 +1507,7 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, parent = node->parent; if (parent) { - unsigned int offset; - - offset = node->path >> RADIX_TREE_HEIGHT_SHIFT; - parent->slots[offset] = NULL; + parent->slots[node->offset] = NULL; parent->count--; } else { root_tag_clear_all(root); -- cgit v1.2.3 From c12e51b07b3ac4c188fd91a82f96840fdb9cca6f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:10 -0700 Subject: radix-tree: replace node->height with node->shift node->shift represents the shift necessary for looking in the slots array at this level. It is equal to the old (node->height - 1) * RADIX_TREE_MAP_SHIFT. 
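A worked example, assuming RADIX_TREE_MAP_SHIFT == 6: the top node of a three-level tree (old height 3) gets shift 12, the middle level shift 6, and a leaf (old height 1) shift 0, so the per-level slot lookup collapses to one expression (a sketch with a made-up helper name, matching the lookups in the diff below):

#include <linux/radix-tree.h>

static unsigned int slot_offset(const struct radix_tree_node *node,
				unsigned long index)
{
	/* node->shift == (old height - 1) * RADIX_TREE_MAP_SHIFT */
	return (index >> node->shift) & RADIX_TREE_MAP_MASK;
}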
Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 2 +- lib/radix-tree.c | 30 ++++++++++++++++-------------- 2 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 2d2ad9d685a3..037458257e12 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -89,7 +89,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) struct radix_tree_node { - unsigned char height; /* From the bottom */ + unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned int count; union { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index dd04b51e5fbb..648da9080418 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -223,10 +223,10 @@ static void dump_node(struct radix_tree_node *node, { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx height %d count %d parent %p\n", + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->height, node->count, node->parent); + node->shift, node->count, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << shift); @@ -419,9 +419,14 @@ static inline unsigned long radix_tree_maxindex(unsigned int height) return height_to_maxindex[height]; } +static inline unsigned long shift_maxindex(unsigned int shift) +{ + return (RADIX_TREE_MAP_SIZE << shift) - 1; +} + static inline unsigned long node_maxindex(struct radix_tree_node *node) { - return radix_tree_maxindex(node->height); + return shift_maxindex(node->shift); } static unsigned radix_tree_load_root(struct radix_tree_root *root, @@ -434,7 +439,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, if (likely(radix_tree_is_indirect_ptr(node))) { node = indirect_to_ptr(node); *maxindex = node_maxindex(node); - return node->height * RADIX_TREE_MAP_SHIFT; + return node->shift + RADIX_TREE_MAP_SHIFT; } *maxindex = 0; @@ -475,9 +480,9 @@ static int radix_tree_extend(struct radix_tree_root *root, } /* Increase the height. 
*/ - newheight = root->height + 1; + newheight = root->height; BUG_ON(newheight > BITS_PER_LONG); - node->height = newheight; + node->shift = newheight * RADIX_TREE_MAP_SHIFT; node->offset = 0; node->count = 1; node->parent = NULL; @@ -490,7 +495,7 @@ static int radix_tree_extend(struct radix_tree_root *root, node->slots[0] = slot; node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); - root->height = newheight; + root->height = ++newheight; } while (height > root->height); out: return height * RADIX_TREE_MAP_SHIFT; @@ -519,7 +524,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, { struct radix_tree_node *node = NULL, *slot; unsigned long maxindex; - unsigned int height, shift, offset; + unsigned int shift, offset; unsigned long max = index | ((1UL << order) - 1); shift = radix_tree_load_root(root, &slot, &maxindex); @@ -537,16 +542,15 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, } } - height = root->height; - offset = 0; /* uninitialised var warning */ while (shift > order) { + shift -= RADIX_TREE_MAP_SHIFT; if (slot == NULL) { /* Have to add a child node. */ slot = radix_tree_node_alloc(root); if (!slot) return -ENOMEM; - slot->height = height; + slot->shift = shift; slot->offset = offset; slot->parent = node; if (node) { @@ -560,8 +564,6 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, break; /* Go a level down */ - height--; - shift -= RADIX_TREE_MAP_SHIFT; node = indirect_to_ptr(slot); offset = (index >> shift) & RADIX_TREE_MAP_MASK; offset = radix_tree_descend(node, &slot, offset); @@ -1322,7 +1324,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, unsigned int shift; unsigned long i; - shift = slot->height * RADIX_TREE_MAP_SHIFT; + shift = slot->shift + RADIX_TREE_MAP_SHIFT; do { shift -= RADIX_TREE_MAP_SHIFT; -- cgit v1.2.3 From d0891265bbc988dc91ed8580b38eb3dac128581b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:19 -0700 Subject: radix-tree: remove root->height The only remaining references to root->height were in extend and shrink, where it was updated. Now we can remove it entirely. Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 3 -- lib/radix-tree.c | 106 +++++++++++++-------------------------------- 2 files changed, 31 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 037458257e12..c0d223cfac00 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -110,13 +110,11 @@ struct radix_tree_node { /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { - unsigned int height; gfp_t gfp_mask; struct radix_tree_node __rcu *rnode; }; #define RADIX_TREE_INIT(mask) { \ - .height = 0, \ .gfp_mask = (mask), \ .rnode = NULL, \ } @@ -126,7 +124,6 @@ struct radix_tree_root { #define INIT_RADIX_TREE(root, mask) \ do { \ - (root)->height = 0; \ (root)->gfp_mask = (mask); \ (root)->rnode = NULL; \ } while (0) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 75c9e6197b5b..58f79fee8c71 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -38,12 +38,6 @@ #include /* in_interrupt() */ -/* - * The height_to_maxindex array needs to be one deeper than the maximum - * path as height 0 holds only 1 entry. 
- */ -static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; - /* * Radix tree node cache. */ @@ -218,8 +212,7 @@ radix_tree_find_next_bit(const unsigned long *addr, } #ifndef __KERNEL__ -static void dump_node(struct radix_tree_node *node, - unsigned shift, unsigned long index) +static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; @@ -229,8 +222,8 @@ static void dump_node(struct radix_tree_node *node, node->shift, node->count, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - unsigned long first = index | (i << shift); - unsigned long last = first | ((1UL << shift) - 1); + unsigned long first = index | (i << node->shift); + unsigned long last = first | ((1UL << node->shift) - 1); void *entry = node->slots[i]; if (!entry) continue; @@ -243,8 +236,7 @@ static void dump_node(struct radix_tree_node *node, pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { - dump_node(indirect_to_ptr(entry), - shift - RADIX_TREE_MAP_SHIFT, first); + dump_node(indirect_to_ptr(entry), first); } } } @@ -252,13 +244,12 @@ static void dump_node(struct radix_tree_node *node, /* For debug */ static void radix_tree_dump(struct radix_tree_root *root) { - pr_debug("radix root: %p height %d rnode %p tags %x\n", - root, root->height, root->rnode, + pr_debug("radix root: %p rnode %p tags %x\n", + root, root->rnode, root->gfp_mask >> __GFP_BITS_SHIFT); if (!radix_tree_is_indirect_ptr(root->rnode)) return; - dump_node(indirect_to_ptr(root->rnode), - (root->height - 1) * RADIX_TREE_MAP_SHIFT, 0); + dump_node(indirect_to_ptr(root->rnode), 0); } #endif @@ -411,14 +402,8 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) EXPORT_SYMBOL(radix_tree_maybe_preload); /* - * Return the maximum key which can be store into a - * radix tree with height HEIGHT. + * The maximum index which can be stored in a radix tree */ -static inline unsigned long radix_tree_maxindex(unsigned int height) -{ - return height_to_maxindex[height]; -} - static inline unsigned long shift_maxindex(unsigned int shift) { return (RADIX_TREE_MAP_SIZE << shift) - 1; @@ -450,24 +435,22 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, * Extend a radix tree so it can store key @index. */ static int radix_tree_extend(struct radix_tree_root *root, - unsigned long index) + unsigned long index, unsigned int shift) { struct radix_tree_node *slot; - unsigned int height; + unsigned int maxshift; int tag; - /* Figure out what the height should be. */ - height = root->height + 1; - while (index > radix_tree_maxindex(height)) - height++; + /* Figure out what the shift should be. */ + maxshift = shift; + while (index > shift_maxindex(maxshift)) + maxshift += RADIX_TREE_MAP_SHIFT; - if (root->rnode == NULL) { - root->height = height; + slot = root->rnode; + if (!slot) goto out; - } do { - unsigned int newheight; struct radix_tree_node *node = radix_tree_node_alloc(root); if (!node) @@ -479,14 +462,11 @@ static int radix_tree_extend(struct radix_tree_root *root, tag_set(node, tag, 0); } - /* Increase the height. 
*/ - newheight = root->height; - BUG_ON(newheight > BITS_PER_LONG); - node->shift = newheight * RADIX_TREE_MAP_SHIFT; + BUG_ON(shift > BITS_PER_LONG); + node->shift = shift; node->offset = 0; node->count = 1; node->parent = NULL; - slot = root->rnode; if (radix_tree_is_indirect_ptr(slot)) { slot = indirect_to_ptr(slot); slot->parent = node; @@ -495,10 +475,11 @@ static int radix_tree_extend(struct radix_tree_root *root, node->slots[0] = slot; node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); - root->height = ++newheight; - } while (height > root->height); + shift += RADIX_TREE_MAP_SHIFT; + slot = node; + } while (shift <= maxshift); out: - return height * RADIX_TREE_MAP_SHIFT; + return maxshift + RADIX_TREE_MAP_SHIFT; } /** @@ -531,15 +512,13 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, /* Make sure the tree is high enough. */ if (max > maxindex) { - int error = radix_tree_extend(root, max); + int error = radix_tree_extend(root, max, shift); if (error < 0) return error; shift = error; slot = root->rnode; - if (order == shift) { + if (order == shift) shift += RADIX_TREE_MAP_SHIFT; - root->height++; - } } offset = 0; /* uninitialised var warning */ @@ -1412,32 +1391,32 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) #endif /* CONFIG_SHMEM && CONFIG_SWAP */ /** - * radix_tree_shrink - shrink height of a radix tree to minimal + * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ static inline bool radix_tree_shrink(struct radix_tree_root *root) { bool shrunk = false; - /* try to shrink tree height */ - while (root->height > 0) { + for (;;) { struct radix_tree_node *to_free = root->rnode; struct radix_tree_node *slot; - BUG_ON(!radix_tree_is_indirect_ptr(to_free)); + if (!radix_tree_is_indirect_ptr(to_free)) + break; to_free = indirect_to_ptr(to_free); /* * The candidate node has more than one child, or its child - * is not at the leftmost slot, or it is a multiorder entry, - * we cannot shrink. + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. */ if (to_free->count != 1) break; slot = to_free->slots[0]; if (!slot) break; - if (!radix_tree_is_indirect_ptr(slot) && (root->height > 1)) + if (!radix_tree_is_indirect_ptr(slot) && to_free->shift) break; if (radix_tree_is_indirect_ptr(slot)) { @@ -1454,7 +1433,6 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * one (root->rnode) as far as dependent read barriers go. */ root->rnode = slot; - root->height--; /* * We have a dilemma here. 
The node's slot[0] must not be @@ -1515,7 +1493,6 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, parent->count--; } else { root_tag_clear_all(root); - root->height = 0; root->rnode = NULL; } @@ -1631,26 +1608,6 @@ radix_tree_node_ctor(void *arg) INIT_LIST_HEAD(&node->private_list); } -static __init unsigned long __maxindex(unsigned int height) -{ - unsigned int width = height * RADIX_TREE_MAP_SHIFT; - int shift = RADIX_TREE_INDEX_BITS - width; - - if (shift < 0) - return ~0UL; - if (shift >= BITS_PER_LONG) - return 0UL; - return ~0UL >> shift; -} - -static __init void radix_tree_init_maxindex(void) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) - height_to_maxindex[i] = __maxindex(i); -} - static int radix_tree_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1677,6 +1634,5 @@ void __init radix_tree_init(void) sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); - radix_tree_init_maxindex(); hotcpu_notifier(radix_tree_callback, 0); } -- cgit v1.2.3 From 30ff46ccb303fb6f6c28b9aa9f2cdc4ba900ed3f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:22 -0700 Subject: radix-tree: rename INDIRECT_PTR to INTERNAL_NODE The name RADIX_TREE_INDIRECT_PTR doesn't really match the meaning. RADIX_TREE_INTERNAL_NODE is a better name. Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 30 +++++++++++++----------------- lib/radix-tree.c | 2 +- 2 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c0d223cfac00..c8cc879046c7 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -29,20 +29,16 @@ #include /* - * An indirect pointer (root->rnode pointing to a radix_tree_node, rather - * than a data item) is signalled by the low bit set in the root->rnode - * pointer. - * - * In this case root->height is > 0, but the indirect pointer tests are - * needed for RCU lookups (because root->height is unreliable). The only - * time callers need worry about this is when doing a lookup_slot under - * RCU. - * - * Indirect pointer in fact is also used to tag the last pointer of a node - * when it is shrunk, before we rcu free the node. See shrink code for - * details. + * Entries in the radix tree have the low bit set if they refer to a + * radix_tree_node. If the low bit is clear then the entry is user data. + * + * We also use the low bit to indicate that the slot will be freed in the + * next RCU idle period, and users need to re-walk the tree to find the + * new slot for the index that they were looking for. See the comment in + * radix_tree_shrink() for details. 
*/ -#define RADIX_TREE_INDIRECT_PTR 1 +#define RADIX_TREE_INTERNAL_NODE 1 + /* * A common use of the radix tree is to store pointers to struct pages; * but shmem/tmpfs needs also to store swap entries in the same tree: @@ -63,7 +59,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) { - return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); + return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); } /*** radix-tree API starts here ***/ @@ -228,7 +224,7 @@ static inline void *radix_tree_deref_slot_protected(void **pslot, */ static inline int radix_tree_deref_retry(void *arg) { - return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR); + return unlikely(radix_tree_is_indirect_ptr(arg)); } /** @@ -250,7 +246,7 @@ static inline int radix_tree_exceptional_entry(void *arg) static inline int radix_tree_exception(void *arg) { return unlikely((unsigned long)arg & - (RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY)); + (RADIX_TREE_INTERNAL_NODE | RADIX_TREE_EXCEPTIONAL_ENTRY)); } /** @@ -448,7 +444,7 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) static inline void *indirect_to_ptr(void *ptr) { - return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); + return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } /** diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 58f79fee8c71..31d5929a625b 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -68,7 +68,7 @@ static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; static inline void *ptr_to_indirect(void *ptr) { - return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); + return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } #define RADIX_TREE_RETRY ptr_to_indirect(NULL) -- cgit v1.2.3 From 4dd6c0987ca43d6544f4f0a3f86f6ea3bfc60fc1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:27 -0700 Subject: radix-tree: rename indirect_to_ptr() to entry_to_node() Mirrors the earlier commit introducing node_to_entry(). Also change the type returned to be a struct radix_tree_node pointer. That lets us simplify a couple of places in the radix tree shrink & extend paths where we could convert an entry into a pointer, modify the node, then convert the pointer back into an entry. 
Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 12 +++++------ lib/radix-tree.c | 48 ++++++++++++++++++----------------------- tools/testing/radix-tree/test.c | 4 ++-- tools/testing/radix-tree/test.h | 1 - 4 files changed, 28 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c8cc879046c7..b94aa198dd6b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -442,7 +442,7 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) return (iter->next_index - iter->index) >> iter_shift(iter); } -static inline void *indirect_to_ptr(void *ptr) +static inline struct radix_tree_node *entry_to_node(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } @@ -469,7 +469,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) return NULL; while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && radix_tree_is_indirect_ptr(slot[1])) { - if (indirect_to_ptr(slot[1]) == canon) { + if (entry_to_node(slot[1]) == canon) { iter->tags >>= 1; iter->index = __radix_tree_iter_add(iter, 1); slot++; @@ -499,12 +499,10 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && radix_tree_is_indirect_ptr(*slot)) { - if (indirect_to_ptr(*slot) == canon) + if (entry_to_node(*slot) == canon) continue; - else { - iter->next_index = iter->index; - break; - } + iter->next_index = iter->index; + break; } if (likely(*slot)) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index f66bb3932452..3c3fdd9c5bb3 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -230,13 +230,13 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) if (is_sibling_entry(node, entry)) { pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", entry, i, - *(void **)indirect_to_ptr(entry), + *(void **)entry_to_node(entry), first, last); } else if (!radix_tree_is_indirect_ptr(entry)) { pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { - dump_node(indirect_to_ptr(entry), first); + dump_node(entry_to_node(entry), first); } } } @@ -249,7 +249,7 @@ static void radix_tree_dump(struct radix_tree_root *root) root->gfp_mask >> __GFP_BITS_SHIFT); if (!radix_tree_is_indirect_ptr(root->rnode)) return; - dump_node(indirect_to_ptr(root->rnode), 0); + dump_node(entry_to_node(root->rnode), 0); } #endif @@ -422,7 +422,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, *nodep = node; if (likely(radix_tree_is_indirect_ptr(node))) { - node = indirect_to_ptr(node); + node = entry_to_node(node); *maxindex = node_maxindex(node); return node->shift + RADIX_TREE_MAP_SHIFT; } @@ -467,11 +467,8 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_indirect_ptr(slot)) { - slot = indirect_to_ptr(slot); - slot->parent = node; - slot = node_to_entry(slot); - } + if (radix_tree_is_indirect_ptr(slot)) + entry_to_node(slot)->parent = node; node->slots[0] = slot; slot = node_to_entry(node); rcu_assign_pointer(root->rnode, slot); @@ -542,7 +539,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, break; /* Go a level down */ - node = indirect_to_ptr(slot); + node = entry_to_node(slot); offset = (index >> shift) & 
RADIX_TREE_MAP_MASK; offset = radix_tree_descend(node, &slot, offset); } @@ -645,7 +642,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, if (node == RADIX_TREE_RETRY) goto restart; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; offset = radix_tree_descend(parent, &node, offset); @@ -729,7 +726,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root, shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, offset); BUG_ON(!node); @@ -777,7 +774,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, offset); } @@ -844,7 +841,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, offset); if (!node) @@ -904,7 +901,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, return NULL; if (radix_tree_is_indirect_ptr(rnode)) { - rnode = indirect_to_ptr(rnode); + rnode = entry_to_node(rnode); } else if (rnode) { /* Single-slot tree */ iter->index = index; @@ -963,7 +960,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if (!radix_tree_is_indirect_ptr(slot)) break; - node = indirect_to_ptr(slot); + node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; } @@ -1048,7 +1045,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, return 1; } - node = indirect_to_ptr(slot); + node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; for (;;) { @@ -1063,7 +1060,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, goto next; /* Sibling slots never have tags set on them */ if (radix_tree_is_indirect_ptr(slot)) { - node = indirect_to_ptr(slot); + node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; continue; } @@ -1322,7 +1319,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, } continue; } - node = indirect_to_ptr(node); + node = entry_to_node(node); if (is_sibling_entry(slot, node)) continue; slot = node; @@ -1367,7 +1364,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) break; } - node = indirect_to_ptr(node); + node = entry_to_node(node); max_index = node_maxindex(node); if (cur_index > max_index) { @@ -1403,7 +1400,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) if (!radix_tree_is_indirect_ptr(to_free)) break; - to_free = indirect_to_ptr(to_free); + to_free = entry_to_node(to_free); /* * The candidate node has more than one child, or its child @@ -1418,11 +1415,8 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) if (!radix_tree_is_indirect_ptr(slot) && to_free->shift) break; - if (radix_tree_is_indirect_ptr(slot)) { - slot = indirect_to_ptr(slot); - slot->parent = NULL; - slot = node_to_entry(slot); - } + if (radix_tree_is_indirect_ptr(slot)) + entry_to_node(slot)->parent = NULL; /* * We don't need rcu_assign_pointer(), since we are simply @@ -1481,7 +1475,7 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, struct 
radix_tree_node *parent; if (node->count) { - if (node == indirect_to_ptr(root->rnode)) + if (node == entry_to_node(root->rnode)) deleted |= radix_tree_shrink(root); return deleted; } diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 3004c58b9021..7b0bc1fa5919 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -149,7 +149,7 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag, int i; int j; - slot = indirect_to_ptr(slot); + slot = entry_to_node(slot); /* Verify consistency at this level */ for (i = 0; i < RADIX_TREE_TAG_LONGS; i++) { @@ -227,7 +227,7 @@ void tree_verify_min_height(struct radix_tree_root *root, int maxindex) return; } - node = indirect_to_ptr(node); + node = entry_to_node(node); assert(maxindex <= node_maxindex(node)); shift = node->shift; diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 866c8c676aa4..e85131369723 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -39,7 +39,6 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag); extern int nr_allocated; /* Normally private parts of lib/radix-tree.c */ -void *indirect_to_ptr(void *ptr); void radix_tree_dump(struct radix_tree_root *root); int root_tag_get(struct radix_tree_root *root, unsigned int tag); unsigned long node_maxindex(struct radix_tree_node *); -- cgit v1.2.3 From b194d16c27af905d6e3552f4851bc7d9fee4e90f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:30 -0700 Subject: radix-tree: rename radix_tree_is_indirect_ptr() As with indirect_to_ptr(), ptr_to_indirect() and RADIX_TREE_INDIRECT_PTR, change radix_tree_is_indirect_ptr() to radix_tree_is_internal_node(). Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 10 ++++----- lib/radix-tree.c | 48 ++++++++++++++++++++--------------------- tools/testing/radix-tree/test.c | 4 ++-- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index b94aa198dd6b..bad63105e37e 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -57,7 +57,7 @@ #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ RADIX_DAX_SHIFT | (pmd ? 
RADIX_DAX_PMD : RADIX_DAX_PTE))) -static inline int radix_tree_is_indirect_ptr(void *ptr) +static inline int radix_tree_is_internal_node(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); } @@ -224,7 +224,7 @@ static inline void *radix_tree_deref_slot_protected(void **pslot, */ static inline int radix_tree_deref_retry(void *arg) { - return unlikely(radix_tree_is_indirect_ptr(arg)); + return unlikely(radix_tree_is_internal_node(arg)); } /** @@ -259,7 +259,7 @@ static inline int radix_tree_exception(void *arg) */ static inline void radix_tree_replace_slot(void **pslot, void *item) { - BUG_ON(radix_tree_is_indirect_ptr(item)); + BUG_ON(radix_tree_is_internal_node(item)); rcu_assign_pointer(*pslot, item); } @@ -468,7 +468,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) if (unlikely(!iter->tags)) return NULL; while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_indirect_ptr(slot[1])) { + radix_tree_is_internal_node(slot[1])) { if (entry_to_node(slot[1]) == canon) { iter->tags >>= 1; iter->index = __radix_tree_iter_add(iter, 1); @@ -498,7 +498,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) iter->index = __radix_tree_iter_add(iter, 1); if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_indirect_ptr(*slot)) { + radix_tree_is_internal_node(*slot)) { if (entry_to_node(*slot) == canon) continue; iter->next_index = iter->index; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 3c3fdd9c5bb3..b65c83036ca4 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -100,7 +100,7 @@ static unsigned radix_tree_descend(struct radix_tree_node *parent, void **entry = rcu_dereference_raw(parent->slots[offset]); #ifdef CONFIG_RADIX_TREE_MULTIORDER - if (radix_tree_is_indirect_ptr(entry)) { + if (radix_tree_is_internal_node(entry)) { unsigned long siboff = get_slot_offset(parent, entry); if (siboff < RADIX_TREE_MAP_SIZE) { offset = siboff; @@ -232,7 +232,7 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) entry, i, *(void **)entry_to_node(entry), first, last); - } else if (!radix_tree_is_indirect_ptr(entry)) { + } else if (!radix_tree_is_internal_node(entry)) { pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { @@ -247,7 +247,7 @@ static void radix_tree_dump(struct radix_tree_root *root) pr_debug("radix root: %p rnode %p tags %x\n", root, root->rnode, root->gfp_mask >> __GFP_BITS_SHIFT); - if (!radix_tree_is_indirect_ptr(root->rnode)) + if (!radix_tree_is_internal_node(root->rnode)) return; dump_node(entry_to_node(root->rnode), 0); } @@ -302,7 +302,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask | __GFP_ACCOUNT); out: - BUG_ON(radix_tree_is_indirect_ptr(ret)); + BUG_ON(radix_tree_is_internal_node(ret)); return ret; } @@ -421,7 +421,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, *nodep = node; - if (likely(radix_tree_is_indirect_ptr(node))) { + if (likely(radix_tree_is_internal_node(node))) { node = entry_to_node(node); *maxindex = node_maxindex(node); return node->shift + RADIX_TREE_MAP_SHIFT; @@ -467,7 +467,7 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_indirect_ptr(slot)) + if (radix_tree_is_internal_node(slot)) entry_to_node(slot)->parent = node; node->slots[0] = slot; slot = node_to_entry(node); @@ -535,7 +535,7 @@ int 
__radix_tree_create(struct radix_tree_root *root, unsigned long index, } else rcu_assign_pointer(root->rnode, node_to_entry(slot)); - } else if (!radix_tree_is_indirect_ptr(slot)) + } else if (!radix_tree_is_internal_node(slot)) break; /* Go a level down */ @@ -585,7 +585,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, void **slot; int error; - BUG_ON(radix_tree_is_indirect_ptr(item)); + BUG_ON(radix_tree_is_internal_node(item)); error = __radix_tree_create(root, index, order, &node, &slot); if (error) @@ -637,7 +637,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, if (index > maxindex) return NULL; - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { unsigned offset; if (node == RADIX_TREE_RETRY) @@ -720,7 +720,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root, shift = radix_tree_load_root(root, &node, &maxindex); BUG_ON(index > maxindex); - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { unsigned offset; shift -= RADIX_TREE_MAP_SHIFT; @@ -770,7 +770,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, parent = NULL; - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; @@ -835,7 +835,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, if (node == NULL) return 0; - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { int offset; shift -= RADIX_TREE_MAP_SHIFT; @@ -900,7 +900,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if (index > maxindex) return NULL; - if (radix_tree_is_indirect_ptr(rnode)) { + if (radix_tree_is_internal_node(rnode)) { rnode = entry_to_node(rnode); } else if (rnode) { /* Single-slot tree */ @@ -957,7 +957,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if ((slot == NULL) || (slot == RADIX_TREE_RETRY)) goto restart; - if (!radix_tree_is_indirect_ptr(slot)) + if (!radix_tree_is_internal_node(slot)) break; node = entry_to_node(slot); @@ -1039,7 +1039,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, *first_indexp = last_index + 1; return 0; } - if (!radix_tree_is_indirect_ptr(slot)) { + if (!radix_tree_is_internal_node(slot)) { *first_indexp = last_index + 1; root_tag_set(root, settag); return 1; @@ -1059,7 +1059,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, if (!tag_get(node, iftag, offset)) goto next; /* Sibling slots never have tags set on them */ - if (radix_tree_is_indirect_ptr(slot)) { + if (radix_tree_is_internal_node(slot)) { node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; continue; @@ -1152,7 +1152,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; - if (radix_tree_is_indirect_ptr(results[ret])) { + if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } @@ -1235,7 +1235,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; - if (radix_tree_is_indirect_ptr(results[ret])) { + if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } @@ -1311,7 +1311,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, rcu_dereference_raw(slot->slots[i]); if (node == 
RADIX_TREE_RETRY) goto out; - if (!radix_tree_is_indirect_ptr(node)) { + if (!radix_tree_is_internal_node(node)) { if (node == item) { info->found_index = index; info->stop = true; @@ -1357,7 +1357,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) do { rcu_read_lock(); node = rcu_dereference_raw(root->rnode); - if (!radix_tree_is_indirect_ptr(node)) { + if (!radix_tree_is_internal_node(node)) { rcu_read_unlock(); if (node == item) info.found_index = 0; @@ -1398,7 +1398,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) struct radix_tree_node *to_free = root->rnode; struct radix_tree_node *slot; - if (!radix_tree_is_indirect_ptr(to_free)) + if (!radix_tree_is_internal_node(to_free)) break; to_free = entry_to_node(to_free); @@ -1412,10 +1412,10 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) slot = to_free->slots[0]; if (!slot) break; - if (!radix_tree_is_indirect_ptr(slot) && to_free->shift) + if (!radix_tree_is_internal_node(slot) && to_free->shift) break; - if (radix_tree_is_indirect_ptr(slot)) + if (radix_tree_is_internal_node(slot)) entry_to_node(slot)->parent = NULL; /* @@ -1445,7 +1445,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ - if (!radix_tree_is_indirect_ptr(slot)) + if (!radix_tree_is_internal_node(slot)) to_free->slots[0] = RADIX_TREE_RETRY; radix_tree_node_free(to_free); diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 7b0bc1fa5919..a6e8099eaf4f 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -193,7 +193,7 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag, void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag) { struct radix_tree_node *node = root->rnode; - if (!radix_tree_is_indirect_ptr(node)) + if (!radix_tree_is_internal_node(node)) return; verify_node(node, tag, !!root_tag_get(root, tag)); } @@ -222,7 +222,7 @@ void tree_verify_min_height(struct radix_tree_root *root, int maxindex) { unsigned shift; struct radix_tree_node *node = root->rnode; - if (!radix_tree_is_indirect_ptr(node)) { + if (!radix_tree_is_internal_node(node)) { assert(maxindex == 0); return; } -- cgit v1.2.3 From d604c324524bf61c68182bb27db64656a78fe911 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:45 -0700 Subject: radix-tree: introduce radix_tree_replace_clear_tags() In addition to replacing the entry, we also clear all associated tags. This is really a one-off special for page_cache_tree_delete() which had far too much detailed knowledge about how the radix tree works. For efficiency, factor node_tag_clear() out of radix_tree_tag_clear() It can be used by radix_tree_delete_item() as well as radix_tree_replace_clear_tags(). 
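The factored helper amounts to a bottom-up walk that stops as soon as the
tag is still set somewhere in the current node. A standalone sketch of that
walk follows; the types are simplified stand-ins (a single tag per node
instead of the kernel's tag array, and a plain bool for the root tag):

	#include <stdbool.h>
	#include <stdio.h>

	#define MAP_SIZE 64

	struct node {
		struct node *parent;
		unsigned offset;	/* this node's slot in its parent */
		bool tags[MAP_SIZE];	/* one tag, one flag per slot */
	};

	static bool any_tag_set(struct node *node)
	{
		for (unsigned i = 0; i < MAP_SIZE; i++)
			if (node->tags[i])
				return true;
		return false;
	}

	static void node_tag_clear(bool *root_tag, struct node *node,
				   unsigned offset)
	{
		while (node) {
			if (!node->tags[offset])
				return;		/* already clear: done */
			node->tags[offset] = false;
			if (any_tag_set(node))
				return;		/* siblings still tagged */
			offset = node->offset;	/* climb one level */
			node = node->parent;
		}
		*root_tag = false;		/* reached the top */
	}

	int main(void)
	{
		struct node top = { .parent = NULL, .offset = 0 };
		struct node leaf = { .parent = &top, .offset = 3 };
		bool root_tag = true;

		top.tags[3] = true;	/* tag propagated up from the leaf */
		leaf.tags[7] = true;	/* the entry being deleted */

		node_tag_clear(&root_tag, &leaf, 7);
		printf("root tag now %d\n", root_tag);	/* prints 0 */
		return 0;
	}

Because the walk takes a node and an offset rather than re-descending from
the root by index, both radix_tree_delete_item() and
radix_tree_replace_clear_tags() can call it once per tag without the
repeated top-down lookups the old code paid for.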
Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 9 ++++-- lib/radix-tree.c | 76 ++++++++++++++++++++++++++++------------------ mm/filemap.c | 23 ++------------ 3 files changed, 56 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index bad63105e37e..11c8e7cc3920 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -281,9 +281,12 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); -unsigned int -radix_tree_gang_lookup(struct radix_tree_root *root, void **results, - unsigned long first_index, unsigned int max_items); +struct radix_tree_node *radix_tree_replace_clear_tags( + struct radix_tree_root *root, + unsigned long index, void *entry); +unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, + void **results, unsigned long first_index, + unsigned int max_items); unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9d9b4b9af4b6..c7114d233b38 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -740,6 +740,26 @@ void *radix_tree_tag_set(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_set); +static void node_tag_clear(struct radix_tree_root *root, + struct radix_tree_node *node, + unsigned int tag, unsigned int offset) +{ + while (node) { + if (!tag_get(node, tag, offset)) + return; + tag_clear(node, tag, offset); + if (any_tag_set(node, tag)) + return; + + offset = node->offset; + node = node->parent; + } + + /* clear the root's tag bit */ + if (root_tag_get(root, tag)) + root_tag_clear(root, tag); +} + /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root @@ -776,28 +796,9 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, offset = radix_tree_descend(parent, &node, offset); } - if (node == NULL) - goto out; + if (node) + node_tag_clear(root, parent, tag, offset); - index >>= shift; - - while (parent) { - if (!tag_get(parent, tag, offset)) - goto out; - tag_clear(parent, tag, offset); - if (any_tag_set(parent, tag)) - goto out; - - index >>= RADIX_TREE_MAP_SHIFT; - offset = index & RADIX_TREE_MAP_MASK; - parent = parent->parent; - } - - /* clear the root's tag bit */ - if (root_tag_get(root, tag)) - root_tag_clear(root, tag); - -out: return node; } EXPORT_SYMBOL(radix_tree_tag_clear); @@ -1525,14 +1526,9 @@ void *radix_tree_delete_item(struct radix_tree_root *root, offset = get_slot_offset(node, slot); - /* - * Clear all tags associated with the item to be deleted. - * This way of doing it would be inefficient, but seldom is any set. - */ - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tag_get(node, tag, offset)) - radix_tree_tag_clear(root, index, tag); - } + /* Clear all tags associated with the item to be deleted. 
*/ + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); node->slots[offset] = NULL; @@ -1559,6 +1555,28 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); +struct radix_tree_node *radix_tree_replace_clear_tags( + struct radix_tree_root *root, + unsigned long index, void *entry) +{ + struct radix_tree_node *node; + void **slot; + + __radix_tree_lookup(root, index, &node, &slot); + + if (node) { + unsigned int tag, offset = get_slot_offset(node, slot); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); + } else { + /* Clear root node tags */ + root->gfp_mask &= __GFP_BITS_MASK; + } + + radix_tree_replace_slot(slot, entry); + return node; +} + /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root diff --git a/mm/filemap.c b/mm/filemap.c index b418405903bc..9665b1d4f318 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -114,14 +114,11 @@ static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { struct radix_tree_node *node; - unsigned long index; - unsigned int offset; - unsigned int tag; - void **slot; VM_BUG_ON(!PageLocked(page)); - __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index, + shadow); if (shadow) { mapping->nrexceptional++; @@ -135,23 +132,9 @@ static void page_cache_tree_delete(struct address_space *mapping, } mapping->nrpages--; - if (!node) { - /* Clear direct pointer tags in root node */ - mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; - radix_tree_replace_slot(slot, shadow); + if (!node) return; - } - - /* Clear tree tags for the removed page */ - index = page->index; - offset = index & RADIX_TREE_MAP_MASK; - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (test_bit(offset, node->tags[tag])) - radix_tree_tag_clear(&mapping->page_tree, index, tag); - } - /* Delete page, swap shadow entry */ - radix_tree_replace_slot(slot, shadow); workingset_node_pages_dec(node); if (shadow) workingset_node_shadows_inc(node); -- cgit v1.2.3 From 78a9be0a0a3367b94af242632c525d22b26f1a87 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 May 2016 17:03:51 -0700 Subject: dax: move RADIX_DAX_ definitions to dax.c These don't belong in radix-tree.h any more than PAGECACHE_TAG_* do. Let's try to maintain the idea that radix-tree simply implements an abstract data type. Signed-off-by: NeilBrown Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 9 +++++++++ include/linux/radix-tree.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 0dbe4e0f16fe..a345c168acaa 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -32,6 +32,15 @@ #include #include +#define RADIX_DAX_MASK 0xf +#define RADIX_DAX_SHIFT 4 +#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) +#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ + RADIX_DAX_SHIFT | (pmd ? 
RADIX_DAX_PMD : RADIX_DAX_PTE))) + static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 11c8e7cc3920..c2f69e25ba86 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -48,15 +48,6 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 -#define RADIX_DAX_MASK 0xf -#define RADIX_DAX_SHIFT 4 -#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) -#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) -#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) - static inline int radix_tree_is_internal_node(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); -- cgit v1.2.3 From 3bcadd6fa6c4fd07ace3626357c824eb532488a6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:54 -0700 Subject: radix-tree: free up the bottom bit of exceptional entries for reuse We are guaranteed that pointers to radix_tree_nodes always have the bottom two bits clear (because they come from a slab cache, and slab caches have a minimum alignment of sizeof(void *)), so we can redefine 'radix_tree_is_internal_node' to only return true if the bottom two bits have value '01'. This frees up one quarter of the potential values for use by the user. Idea from Neil Brown. Signed-off-by: Matthew Wilcox Suggested-by: Neil Brown Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c2f69e25ba86..cb4b7e8cee81 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -29,28 +29,37 @@ #include /* - * Entries in the radix tree have the low bit set if they refer to a - * radix_tree_node. If the low bit is clear then the entry is user data. - * - * We also use the low bit to indicate that the slot will be freed in the - * next RCU idle period, and users need to re-walk the tree to find the - * new slot for the index that they were looking for. See the comment in - * radix_tree_shrink() for details. + * The bottom two bits of the slot determine how the remaining bits in the + * slot are interpreted: + * + * 00 - data pointer + * 01 - internal entry + * 10 - exceptional entry + * 11 - locked exceptional entry + * + * The internal entry may be a pointer to the next level in the tree, a + * sibling entry, or an indicator that the entry in this slot has been moved + * to another location in the tree and the lookup should be restarted. While + * NULL fits the 'data pointer' pattern, it means that there is no entry in + * the tree for this index (no matter what level of the tree it is found at). + * This means that you cannot store NULL in the tree as a value for the index. 
*/ -#define RADIX_TREE_INTERNAL_NODE 1 +#define RADIX_TREE_ENTRY_MASK 3UL +#define RADIX_TREE_INTERNAL_NODE 1UL /* - * A common use of the radix tree is to store pointers to struct pages; - * but shmem/tmpfs needs also to store swap entries in the same tree: - * those are marked as exceptional entries to distinguish them. + * Most users of the radix tree store pointers but shmem/tmpfs stores swap + * entries in the same tree. They are marked as exceptional entries to + * distinguish them from pointers to struct page. * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it. */ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 -static inline int radix_tree_is_internal_node(void *ptr) +static inline bool radix_tree_is_internal_node(void *ptr) { - return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); + return ((unsigned long)ptr & RADIX_TREE_ENTRY_MASK) == + RADIX_TREE_INTERNAL_NODE; } /*** radix-tree API starts here ***/ @@ -236,8 +245,7 @@ static inline int radix_tree_exceptional_entry(void *arg) */ static inline int radix_tree_exception(void *arg) { - return unlikely((unsigned long)arg & - (RADIX_TREE_INTERNAL_NODE | RADIX_TREE_EXCEPTIONAL_ENTRY)); + return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } /** -- cgit v1.2.3 From acc93d30d7d43f428272c20a047389c4cbca82ba Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 7 May 2016 11:40:28 -0700 Subject: Revert "block: enable dax for raw block devices" This reverts commit 5a023cdba50c5f5f2bc351783b3131699deb3937. The functionality is superseded by the new "Device DAX" facility. Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Andrew Morton Cc: Ross Zwisler Cc: Jan Kara Signed-off-by: Dan Williams --- block/ioctl.c | 32 ----------------- fs/block_dev.c | 96 +++++++++++++++---------------------------------- include/linux/fs.h | 8 ----- include/uapi/linux/fs.h | 1 - 4 files changed, 29 insertions(+), 108 deletions(-) (limited to 'include/linux') diff --git a/block/ioctl.c b/block/ioctl.c index 4ff1f92f89ca..698c7933d582 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -407,35 +407,6 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } -#ifdef CONFIG_FS_DAX -bool blkdev_dax_capable(struct block_device *bdev) -{ - struct gendisk *disk = bdev->bd_disk; - - if (!disk->fops->direct_access) - return false; - - /* - * If the partition is not aligned on a page boundary, we can't - * do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) - || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) - return false; - - /* - * If the device has known bad blocks, force all I/O through the - * driver / page cache. 
- * - * TODO: support finer grained dax error handling - */ - if (disk->bb && disk->bb->count) - return false; - - return true; -} -#endif - static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -598,9 +569,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); - case BLKDAXGET: - return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); - break; case IOC_PR_REGISTER: return blkdev_pr_register(bdev, argp); case IOC_PR_RESERVE: diff --git a/fs/block_dev.c b/fs/block_dev.c index 20a2c02b77c4..36ee10ca503e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1159,6 +1160,33 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); +static bool blkdev_dax_capable(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + + if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX)) + return false; + + /* + * If the partition is not aligned on a page boundary, we can't + * do dax I/O to it. + */ + if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) + || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + return false; + + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. + * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + + return true; +} + static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); /* @@ -1724,79 +1752,13 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; -#ifdef CONFIG_FS_DAX -/* - * In the raw block case we do not need to contend with truncation nor - * unwritten file extents. Without those concerns there is no need for - * additional locking beyond the mmap_sem context that these routines - * are already executing under. - * - * Note, there is no protection if the block device is dynamically - * resized (partition grow/shrink) during a fault. A stable block device - * size is already not enforced in the blkdev_direct_IO path. - * - * For DAX, it is the responsibility of the block device driver to - * ensure the whole-disk device size is stable while requests are in - * flight. - * - * Finally, unlike the filemap_page_mkwrite() case there is no - * filesystem superblock to sync against freezing. We still include a - * pfn_mkwrite callback for dax drivers to receive write fault - * notifications. 
- */ -static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - return __dax_fault(vma, vmf, blkdev_get_block, NULL); -} - -static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - return dax_pfn_mkwrite(vma, vmf); -} - -static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) -{ - return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); -} - -static const struct vm_operations_struct blkdev_dax_vm_ops = { - .fault = blkdev_dax_fault, - .pmd_fault = blkdev_dax_pmd_fault, - .pfn_mkwrite = blkdev_dax_pfn_mkwrite, -}; - -static const struct vm_operations_struct blkdev_default_vm_ops = { - .fault = filemap_fault, - .map_pages = filemap_map_pages, -}; - -static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct inode *bd_inode = bdev_file_inode(file); - - file_accessed(file); - if (IS_DAX(bd_inode)) { - vma->vm_ops = &blkdev_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; - } else { - vma->vm_ops = &blkdev_default_vm_ops; - } - - return 0; -} -#else -#define blkdev_mmap generic_file_mmap -#endif - const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .mmap = blkdev_mmap, + .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT diff --git a/include/linux/fs.h b/include/linux/fs.h index 70e61b58baaf..8363a10660f6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2320,14 +2320,6 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); -#ifdef CONFIG_FS_DAX -extern bool blkdev_dax_capable(struct block_device *bdev); -#else -static inline bool blkdev_dax_capable(struct block_device *bdev) -{ - return false; -} -#endif extern struct super_block *blockdev_superblock; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index a079d50376e1..fbff8b28aa35 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -222,7 +222,6 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) -#define BLKDAXGET _IO(0x12,129) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From 5726d0b454614a47e641a04c8106392d67a8e1ad Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 23 May 2016 16:23:06 -0700 Subject: nilfs2: remove FSF mailing address from GPL notices This removes the extra paragraph which mentions FSF address in GPL notices from source code of nilfs2 and avoids the checkpatch.pl error related to it. 
Link: http://lkml.kernel.org/r/1461935747-10380-4-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/alloc.c | 4 ---- fs/nilfs2/alloc.h | 4 ---- fs/nilfs2/bmap.c | 4 ---- fs/nilfs2/bmap.h | 4 ---- fs/nilfs2/btnode.c | 4 ---- fs/nilfs2/btnode.h | 4 ---- fs/nilfs2/btree.c | 4 ---- fs/nilfs2/btree.h | 4 ---- fs/nilfs2/cpfile.c | 4 ---- fs/nilfs2/cpfile.h | 4 ---- fs/nilfs2/dat.c | 4 ---- fs/nilfs2/dat.h | 4 ---- fs/nilfs2/dir.c | 4 ---- fs/nilfs2/direct.c | 4 ---- fs/nilfs2/direct.h | 4 ---- fs/nilfs2/file.c | 4 ---- fs/nilfs2/gcinode.c | 4 ---- fs/nilfs2/ifile.c | 4 ---- fs/nilfs2/ifile.h | 4 ---- fs/nilfs2/inode.c | 4 ---- fs/nilfs2/ioctl.c | 4 ---- fs/nilfs2/mdt.c | 4 ---- fs/nilfs2/mdt.h | 4 ---- fs/nilfs2/namei.c | 4 ---- fs/nilfs2/nilfs.h | 4 ---- fs/nilfs2/page.c | 4 ---- fs/nilfs2/page.h | 4 ---- fs/nilfs2/recovery.c | 4 ---- fs/nilfs2/segbuf.c | 4 ---- fs/nilfs2/segbuf.h | 4 ---- fs/nilfs2/segment.c | 4 ---- fs/nilfs2/segment.h | 4 ---- fs/nilfs2/sufile.c | 4 ---- fs/nilfs2/sufile.h | 4 ---- fs/nilfs2/super.c | 4 ---- fs/nilfs2/the_nilfs.c | 4 ---- fs/nilfs2/the_nilfs.h | 4 ---- include/linux/nilfs2_fs.h | 4 ---- 38 files changed, 152 deletions(-) (limited to 'include/linux') diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 2ccbf5531554..eaa0c6af80c3 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Original code was written by Koji Sato . * Two allocators were unified by Ryusuke Konishi , * Amagai Yoshiji . diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 6e6f49aa53df..62982eea63c0 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Original code was written by Koji Sato . * Two allocators were unified by Ryusuke Konishi , * Amagai Yoshiji . diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index a9fb3636c142..fcd79e611c87 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . */ diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index bfa817ce40b3..baa6d404f369 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . 
*/ diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index e0c9daf9aa22..55241effa3c0 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * This file was originally written by Seiji Kihara * and fully revised by Ryusuke Konishi for * stabilization and simplification. diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index d876b565ce64..0727096a8c53 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Seiji Kihara * Revised by Ryusuke Konishi */ diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 3a3821b00486..cc748ff3b876 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . */ diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index 22c02e35b6ef..9497732b3b1d 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . */ diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index b6596cab9e99..15016eca4dc4 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . */ diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index a242b9a314f9..3dcaceb66be6 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . */ diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 7dc23f100e57..7427c114aa11 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . */ diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index cbd8e9732503..edd0586220d4 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 6723d45a631a..955070bd20a5 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Modified for NILFS by Amagai Yoshiji */ /* diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index ebf89fd8ac1a..24700229d6d4 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . */ diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index dc643de20a25..e4eb1b7e407c 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . */ diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 088ba001c6ef..49243ca6e3e9 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Amagai Yoshiji , * Ryusuke Konishi */ diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 0224b7826ace..b0321886ce33 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Seiji Kihara , Amagai Yoshiji , * and Ryusuke Konishi . * Revised by Ryusuke Konishi . 
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 6548c7851b48..06f9b5aa6175 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Amagai Yoshiji . * Revised by Ryusuke Konishi . * diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 679674d13372..c61637e444f1 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Amagai Yoshiji * Revised by Ryusuke Konishi * diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index cfebcd2fc7f3..1bdb17bb01d3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi * */ diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index e8fe24882b5b..4db0e5ff6c67 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . */ diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index f6982b9153d5..0ab93bdfa261 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi */ diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index 03246cac3338..5fc07ecf32a3 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi */ diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 3b2af05f9fb4..db0a4bcf6015 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Modified for NILFS by Amagai Yoshiji , * Ryusuke Konishi */ diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 385704027575..e2089a18b24b 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato * Ryusuke Konishi */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 489391561cda..b10057eb4d8e 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi , * Seiji Kihara . */ diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index a43b8287d012..6df10e127678 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi , * Seiji Kihara . */ diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 5afa77fadc11..2ae1dfccf6e8 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi */ diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index f63620ce3892..8aef62d81553 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi * */ diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index b04f08cc2397..fa4ed03744bb 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi * */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 483e6634466f..cafb61c32b79 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi * */ diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 0408b9b2814b..a118eabd3ed6 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi * */ diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 5b3720414e34..1d602482b51e 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . * Revised by Ryusuke Konishi . */ diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 35e34c84d4af..c4cf47e14b70 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato . */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index e05d1848164a..d304faa66f49 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi */ /* diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 69bd801afb53..8e270128ae86 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi * */ diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 23778d385836..426e4acc772b 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Ryusuke Konishi * */ diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index e9fcf90b270d..0d7f8141382a 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -13,10 +13,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * * Written by Koji Sato * Ryusuke Konishi */ -- cgit v1.2.3 From 4b420ab4eedc7a816ad0f2278871019de1a8ccef Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 23 May 2016 16:23:09 -0700 Subject: nilfs2: clean up old e-mail addresses E-mail addresses of osrg.net domain are no longer available. This removes them from authorship notices and prevents reporters from being confused. Link: http://lkml.kernel.org/r/1461935747-10380-5-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/alloc.c | 5 ++--- fs/nilfs2/alloc.h | 5 ++--- fs/nilfs2/bmap.c | 2 +- fs/nilfs2/bmap.h | 2 +- fs/nilfs2/btnode.c | 5 ++--- fs/nilfs2/btnode.h | 4 ++-- fs/nilfs2/btree.c | 2 +- fs/nilfs2/btree.h | 2 +- fs/nilfs2/cpfile.c | 2 +- fs/nilfs2/cpfile.h | 2 +- fs/nilfs2/dat.c | 2 +- fs/nilfs2/dat.h | 2 +- fs/nilfs2/dir.c | 2 +- fs/nilfs2/direct.c | 2 +- fs/nilfs2/direct.h | 2 +- fs/nilfs2/file.c | 3 +-- fs/nilfs2/gcinode.c | 5 ++--- fs/nilfs2/ifile.c | 4 ++-- fs/nilfs2/ifile.h | 4 ++-- fs/nilfs2/inode.c | 2 +- fs/nilfs2/ioctl.c | 2 +- fs/nilfs2/mdt.c | 2 +- fs/nilfs2/mdt.h | 2 +- fs/nilfs2/namei.c | 3 +-- fs/nilfs2/nilfs.h | 3 +-- fs/nilfs2/page.c | 3 +-- fs/nilfs2/page.h | 3 +-- fs/nilfs2/recovery.c | 2 +- fs/nilfs2/segbuf.c | 2 +- fs/nilfs2/segbuf.h | 2 +- fs/nilfs2/segment.c | 2 +- fs/nilfs2/segment.h | 2 +- fs/nilfs2/sufile.c | 4 ++-- fs/nilfs2/sufile.h | 2 +- fs/nilfs2/super.c | 2 +- fs/nilfs2/the_nilfs.c | 2 +- fs/nilfs2/the_nilfs.h | 2 +- include/linux/nilfs2_fs.h | 3 +-- 38 files changed, 46 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index eaa0c6af80c3..82362a5d81ed 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -13,9 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Original code was written by Koji Sato . - * Two allocators were unified by Ryusuke Konishi , - * Amagai Yoshiji . + * Originally written by Koji Sato. + * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji. 
*/ #include diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 62982eea63c0..2bd567d98bc6 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -13,9 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Original code was written by Koji Sato . - * Two allocators were unified by Ryusuke Konishi , - * Amagai Yoshiji . + * Originally written by Koji Sato. + * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji. */ #ifndef _NILFS_ALLOC_H diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index fcd79e611c87..a5eab798a7a3 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. */ #include diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index baa6d404f369..c14f822682da 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. */ #ifndef _NILFS_BMAP_H diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 55241effa3c0..0576033699bc 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -13,9 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * This file was originally written by Seiji Kihara - * and fully revised by Ryusuke Konishi for - * stabilization and simplification. + * Originally written by Seiji Kihara. + * Fully revised by Ryusuke Konishi for stabilization and simplification. * */ diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 0727096a8c53..2cc1b80e18f7 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -13,8 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Seiji Kihara - * Revised by Ryusuke Konishi + * Written by Seiji Kihara. + * Revised by Ryusuke Konishi. */ #ifndef _NILFS_BTNODE_H diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index cc748ff3b876..8fc73d0a923c 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. */ #include diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index 9497732b3b1d..df1a25faa83b 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. */ #ifndef _NILFS_BTREE_H diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 15016eca4dc4..d192b48df9fb 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. */ #include diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 3dcaceb66be6..5bdb8262928b 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. 
*/ #ifndef _NILFS_CPFILE_H diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 7427c114aa11..e92257bc42ee 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. */ #include diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index edd0586220d4..c7035b5b1aed 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. */ #ifndef _NILFS_DAT_H diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 955070bd20a5..5756dda3083f 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Modified for NILFS by Amagai Yoshiji + * Modified for NILFS by Amagai Yoshiji. */ /* * linux/fs/ext2/dir.c diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 24700229d6d4..22058d0b36e9 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. */ #include diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index e4eb1b7e407c..3015a6e78724 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. */ #ifndef _NILFS_DIRECT_H diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 49243ca6e3e9..547381f3ce13 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -13,8 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Amagai Yoshiji , - * Ryusuke Konishi + * Written by Amagai Yoshiji and Ryusuke Konishi. */ #include diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index b0321886ce33..693aded72498 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -13,9 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Seiji Kihara , Amagai Yoshiji , - * and Ryusuke Konishi . - * Revised by Ryusuke Konishi . + * Written by Seiji Kihara, Amagai Yoshiji, and Ryusuke Konishi. + * Revised by Ryusuke Konishi. * */ /* diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 06f9b5aa6175..d9048f6a7780 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -13,8 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Amagai Yoshiji . - * Revised by Ryusuke Konishi . + * Written by Amagai Yoshiji. + * Revised by Ryusuke Konishi. * */ diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index c61637e444f1..1b7d7afcb35e 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -13,8 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Amagai Yoshiji - * Revised by Ryusuke Konishi + * Written by Amagai Yoshiji. + * Revised by Ryusuke Konishi. 
* */ diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 1bdb17bb01d3..90061151f0c2 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi + * Written by Ryusuke Konishi. * */ diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 4db0e5ff6c67..e86599801d8b 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. */ #include diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 0ab93bdfa261..36fa9412f850 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi + * Written by Ryusuke Konishi. */ #include diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index 5fc07ecf32a3..54e3b7fd783c 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi + * Written by Ryusuke Konishi. */ #ifndef _NILFS_MDT_H diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index db0a4bcf6015..8234af99d7d7 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -13,8 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Modified for NILFS by Amagai Yoshiji , - * Ryusuke Konishi + * Modified for NILFS by Amagai Yoshiji and Ryusuke Konishi. */ /* * linux/fs/ext2/namei.c diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index e2089a18b24b..fa179d4e5dba 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -13,8 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato - * Ryusuke Konishi + * Written by Koji Sato and Ryusuke Konishi. */ #ifndef _NILFS_H diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index b10057eb4d8e..19687139f197 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -13,8 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi , - * Seiji Kihara . + * Written by Ryusuke Konishi and Seiji Kihara. */ #include diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 6df10e127678..041f2dc5e634 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -13,8 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi , - * Seiji Kihara . + * Written by Ryusuke Konishi and Seiji Kihara. */ #ifndef _NILFS_PAGE_H diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 2ae1dfccf6e8..402a45c2550d 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi + * Written by Ryusuke Konishi. */ #include diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 8aef62d81553..52f6a6c8bab1 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi + * Written by Ryusuke Konishi. * */ diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index fa4ed03744bb..fc3a0fe3119c 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi + * Written by Ryusuke Konishi. * */ #ifndef _NILFS_SEGBUF_H diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index cafb61c32b79..ddbfb09527cd 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi + * Written by Ryusuke Konishi. * */ diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index a118eabd3ed6..6cb12dbee7c1 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi + * Written by Ryusuke Konishi. * */ #ifndef _NILFS_SEGMENT_H diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1d602482b51e..df439da883bc 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -13,8 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . - * Revised by Ryusuke Konishi . + * Written by Koji Sato. + * Revised by Ryusuke Konishi. */ #include diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index c4cf47e14b70..46e89872294c 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Koji Sato . + * Written by Koji Sato. */ #ifndef _NILFS_SUFILE_H diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index d304faa66f49..2ed3e4008e72 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi + * Written by Ryusuke Konishi. */ /* * linux/fs/ext2/super.c diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8e270128ae86..ba4b8189c342 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi + * Written by Ryusuke Konishi. * */ diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 426e4acc772b..6a262933fed9 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Written by Ryusuke Konishi + * Written by Ryusuke Konishi. * */ diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 0d7f8141382a..823d63d61081 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -13,8 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * - * Written by Koji Sato - * Ryusuke Konishi + * Written by Koji Sato and Ryusuke Konishi. 
*/ /* * linux/include/linux/ext2_fs.h -- cgit v1.2.3 From 0c6c44cb9f93f7c0ad803b41ae7c0b08cf6942e2 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 23 May 2016 16:23:39 -0700 Subject: nilfs2: avoid bare use of 'unsigned' This fixes checkpatch.pl warning "WARNING: Prefer 'unsigned int' to bare use of 'unsigned'". Link: http://lkml.kernel.org/r/1462886671-3521-5-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/alloc.c | 8 ++++---- fs/nilfs2/alloc.h | 2 +- fs/nilfs2/bmap.c | 2 +- fs/nilfs2/bmap.h | 4 ++-- fs/nilfs2/btree.c | 3 ++- fs/nilfs2/cpfile.c | 8 +++++--- fs/nilfs2/cpfile.h | 4 ++-- fs/nilfs2/dat.c | 2 +- fs/nilfs2/dat.h | 2 +- fs/nilfs2/dir.c | 47 ++++++++++++++++++++++++----------------------- fs/nilfs2/direct.c | 5 +++-- fs/nilfs2/inode.c | 14 +++++++------- fs/nilfs2/mdt.c | 4 ++-- fs/nilfs2/mdt.h | 6 +++--- fs/nilfs2/namei.c | 2 +- fs/nilfs2/nilfs.h | 2 +- fs/nilfs2/page.c | 8 ++++---- fs/nilfs2/page.h | 3 ++- fs/nilfs2/recovery.c | 4 ++-- fs/nilfs2/segbuf.c | 4 ++-- fs/nilfs2/segbuf.h | 5 +++-- fs/nilfs2/segment.c | 18 +++++++++--------- fs/nilfs2/segment.h | 4 ++-- fs/nilfs2/sysfs.c | 6 +++--- fs/nilfs2/the_nilfs.c | 4 ++-- fs/nilfs2/the_nilfs.h | 10 +++++----- include/linux/nilfs2_fs.h | 6 +++--- 27 files changed, 97 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index bdd5ac522904..698f582d69af 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -53,7 +53,7 @@ nilfs_palloc_groups_count(const struct inode *inode) * @inode: inode of metadata file using this allocator * @entry_size: size of the persistent object */ -int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) +int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); @@ -384,7 +384,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, */ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, - unsigned bsize, + unsigned int bsize, spinlock_t *lock) { int pos, end = bsize; @@ -735,8 +735,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) unsigned long group, group_offset; __u64 group_min_nr, last_nrs[8]; const unsigned long epg = nilfs_palloc_entries_per_group(inode); - const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block; - unsigned entry_start, end, pos; + const unsigned int epb = NILFS_MDT(inode)->mi_entries_per_block; + unsigned int entry_start, end, pos; spinlock_t *lock; int i, j, k, ret; u32 nfree; diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 2bd567d98bc6..05149e606a78 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -37,7 +37,7 @@ nilfs_palloc_entries_per_group(const struct inode *inode) return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */); } -int nilfs_palloc_init_blockgroup(struct inode *, unsigned); +int nilfs_palloc_init_blockgroup(struct inode *, unsigned int); int nilfs_palloc_get_entry_block(struct inode *, __u64, int, struct buffer_head **); void *nilfs_palloc_block_get_entry(const struct inode *, __u64, diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 4976fe3be4a5..f2a7877e0c8c 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -93,7 +93,7 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, } int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, - 
unsigned maxblocks) + unsigned int maxblocks) { int ret; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index c14f822682da..a6852807b22c 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -57,7 +57,7 @@ struct nilfs_bmap_stats { struct nilfs_bmap_operations { int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *, - unsigned); + unsigned int); int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); int (*bop_delete)(struct nilfs_bmap *, __u64); void (*bop_clear)(struct nilfs_bmap *); @@ -150,7 +150,7 @@ struct nilfs_bmap_store { int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); -int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); +int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned int); int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec); int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key); int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 8fc73d0a923c..57ec6af28b49 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -685,7 +685,8 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *btree, } static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, - __u64 key, __u64 *ptrp, unsigned maxblocks) + __u64 key, __u64 *ptrp, + unsigned int maxblocks) { struct nilfs_btree_path *path; struct nilfs_btree_node *node; diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 16f884bd857c..b61c3e0eb342 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -431,7 +431,8 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, } static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, - void *buf, unsigned cisz, size_t nci) + void *buf, unsigned int cisz, + size_t nci) { struct nilfs_checkpoint *cp; struct nilfs_cpinfo *ci = buf; @@ -482,7 +483,8 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, } static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, - void *buf, unsigned cisz, size_t nci) + void *buf, unsigned int cisz, + size_t nci) { struct buffer_head *bh; struct nilfs_cpfile_header *header; @@ -568,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, */ ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, - void *buf, unsigned cisz, size_t nci) + void *buf, unsigned int cisz, size_t nci) { switch (mode) { case NILFS_CHECKPOINT: diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 5bdb8262928b..0249744ae234 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -33,8 +33,8 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); int nilfs_cpfile_is_snapshot(struct inode *, __u64); int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); -ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, - size_t); +ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, + unsigned int, size_t); int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, struct nilfs_inode *raw_inode, struct inode **inodep); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index e92257bc42ee..7367610ea807 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ 
-424,7 +424,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) return ret; } -ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz, +ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, size_t nvi) { struct buffer_head *entry_bh; diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index c7035b5b1aed..abbfdabcabea 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -47,7 +47,7 @@ void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, int nilfs_dat_mark_dirty(struct inode *, __u64); int nilfs_dat_freev(struct inode *, __u64 *, size_t); int nilfs_dat_move(struct inode *, __u64, sector_t); -ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); +ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned int, size_t); int nilfs_dat_read(struct super_block *sb, size_t entry_size, struct nilfs_inode *raw_inode, struct inode **inodep); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index bbcc03de1e74..e506f4f7120a 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -46,7 +46,7 @@ * nilfs uses block-sized chunks. Arguably, sector-sized ones would be * more robust, but we have what we have */ -static inline unsigned nilfs_chunk_size(struct inode *inode) +static inline unsigned int nilfs_chunk_size(struct inode *inode) { return inode->i_sb->s_blocksize; } @@ -61,9 +61,9 @@ static inline void nilfs_put_page(struct page *page) * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. */ -static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) +static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr) { - unsigned last_byte = inode->i_size; + unsigned int last_byte = inode->i_size; last_byte -= page_nr << PAGE_SHIFT; if (last_byte > PAGE_SIZE) @@ -71,7 +71,8 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to) +static int nilfs_prepare_chunk(struct page *page, unsigned int from, + unsigned int to) { loff_t pos = page_offset(page) + from; @@ -80,12 +81,12 @@ static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to) static void nilfs_commit_chunk(struct page *page, struct address_space *mapping, - unsigned from, unsigned to) + unsigned int from, unsigned int to) { struct inode *dir = mapping->host; loff_t pos = page_offset(page) + from; - unsigned len = to - from; - unsigned nr_dirty, copied; + unsigned int len = to - from; + unsigned int nr_dirty, copied; int err; nr_dirty = nilfs_page_count_clean_buffers(page, from, to); @@ -103,10 +104,10 @@ static bool nilfs_check_page(struct page *page) { struct inode *dir = page->mapping->host; struct super_block *sb = dir->i_sb; - unsigned chunk_size = nilfs_chunk_size(dir); + unsigned int chunk_size = nilfs_chunk_size(dir); char *kaddr = page_address(page); - unsigned offs, rec_len; - unsigned limit = PAGE_SIZE; + unsigned int offs, rec_len; + unsigned int limit = PAGE_SIZE; struct nilfs_dir_entry *p; char *error; @@ -256,7 +257,6 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) unsigned int offset = pos & ~PAGE_MASK; unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); -/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) return 0; @@ -318,7 +318,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, { const unsigned char *name = 
qstr->name; int namelen = qstr->len; - unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); struct page *page = NULL; @@ -408,8 +408,8 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct page *page, struct inode *inode) { - unsigned from = (char *) de - (char *) page_address(page); - unsigned to = from + nilfs_rec_len_from_disk(de->rec_len); + unsigned int from = (char *)de - (char *)page_address(page); + unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len); struct address_space *mapping = page->mapping; int err; @@ -431,15 +431,15 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned chunk_size = nilfs_chunk_size(dir); - unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned int chunk_size = nilfs_chunk_size(dir); + unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned short rec_len, name_len; struct page *page = NULL; struct nilfs_dir_entry *de; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; - unsigned from, to; + unsigned int from, to; int err; /* @@ -531,13 +531,14 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; char *kaddr = page_address(page); - unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); - unsigned to = ((char *)dir - kaddr) + - nilfs_rec_len_from_disk(dir->rec_len); - struct nilfs_dir_entry *pde = NULL; - struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); + unsigned int from, to; + struct nilfs_dir_entry *de, *pde = NULL; int err; + from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); + to = ((char *)dir - kaddr) + nilfs_rec_len_from_disk(dir->rec_len); + de = (struct nilfs_dir_entry *)(kaddr + from); + while ((char *)de < (char *)dir) { if (de->rec_len == 0) { nilfs_error(inode->i_sb, __func__, @@ -570,7 +571,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) { struct address_space *mapping = inode->i_mapping; struct page *page = grab_cache_page(mapping, 0); - unsigned chunk_size = nilfs_chunk_size(inode); + unsigned int chunk_size = nilfs_chunk_size(inode); struct nilfs_dir_entry *de; int err; void *kaddr; diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 22058d0b36e9..001068630063 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -58,7 +58,7 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *direct, static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, __u64 key, __u64 *ptrp, - unsigned maxblocks) + unsigned int maxblocks) { struct inode *dat = NULL; __u64 ptr, ptr2; @@ -79,7 +79,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, ptr = blocknr; } - maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1); + maxblocks = min_t(unsigned int, maxblocks, + NILFS_DIRECT_KEY_MAX - key + 1); for (cnt = 1; cnt < maxblocks && (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) != NILFS_BMAP_INVALID_PTR; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index bbb47e8bde3e..83d2c485efba 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -83,7 +83,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct the_nilfs *nilfs = 
inode->i_sb->s_fs_info; __u64 blknum = 0; int err = 0, ret; - unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; + unsigned int maxblocks = bh_result->b_size >> inode->i_blkbits; down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); @@ -163,7 +163,7 @@ static int nilfs_readpage(struct file *file, struct page *page) * @nr_pages - number of pages to be read */ static int nilfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct list_head *pages, unsigned int nr_pages) { return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); } @@ -222,7 +222,7 @@ static int nilfs_set_page_dirty(struct page *page) int ret = __set_page_dirty_nobuffers(page); if (page_has_buffers(page)) { - unsigned nr_dirty = 0; + unsigned int nr_dirty = 0; struct buffer_head *bh, *head; /* @@ -245,7 +245,7 @@ static int nilfs_set_page_dirty(struct page *page) if (nr_dirty) nilfs_set_file_dirty(inode, nr_dirty); } else if (ret) { - unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + unsigned int nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); nilfs_set_file_dirty(inode, nr_dirty); } @@ -287,8 +287,8 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - unsigned start = pos & (PAGE_SIZE - 1); - unsigned nr_dirty; + unsigned int start = pos & (PAGE_SIZE - 1); + unsigned int nr_dirty; int err; nr_dirty = nilfs_page_count_clean_buffers(page, start, @@ -902,7 +902,7 @@ int nilfs_inode_dirty(struct inode *inode) return ret; } -int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) +int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty) { struct nilfs_inode_info *ii = NILFS_I(inode); struct the_nilfs *nilfs = inode->i_sb->s_fs_info; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 8a2f8b240f25..3417d859a03c 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -490,8 +490,8 @@ void nilfs_mdt_destroy(struct inode *inode) kfree(mdi); } -void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, - unsigned header_size) +void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size, + unsigned int header_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index ffb876e6efed..3f67f3932097 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -53,8 +53,8 @@ struct nilfs_shadow_map { struct nilfs_mdt_info { struct rw_semaphore mi_sem; struct blockgroup_lock *mi_bgl; - unsigned mi_entry_size; - unsigned mi_first_entry_offset; + unsigned int mi_entry_size; + unsigned int mi_first_entry_offset; unsigned long mi_entries_per_block; struct nilfs_palloc_cache *mi_palloc_cache; struct nilfs_shadow_map *mi_shadow; @@ -90,7 +90,7 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz); void nilfs_mdt_clear(struct inode *inode); void nilfs_mdt_destroy(struct inode *inode); -void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); +void nilfs_mdt_set_entry_size(struct inode *, unsigned int, unsigned int); int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 8f8070cffa58..1ec8ae5995a5 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -139,7 +139,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, { struct nilfs_transaction_info ti; struct super_block *sb = 
dir->i_sb; - unsigned l = strlen(symname)+1; + unsigned int l = strlen(symname) + 1; struct inode *inode; int err; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index fa179d4e5dba..ea320315d557 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -274,7 +274,7 @@ extern void nilfs_write_failed(struct address_space *mapping, loff_t to); int nilfs_permission(struct inode *inode, int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); -int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); +int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty); extern int __nilfs_mark_inode_dirty(struct inode *, int); extern void nilfs_dirty_inode(struct inode *, int flags); int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 19687139f197..d97ba5f11b77 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -435,12 +435,12 @@ void nilfs_clear_dirty_page(struct page *page, bool silent) __nilfs_clear_page_dirty(page); } -unsigned nilfs_page_count_clean_buffers(struct page *page, - unsigned from, unsigned to) +unsigned int nilfs_page_count_clean_buffers(struct page *page, + unsigned int from, unsigned int to) { - unsigned block_start, block_end; + unsigned int block_start, block_end; struct buffer_head *bh, *head; - unsigned nc = 0; + unsigned int nc = 0; for (bh = head = page_buffers(page), block_start = 0; bh != head || !block_start; diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 041f2dc5e634..f3687c958fa8 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -53,7 +53,8 @@ void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_page(struct page *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); void nilfs_mapping_init(struct address_space *mapping, struct inode *inode); -unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); +unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int, + unsigned int); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 685fa73cecd0..db156a199149 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -152,7 +152,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, sr = (struct nilfs_super_root *)bh_sr->b_data; if (check) { - unsigned bytes = le16_to_cpu(sr->sr_bytes); + unsigned int bytes = le16_to_cpu(sr->sr_bytes); if (bytes == 0 || bytes > nilfs->ns_blocksize) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; @@ -504,7 +504,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, { struct inode *inode; struct nilfs_recovery_block *rb, *n; - unsigned blocksize = nilfs->ns_blocksize; + unsigned int blocksize = nilfs->ns_blocksize; struct page *page; loff_t pos; int err = 0, err2 = 0; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 52f6a6c8bab1..bf36df10540b 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -129,7 +129,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, return 0; } -int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, +int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags, time_t ctime, __u64 cno) { int err; @@ -236,7 +236,7 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, { struct nilfs_super_root 
*raw_sr; struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info; - unsigned srsize; + unsigned int srsize; u32 crc; raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index fc3a0fe3119c..7bbccc099709 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -78,7 +78,7 @@ struct nilfs_segment_buffer { __u64 sb_nextnum; sector_t sb_fseg_start, sb_fseg_end; sector_t sb_pseg_start; - unsigned sb_rest_blocks; + unsigned int sb_rest_blocks; /* Buffers */ struct list_head sb_segsum_buffers; @@ -120,7 +120,8 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, struct nilfs_segment_buffer *prev); void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, struct the_nilfs *); -int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t, + __u64); int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, struct buffer_head **); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index a6ef1eb15edb..97dee069be83 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -393,10 +393,10 @@ static void nilfs_transaction_unlock(struct super_block *sb) static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, - unsigned bytes) + unsigned int bytes) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; - unsigned blocksize = sci->sc_super->s_blocksize; + unsigned int blocksize = sci->sc_super->s_blocksize; void *p; if (unlikely(ssp->offset + bytes > blocksize)) { @@ -418,8 +418,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; struct buffer_head *sumbh; - unsigned sumbytes; - unsigned flags = 0; + unsigned int sumbytes; + unsigned int flags = 0; int err; if (nilfs_doing_gc()) @@ -468,9 +468,9 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) */ static int nilfs_segctor_segsum_block_required( struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, - unsigned binfo_size) + unsigned int binfo_size) { - unsigned blocksize = sci->sc_super->s_blocksize; + unsigned int blocksize = sci->sc_super->s_blocksize; /* Size of finfo and binfo is enough small against blocksize */ return ssp->offset + binfo_size + @@ -529,7 +529,7 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode, - unsigned binfo_size) + unsigned int binfo_size) { struct nilfs_segment_buffer *segbuf; int required, err = 0; @@ -773,7 +773,7 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs, { struct nilfs_inode_info *ii, *n; struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; - unsigned nv = 0; + unsigned int nv = 0; while (!list_empty(head)) { spin_lock(&nilfs->ns_inode_lock); @@ -954,7 +954,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; - unsigned isz, srsz; + unsigned int isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; raw_sr = (struct nilfs_super_root *)bh_sr->b_data; diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 6cb12dbee7c1..27822e760d3f 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -71,7 +71,7 @@ struct nilfs_recovery_info { */ struct nilfs_cstage { 
int scnt; - unsigned flags; + unsigned int flags; struct nilfs_inode_info *dirty_file_ptr; struct nilfs_inode_info *gc_inode_ptr; }; @@ -80,7 +80,7 @@ struct nilfs_segment_buffer; struct nilfs_segsum_pointer { struct buffer_head *bh; - unsigned offset; /* offset in bytes */ + unsigned int offset; /* offset in bytes */ }; /** diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 3e7d85335adf..8ffa42b704d8 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -756,7 +756,7 @@ nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - unsigned sbwcount; + unsigned int sbwcount; down_read(&nilfs->ns_sem); sbwcount = nilfs->ns_sbwcount; @@ -770,7 +770,7 @@ nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - unsigned sb_update_freq; + unsigned int sb_update_freq; down_read(&nilfs->ns_sem); sb_update_freq = nilfs->ns_sb_update_freq; @@ -784,7 +784,7 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, const char *buf, size_t count) { - unsigned val; + unsigned int val; int err; err = kstrtouint(skip_spaces(buf), 0, &val); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index ba4b8189c342..b9e19ca3c96e 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -108,8 +108,8 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, struct nilfs_super_root *raw_sr; struct nilfs_super_block **sbp = nilfs->ns_sbp; struct nilfs_inode *rawi; - unsigned dat_entry_size, segment_usage_size, checkpoint_size; - unsigned inode_size; + unsigned int dat_entry_size, segment_usage_size, checkpoint_size; + unsigned int inode_size; int err; err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 62bd7b10fe43..06d2548d436d 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -114,10 +114,10 @@ struct the_nilfs { struct buffer_head *ns_sbh[2]; struct nilfs_super_block *ns_sbp[2]; time_t ns_sbwtime; - unsigned ns_sbwcount; - unsigned ns_sbsize; - unsigned ns_mount_state; - unsigned ns_sb_update_freq; + unsigned int ns_sbwcount; + unsigned int ns_sbsize; + unsigned int ns_mount_state; + unsigned int ns_sb_update_freq; /* * Following fields are dedicated to a writable FS-instance. 
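An aside on the rule these hunks enforce: in C, bare "unsigned" and "unsigned int" name exactly the same type, so the respelling changes no behavior; checkpatch.pl simply wants the width-bearing keyword spelled out at every declaration site. Sketched in the document's own diff form (the variable name here is hypothetical, not taken from the patch):

-	unsigned nr_blocks;	/* WARNING: Prefer 'unsigned int' to bare use of 'unsigned' */
+	unsigned int nr_blocks;	/* explicit spelling; same type, same layout */

Because the two spellings are identical to the compiler, hunks like these can be applied mechanically across a subsystem without re-auditing the arithmetic around each declaration.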
@@ -306,7 +306,7 @@ static inline void nilfs_get_root(struct nilfs_root *root) static inline int nilfs_valid_fs(struct the_nilfs *nilfs) { - unsigned valid_fs; + unsigned int valid_fs; down_read(&nilfs->ns_sem); valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 823d63d61081..3b584925d0e8 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -322,9 +322,9 @@ enum { ~NILFS_DIR_ROUND) #define NILFS_MAX_REC_LEN ((1<<16)-1) -static inline unsigned nilfs_rec_len_from_disk(__le16 dlen) +static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen) { - unsigned len = le16_to_cpu(dlen); + unsigned int len = le16_to_cpu(dlen); #if !defined(__KERNEL__) || (PAGE_SIZE >= 65536) if (len == NILFS_MAX_REC_LEN) @@ -333,7 +333,7 @@ static inline unsigned nilfs_rec_len_from_disk(__le16 dlen) return len; } -static inline __le16 nilfs_rec_len_to_disk(unsigned len) +static inline __le16 nilfs_rec_len_to_disk(unsigned int len) { #if !defined(__KERNEL__) || (PAGE_SIZE >= 65536) if (len == (1 << 16)) -- cgit v1.2.3 From 076a378ba6e6b6ddd5f2336aa0876349b7d36409 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 23 May 2016 16:23:48 -0700 Subject: nilfs2: fix block comments This fixes block comments with proper formatting to eliminate the following checkpatch.pl warnings: "WARNING: Block comments use * on subsequent lines" "WARNING: Block comments use a trailing */ on a separate line" Link: http://lkml.kernel.org/r/1462886671-3521-8-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/alloc.c | 12 +++++--- fs/nilfs2/bmap.h | 12 +++++--- fs/nilfs2/cpfile.c | 6 ++-- fs/nilfs2/ifile.c | 6 ++-- fs/nilfs2/inode.c | 73 ++++++++++++++++++++++++++++++----------------- fs/nilfs2/nilfs.h | 24 ++++++++++------ fs/nilfs2/recovery.c | 14 +++++---- fs/nilfs2/segment.c | 56 ++++++++++++++++++++++++------------ fs/nilfs2/segment.h | 34 ++++++++++++++-------- fs/nilfs2/the_nilfs.c | 6 ++-- include/linux/nilfs2_fs.h | 66 +++++++++++++++++++++++++++--------------- 11 files changed, 202 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 698f582d69af..1a85d94f5b25 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -68,13 +68,17 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size) mi->mi_blocks_per_group = DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), mi->mi_entries_per_block) + 1; - /* Number of blocks in a group including entry blocks and - a bitmap block */ + /* + * Number of blocks in a group including entry blocks + * and a bitmap block + */ mi->mi_blocks_per_desc_block = nilfs_palloc_groups_per_desc_block(inode) * mi->mi_blocks_per_group + 1; - /* Number of blocks per descriptor including the - descriptor block */ + /* + * Number of blocks per descriptor including the + * descriptor block + */ return 0; } diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index a6852807b22c..b6a4c8f93ac8 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -122,10 +122,14 @@ struct nilfs_bmap { /* pointer type */ #define NILFS_BMAP_PTR_P 0 /* physical block number (i.e. 
LBN) */ -#define NILFS_BMAP_PTR_VS 1 /* virtual block number (single - version) */ -#define NILFS_BMAP_PTR_VM 2 /* virtual block number (has multiple - versions) */ +#define NILFS_BMAP_PTR_VS 1 /* + * virtual block number (single + * version) + */ +#define NILFS_BMAP_PTR_VM 2 /* + * virtual block number (has multiple + * versions) + */ #define NILFS_BMAP_PTR_U (-1) /* never perform pointer operations */ #define NILFS_BMAP_USE_VBN(bmap) ((bmap)->b_ptr_type > 0) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index b61c3e0eb342..8a3d3b65af3f 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -870,8 +870,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) void *kaddr; int ret; - /* CP number is invalid if it's zero or larger than the - largest exist one.*/ + /* + * CP number is invalid if it's zero or larger than the + * largest existing one. + */ if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) return -ENOENT; down_read(&NILFS_MDT(cpfile)->mi_sem); diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index d9048f6a7780..1d2b1805327a 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -64,8 +64,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, struct nilfs_palloc_req req; int ret; - req.pr_entry_nr = 0; /* 0 says find free inode from beginning of - a group. dull code!! */ + req.pr_entry_nr = 0; /* + * 0 says find free inode from beginning + * of a group. dull code!! + */ req.pr_entry_bh = NULL; ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 83d2c485efba..a0ebdb17e912 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -129,11 +129,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, /* Error handling should be detailed */ set_buffer_new(bh_result); set_buffer_delay(bh_result); - map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed - to proper value */ + map_bh(bh_result, inode->i_sb, 0); + /* Disk block number must be changed to proper value */ + } else if (ret == -ENOENT) { - /* not found is not error (e.g. hole); must return without - the mapped state flag. */ + /* + * not found is not error (e.g. hole); must return without + * the mapped state flag. + */ ; } else { err = ret; @@ -395,23 +398,26 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) err = nilfs_init_acl(inode, dir); if (unlikely(err)) - goto failed_after_creation; /* never occur. When supporting - nilfs_init_acl(), proper cancellation of - above jobs should be considered */ + /* + * Never occur. When supporting nilfs_init_acl(), + * proper cancellation of above jobs should be considered. + */ + goto failed_after_creation; return inode; failed_after_creation: clear_nlink(inode); unlock_new_inode(inode); - iput(inode); /* raw_inode will be deleted through - nilfs_evict_inode() */ + iput(inode); /* + * raw_inode will be deleted through + * nilfs_evict_inode(). + */ goto failed; failed_ifile_create_inode: make_bad_inode(inode); - iput(inode); /* if i_nlink == 1, generic_forget_inode() will be - called */ + iput(inode); failed: return ERR_PTR(err); } @@ -662,8 +668,10 @@ void nilfs_write_inode_common(struct inode *inode, else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_device_code = cpu_to_le64(huge_encode_dev(inode->i_rdev)); - /* When extending inode, nilfs->ns_inode_size should be checked - for substitutions of appended fields */ + /* + * When extending inode, nilfs->ns_inode_size should be checked + * for substitutions of appended fields. 
+ */ } void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) @@ -681,9 +689,12 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) set_bit(NILFS_I_INODE_SYNC, &ii->i_state); nilfs_write_inode_common(inode, raw_inode, 0); - /* XXX: call with has_bmap = 0 is a workaround to avoid - deadlock of bmap. This delays update of i_bmap to just - before writing */ + /* + * XXX: call with has_bmap = 0 is a workaround to avoid + * deadlock of bmap. This delays update of i_bmap to just + * before writing. + */ + nilfs_ifile_unmap_inode(ifile, ino, ibh); } @@ -748,8 +759,10 @@ void nilfs_truncate(struct inode *inode) nilfs_mark_inode_dirty(inode); nilfs_set_file_dirty(inode, 0); nilfs_transaction_commit(sb); - /* May construct a logical segment and may fail in sync mode. - But truncate has no return value. */ + /* + * May construct a logical segment and may fail in sync mode. + * But truncate has no return value. + */ } static void nilfs_clear_inode(struct inode *inode) @@ -806,8 +819,10 @@ void nilfs_evict_inode(struct inode *inode) if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); nilfs_transaction_commit(sb); - /* May construct a logical segment and may fail in sync mode. - But delete_inode has no return value. */ + /* + * May construct a logical segment and may fail in sync mode. + * But delete_inode has no return value. + */ } int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) @@ -915,17 +930,23 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty) spin_lock(&nilfs->ns_inode_lock); if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && !test_bit(NILFS_I_BUSY, &ii->i_state)) { - /* Because this routine may race with nilfs_dispose_list(), - we have to check NILFS_I_QUEUED here, too. */ + /* + * Because this routine may race with nilfs_dispose_list(), + * we have to check NILFS_I_QUEUED here, too. + */ if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { - /* This will happen when somebody is freeing - this inode. */ + /* + * This will happen when somebody is freeing + * this inode. + */ nilfs_warning(inode->i_sb, __func__, "cannot get inode (ino=%lu)", inode->i_ino); spin_unlock(&nilfs->ns_inode_lock); - return -EINVAL; /* NILFS_I_DIRTY may remain for - freeing inode */ + return -EINVAL; /* + * NILFS_I_DIRTY may remain for + * freeing inode. + */ } list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); set_bit(NILFS_I_QUEUED, &ii->i_state); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index ea320315d557..b1d48bc0532d 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -64,8 +64,10 @@ struct nilfs_inode_info { */ struct rw_semaphore xattr_sem; #endif - struct buffer_head *i_bh; /* i_bh contains a new or dirty - disk inode */ + struct buffer_head *i_bh; /* + * i_bh contains a new or dirty + * disk inode. + */ struct nilfs_root *i_root; struct inode vfs_inode; }; @@ -95,8 +97,10 @@ enum { NILFS_I_NEW = 0, /* Inode is newly created */ NILFS_I_DIRTY, /* The file is dirty */ NILFS_I_QUEUED, /* inode is in dirty_files list */ - NILFS_I_BUSY, /* inode is grabbed by a segment - constructor */ + NILFS_I_BUSY, /* + * Inode is grabbed by a segment + * constructor + */ NILFS_I_COLLECTED, /* All dirty blocks are collected */ NILFS_I_UPDATED, /* The file has been written back */ NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */ @@ -140,8 +144,10 @@ enum { struct nilfs_transaction_info { u32 ti_magic; void *ti_save; - /* This should never used. If this happens, - one of other filesystems has a bug. 
*/ + /* + * This should never be used. If it happens, + * one of other filesystems has a bug. + */ unsigned short ti_flags; unsigned short ti_count; }; @@ -151,8 +157,10 @@ struct nilfs_transaction_info { /* ti_flags */ #define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */ -#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the - end of transaction. */ +#define NILFS_TI_SYNC 0x0002 /* + * Force to construct segment at the + * end of transaction. + */ #define NILFS_TI_GC 0x0004 /* GC context */ #define NILFS_TI_COMMIT 0x0008 /* Change happened or not */ #define NILFS_TI_WRITER 0x0010 /* Constructor context */ diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index db156a199149..d893dc912b62 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -43,8 +43,10 @@ enum { /* work structure for recovery */ struct nilfs_recovery_block { - ino_t ino; /* Inode number of the file that this block - belongs to */ + ino_t ino; /* + * Inode number of the file that this block + * belongs to + */ sector_t blocknr; /* block number */ __u64 vblocknr; /* virtual block number */ unsigned long blkoff; /* File offset of the data block (per block) */ @@ -869,9 +871,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, flags = le16_to_cpu(sum->ss_flags); if (!(flags & NILFS_SS_SR) && !scan_newer) { - /* This will never happen because a superblock - (last_segment) always points to a pseg - having a super root. */ + /* + * This will never happen because a superblock + * (last_segment) always points to a pseg with + * a super root. + */ ret = NILFS_SEG_FAIL_CONSISTENCY; goto failed; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index c9ee03c262ea..e78b68a81aec 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -45,18 +45,26 @@ */ #define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ -#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments - appended in collection retry loop */ +#define SC_MAX_SEGDELTA 64 /* + * Upper limit of the number of segments + * appended in collection retry loop + */ /* Construction mode */ enum { SC_LSEG_SR = 1, /* Make a logical segment having a super root */ - SC_LSEG_DSYNC, /* Flush data blocks of a given file and make - a logical segment without a super root */ - SC_FLUSH_FILE, /* Flush data files, leads to segment writes without - creating a checkpoint */ - SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without - a checkpoint */ + SC_LSEG_DSYNC, /* + * Flush data blocks of a given file and make + * a logical segment without a super root. + */ + SC_FLUSH_FILE, /* + * Flush data files, leads to segment writes without + * creating a checkpoint. + */ + SC_FLUSH_DAT, /* + * Flush DAT file. This also creates segments + * without a checkpoint. 
+ */ }; /* Stage numbers of dirty block collection */ @@ -438,8 +446,10 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) { sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) - return -E2BIG; /* The current segment is filled up - (internal code) */ + return -E2BIG; /* + * The current segment is filled up + * (internal code) + */ sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); return nilfs_segctor_reset_segment_buffer(sci); } @@ -869,9 +879,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, &raw_cp, &bh_cp); if (likely(!err)) { - /* The following code is duplicated with cpfile. But, it is - needed to collect the checkpoint even if it was not newly - created */ + /* + * The following code is duplicated with cpfile. But, it is + * needed to collect the checkpoint even if it was not newly + * created. + */ mark_buffer_dirty(bh_cp); nilfs_mdt_mark_dirty(nilfs->ns_cpfile); nilfs_cpfile_put_checkpoint( @@ -1400,8 +1412,10 @@ static void nilfs_free_incomplete_logs(struct list_head *logs, if (atomic_read(&segbuf->sb_err)) { /* Case 1: The first segment failed */ if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) - /* Case 1a: Partial segment appended into an existing - segment */ + /* + * Case 1a: Partial segment appended into an existing + * segment + */ nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start, segbuf->sb_fseg_end); else /* Case 1b: New full segment */ @@ -1625,8 +1639,10 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) static void nilfs_begin_page_io(struct page *page) { if (!page || PageWriteback(page)) - /* For split b-tree node pages, this function may be called - twice. We ignore the 2nd or later calls by this check. */ + /* + * For split b-tree node pages, this function may be called + * twice. We ignore the 2nd or later calls by this check. + */ return; lock_page(page); @@ -2679,8 +2695,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) { int ret, retrycount = NILFS_SC_CLEANUP_RETRY; - /* The segctord thread was stopped and its timer was removed. - But some tasks remain. */ + /* + * The segctord thread was stopped and its timer was removed. + * But some tasks remain. + */ do { struct nilfs_transaction_info ti; diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 27822e760d3f..6565c10b7b76 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -189,11 +189,15 @@ enum { NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */ NILFS_SC_UNCLOSED, /* Logical segment is not closed */ NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */ - NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a - checkpoint */ - NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files - other than DAT, cpfile, sufile, or files - moved by GC */ + NILFS_SC_PRIOR_FLUSH, /* + * Requesting immediate flush without making a + * checkpoint + */ + NILFS_SC_HAVE_DELTA, /* + * Next checkpoint will have update of files + * other than DAT, cpfile, sufile, or files + * moved by GC. + */ }; /* sc_state */ @@ -203,17 +207,23 @@ enum { /* * Constant parameters */ -#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when - destroying segctord */ +#define NILFS_SC_CLEANUP_RETRY 3 /* + * Retry count of construction when + * destroying segctord + */ /* * Default values of timeout, in seconds. 
*/ -#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks. - It triggers construction of a - logical segment with a super root */ -#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root - creation */ +#define NILFS_SC_DEFAULT_TIMEOUT 5 /* + * Timeout value of dirty blocks. + * It triggers construction of a + * logical segment with a super root. + */ +#define NILFS_SC_DEFAULT_SR_FREQ 30 /* + * Maximum frequency of super root + * creation + */ /* * The default threshold amount of data, in block counts. diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index b9e19ca3c96e..809bd2de7ad0 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -617,8 +617,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); if (err) goto out; - /* not failed_sbh; sbh is released automatically - when reloading fails. */ + /* + * Not to failed_sbh; sbh is released automatically + * when reloading fails. + */ } nilfs->ns_blocksize_bits = sb->s_blocksize_bits; nilfs->ns_blocksize = blocksize; diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 3b584925d0e8..5988dd57ba66 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -127,10 +127,14 @@ struct nilfs_super_root { #define NILFS_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ #define NILFS_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ #define NILFS_MOUNT_BARRIER 0x1000 /* Use block barriers */ -#define NILFS_MOUNT_STRICT_ORDER 0x2000 /* Apply strict in-order - semantics also for data */ -#define NILFS_MOUNT_NORECOVERY 0x4000 /* Disable write access during - mount-time recovery */ +#define NILFS_MOUNT_STRICT_ORDER 0x2000 /* + * Apply strict in-order + * semantics also for data + */ +#define NILFS_MOUNT_NORECOVERY 0x4000 /* + * Disable write access during + * mount-time recovery + */ #define NILFS_MOUNT_DISCARD 0x8000 /* Issue DISCARD requests */ @@ -142,16 +146,20 @@ struct nilfs_super_block { __le16 s_minor_rev_level; /* minor revision level */ __le16 s_magic; /* Magic signature */ - __le16 s_bytes; /* Bytes count of CRC calculation - for this structure. s_reserved - is excluded. */ + __le16 s_bytes; /* + * Bytes count of CRC calculation + * for this structure. s_reserved + * is excluded. + */ __le16 s_flags; /* flags */ __le32 s_crc_seed; /* Seed value of CRC calculation */ /*10*/ __le32 s_sum; /* Check sum of super block */ - __le32 s_log_block_size; /* Block size represented as follows - blocksize = - 1 << (s_log_block_size + 10) */ + __le32 s_log_block_size; /* + * Block size represented as follows + * blocksize = + * 1 << (s_log_block_size + 10) + */ __le64 s_nsegments; /* Number of segments in filesystem */ /*20*/ __le64 s_dev_size; /* block device size in bytes */ __le64 s_first_data_block; /* 1st seg disk block number */ @@ -163,8 +171,10 @@ struct nilfs_super_block { __le64 s_last_seq; /* seq. 
number of seg written last */ /*50*/ __le64 s_free_blocks_count; /* Free blocks count */ - __le64 s_ctime; /* Creation time (execution time of - newfs) */ + __le64 s_ctime; /* + * Creation time (execution time of + * newfs) + */ /*60*/ __le64 s_mtime; /* Mount time */ __le64 s_wtime; /* Write time */ /*70*/ __le16 s_mnt_count; /* Mount count */ @@ -188,8 +198,10 @@ struct nilfs_super_block { /*A8*/ char s_volume_name[80]; /* volume name */ /*F8*/ __le32 s_c_interval; /* Commit interval of segment */ - __le32 s_c_block_max; /* Threshold of data amount for - the segment construction */ + __le32 s_c_block_max; /* + * Threshold of data amount for + * the segment construction + */ /*100*/ __le64 s_feature_compat; /* Compatible feature set */ __le64 s_feature_compat_ro; /* Read-only compatible feature set */ __le64 s_feature_incompat; /* Incompatible feature set */ @@ -242,12 +254,18 @@ struct nilfs_super_block { #define NILFS_SB_OFFSET_BYTES 1024 /* byte offset of nilfs superblock */ -#define NILFS_SEG_MIN_BLOCKS 16 /* Minimum number of blocks in - a full segment */ -#define NILFS_PSEG_MIN_BLOCKS 2 /* Minimum number of blocks in - a partial segment */ -#define NILFS_MIN_NRSVSEGS 8 /* Minimum number of reserved - segments */ +#define NILFS_SEG_MIN_BLOCKS 16 /* + * Minimum number of blocks in + * a full segment + */ +#define NILFS_PSEG_MIN_BLOCKS 2 /* + * Minimum number of blocks in + * a partial segment + */ +#define NILFS_MIN_NRSVSEGS 8 /* + * Minimum number of reserved + * segments + */ /* * We call DAT, cpfile, and sufile root metadata files. Inodes of @@ -513,9 +531,11 @@ struct nilfs_checkpoint { __le64 cp_inodes_count; __le64 cp_blocks_count; - /* Do not change the byte offset of ifile inode. - To keep the compatibility of the disk format, - additional fields should be added behind cp_ifile_inode. */ + /* + * Do not change the byte offset of ifile inode. + * To keep the compatibility of the disk format, + * additional fields should be added behind cp_ifile_inode. + */ struct nilfs_inode cp_ifile_inode; }; -- cgit v1.2.3 From c96fc2d85f4a827e3bb2abe7de2394a1fb8a0fe7 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 23 May 2016 16:23:57 -0700 Subject: signal: make oom_flags a bool Currently the size of "struct signal_struct"->oom_flags member is sizeof(unsigned) bytes, but only one flag OOM_FLAG_ORIGIN which is updated by current thread is defined. We can convert OOM_FLAG_ORIGIN into a bool, and reuse the saved bytes for updating from the OOM killer and/or the OOM reaper thread. By the way, do we care about a race window between run_store() and swapoff() because it would be theoretically possible that two threads sharing the "struct signal_struct" concurrently call respective functions? If we care, we can make oom_flags an atomic_t. 
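For illustration, the caller pattern this flag supports looks roughly like the following sketch (do_memory_heavy_work() is a hypothetical stand-in; swapoff's try_to_unuse() is a real user):

    /*
     * A task about to pull a large amount of memory back into RAM
     * marks itself as the likely origin of a subsequent OOM so the
     * OOM killer selects it first.
     */
    set_current_oom_origin();       /* oom_flag_origin = true */
    err = do_memory_heavy_work();   /* hypothetical, e.g. try_to_unuse() */
    clear_current_oom_origin();     /* oom_flag_origin = false */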
Signed-off-by: Tetsuo Handa Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 9 +++------ include/linux/sched.h | 6 +++++- include/linux/types.h | 1 - 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index d3f533f2f481..83469522690a 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -50,24 +50,21 @@ enum oom_scan_t { OOM_SCAN_SELECT, /* always select this thread first */ }; -/* Thread is the potential origin of an oom condition; kill first on oom */ -#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1) - extern struct mutex oom_lock; static inline void set_current_oom_origin(void) { - current->signal->oom_flags |= OOM_FLAG_ORIGIN; + current->signal->oom_flag_origin = true; } static inline void clear_current_oom_origin(void) { - current->signal->oom_flags &= ~OOM_FLAG_ORIGIN; + current->signal->oom_flag_origin = false; } static inline bool oom_task_origin(const struct task_struct *p) { - return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); + return p->signal->oom_flag_origin; } extern void mark_oom_victim(struct task_struct *tsk); diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c036de6c1ee..21c26e78aec5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -794,7 +794,11 @@ struct signal_struct { struct tty_audit_buf *tty_audit_buf; #endif - oom_flags_t oom_flags; + /* + * Thread is the potential origin of an oom condition; kill first on + * oom + */ + bool oom_flag_origin; short oom_score_adj; /* OOM kill score adjustment */ short oom_score_adj_min; /* OOM kill score adjustment min value. * Only settable by CAP_SYS_RESOURCE. */ diff --git a/include/linux/types.h b/include/linux/types.h index 70dd3dfde631..baf718324f4a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -156,7 +156,6 @@ typedef u32 dma_addr_t; typedef unsigned __bitwise__ gfp_t; typedef unsigned __bitwise__ fmode_t; -typedef unsigned __bitwise__ oom_flags_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT typedef u64 phys_addr_t; -- cgit v1.2.3 From 5c8ccefdf46c5f87d87b694c7fbc04941c2c99a5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 23 May 2016 16:24:02 -0700 Subject: signal: move the "sig < SIGRTMIN" check into siginmask(sig) All the users of siginmask() must ensure that sig < SIGRTMIN. sig_fatal() doesn't and this is wrong: UBSAN: Undefined behaviour in kernel/signal.c:911:6 shift exponent 32 is too large for 32-bit type 'long unsigned int' The patch doesn't add the necessary check to sig_fatal(); it moves the check into siginmask() and updates other callers.
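To make the failure mode concrete, here is a sketch of the expansion UBSAN trips over, assuming the usual sigmask() definition and a 32-bit unsigned long:

    #define sigmask(sig)    (1UL << ((sig) - 1))

    /*
     * sig_fatal() used siginmask() without the SIGRTMIN guard, so a
     * realtime signal such as sig == 33 expanded to 1UL << 32: a shift
     * exponent of 32 is undefined behaviour for a 32-bit unsigned long.
     * With the patched macro, (33 < SIGRTMIN) is false and the shift is
     * never evaluated; the whole expression short-circuits to 0.
     */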
Link: http://lkml.kernel.org/r/20160517195052.GA15187@redhat.com Reported-by: Meelis Roos Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index 639be264f5f9..b63f63eaa39c 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -400,7 +400,9 @@ int unhandled_signal(struct task_struct *tsk, int sig); #else #define rt_sigmask(sig) sigmask(sig) #endif -#define siginmask(sig, mask) (rt_sigmask(sig) & (mask)) + +#define siginmask(sig, mask) \ + ((sig) < SIGRTMIN && (rt_sigmask(sig) & (mask))) #define SIG_KERNEL_ONLY_MASK (\ rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP)) @@ -421,14 +423,10 @@ int unhandled_signal(struct task_struct *tsk, int sig); rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) ) -#define sig_kernel_only(sig) \ - (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_ONLY_MASK)) -#define sig_kernel_coredump(sig) \ - (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK)) -#define sig_kernel_ignore(sig) \ - (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_IGNORE_MASK)) -#define sig_kernel_stop(sig) \ - (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_STOP_MASK)) +#define sig_kernel_only(sig) siginmask(sig, SIG_KERNEL_ONLY_MASK) +#define sig_kernel_coredump(sig) siginmask(sig, SIG_KERNEL_COREDUMP_MASK) +#define sig_kernel_ignore(sig) siginmask(sig, SIG_KERNEL_IGNORE_MASK) +#define sig_kernel_stop(sig) siginmask(sig, SIG_KERNEL_STOP_MASK) #define sig_user_defined(t, signr) \ (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ -- cgit v1.2.3 From 9b492cf58077a0254eb4b9574029ac6e79add9f9 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Mon, 23 May 2016 16:24:10 -0700 Subject: kexec: introduce a protection mechanism for the crashkernel reserved memory When some kernel (module) code path stamps on the crash reserved memory (already mapped by the kernel) into which the second kernel's data has been loaded, the kdump kernel will probably fail to boot when a panic happens (or even when one doesn't), leaving the culprit at large. This is unacceptable. The patch introduces a mechanism for detecting such cases: 1) After each crash kexec loading, it simply marks the reserved memory regions read-only, since we no longer access them after that. When someone stamps on the region, the first kernel will panic and trigger the kdump. The weak arch_kexec_protect_crashkres() is introduced to do the actual protection. 2) To allow multiple loads, once 1) is done we also need to remark the reserved memory read-write each time a kdump-related system call is made. The weak arch_kexec_unprotect_crashkres() is introduced to do the actual unprotection. The architecture can make its specific implementation by overriding arch_kexec_protect_crashkres() and arch_kexec_unprotect_crashkres().
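As an illustration only (not from this patch), an architecture whose crash region is covered by set_memory_ro()/set_memory_rw() helpers might override the weak hooks along these lines:

    void arch_kexec_protect_crashkres(void)
    {
            unsigned long nr_pages =
                    (crashk_res.end - crashk_res.start + 1) >> PAGE_SHIFT;

            /* write-protect the loaded crash kernel image */
            set_memory_ro((unsigned long)__va(crashk_res.start), nr_pages);
    }

    void arch_kexec_unprotect_crashkres(void)
    {
            unsigned long nr_pages =
                    (crashk_res.end - crashk_res.start + 1) >> PAGE_SHIFT;

            /* make it writable again for the next load */
            set_memory_rw((unsigned long)__va(crashk_res.start), nr_pages);
    }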
Signed-off-by: Xunlei Pang Cc: Eric Biederman Cc: Dave Young Cc: Minfei Huang Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 2 ++ kernel/kexec.c | 9 ++++++++- kernel/kexec_core.c | 6 ++++++ kernel/kexec_file.c | 8 +++++++- 4 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 2cc643c6e870..643ff4a3fbf6 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -317,6 +317,8 @@ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, unsigned int relsec); int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, unsigned int relsec); +void arch_kexec_protect_crashkres(void); +void arch_kexec_unprotect_crashkres(void); #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; diff --git a/kernel/kexec.c b/kernel/kexec.c index ee70aef5cd81..b44cb3f5a15c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -167,8 +167,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, return -EBUSY; dest_image = &kexec_image; - if (flags & KEXEC_ON_CRASH) + if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } + if (nr_segments > 0) { unsigned long i; @@ -211,6 +215,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, image = xchg(dest_image, image); out: + if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + mutex_unlock(&kexec_mutex); kimage_free(image); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d5d408252992..48b73cc8e425 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1563,3 +1563,9 @@ void __weak crash_map_reserved_pages(void) void __weak crash_unmap_reserved_pages(void) {} + +void __weak arch_kexec_protect_crashkres(void) +{} + +void __weak arch_kexec_unprotect_crashkres(void) +{} diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c72d2ff5896e..503bc2d348e5 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -274,8 +274,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, return -EBUSY; dest_image = &kexec_image; - if (flags & KEXEC_FILE_ON_CRASH) + if (flags & KEXEC_FILE_ON_CRASH) { dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } if (flags & KEXEC_FILE_UNLOAD) goto exchange; @@ -324,6 +327,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, exchange: image = xchg(dest_image, image); out: + if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + mutex_unlock(&kexec_mutex); kimage_free(image); return ret; -- cgit v1.2.3 From 7a0058ec78602da02b34fa2ae3afc523e90d1ab2 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Mon, 23 May 2016 16:24:22 -0700 Subject: s390/kexec: consolidate crash_map/unmap_reserved_pages() and arch_kexec_protect(unprotect)_crashkres() Commit 3f625002581b ("kexec: introduce a protection mechanism for the crashkernel reserved memory") added a mechanism for protecting the crash kernel reserved memory that is similar to the previous crash_map/unmap_reserved_pages() implementation; the new one is more generic in name and cleaner in code (besides, some architectures may not be allowed to unmap the pgtable).
Therefore, this patch consolidates them, and uses the new arch_kexec_protect(unprotect)_crashkres() to replace former crash_map/unmap_reserved_pages() which by now has been only used by S390. The consolidation work needs the crash memory to be mapped initially, this is done in machine_kdump_pm_init() which is after reserve_crashkernel(). Once kdump kernel is loaded, the new arch_kexec_protect_crashkres() implemented for S390 will actually unmap the pgtable like before. Signed-off-by: Xunlei Pang Signed-off-by: Michael Holzheu Acked-by: Michael Holzheu Cc: Heiko Carstens Cc: "Eric W. Biederman" Cc: Minfei Huang Cc: Vivek Goyal Cc: Dave Young Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/machine_kexec.c | 28 ++++++++++++++++++---------- include/linux/kexec.h | 2 -- kernel/kexec.c | 12 ------------ kernel/kexec_core.c | 11 ++--------- 4 files changed, 20 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 2f1b7217c25c..0e64f08d3d69 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -43,13 +43,13 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action, switch (action) { case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - if (crashk_res.start) - crash_map_reserved_pages(); + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - if (crashk_res.start) - crash_unmap_reserved_pages(); + if (kexec_crash_image) + arch_kexec_protect_crashkres(); break; default: return NOTIFY_DONE; @@ -60,6 +60,8 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action, static int __init machine_kdump_pm_init(void) { pm_notifier(machine_kdump_pm_cb, 0); + /* Create initial mapping for crashkernel memory */ + arch_kexec_unprotect_crashkres(); return 0; } arch_initcall(machine_kdump_pm_init); @@ -146,6 +148,8 @@ static int kdump_csum_valid(struct kimage *image) #endif } +#ifdef CONFIG_CRASH_DUMP + /* * Map or unmap crashkernel memory */ @@ -167,21 +171,25 @@ static void crash_map_pages(int enable) } /* - * Map crashkernel memory + * Unmap crashkernel memory */ -void crash_map_reserved_pages(void) +void arch_kexec_protect_crashkres(void) { - crash_map_pages(1); + if (crashk_res.end) + crash_map_pages(0); } /* - * Unmap crashkernel memory + * Map crashkernel memory */ -void crash_unmap_reserved_pages(void) +void arch_kexec_unprotect_crashkres(void) { - crash_map_pages(0); + if (crashk_res.end) + crash_map_pages(1); } +#endif + /* * Give back memory to hypervisor before new kdump is loaded */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 643ff4a3fbf6..e8acb2b43dd9 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -230,8 +230,6 @@ extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); void crash_save_cpu(struct pt_regs *regs, int cpu); void crash_save_vmcoreinfo(void); -void crash_map_reserved_pages(void); -void crash_unmap_reserved_pages(void); void arch_crash_save_vmcoreinfo(void); __printf(1, 2) void vmcoreinfo_append_str(const char *fmt, ...); diff --git a/kernel/kexec.c b/kernel/kexec.c index b73dc211fcfd..4384672d3245 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -136,9 +136,6 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (ret) return ret; - if (flags & KEXEC_ON_CRASH) - crash_map_reserved_pages(); - if (flags & 
KEXEC_PRESERVE_CONTEXT) image->preserve_context = 1; @@ -161,12 +158,6 @@ out: if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); - /* - * Once the reserved memory is mapped, we should unmap this memory - * before returning - */ - if (flags & KEXEC_ON_CRASH) - crash_unmap_reserved_pages(); kimage_free(image); return ret; } @@ -232,9 +223,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, result = do_kexec_load(entry, nr_segments, segments, flags); - if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) - arch_kexec_protect_crashkres(); - mutex_unlock(&kexec_mutex); return result; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 48b73cc8e425..56b3ed0927b0 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -954,7 +954,6 @@ int crash_shrink_memory(unsigned long new_size) start = roundup(start, KEXEC_CRASH_MEM_ALIGN); end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - crash_map_reserved_pages(); crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) @@ -968,7 +967,6 @@ int crash_shrink_memory(unsigned long new_size) crashk_res.end = end - 1; insert_resource(&iomem_resource, ram_res); - crash_unmap_reserved_pages(); unlock: mutex_unlock(&kexec_mutex); @@ -1553,17 +1551,12 @@ int kernel_kexec(void) } /* - * Add and remove page tables for crashkernel memory + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. * * Provide an empty default implementation here -- architecture * code may override this */ -void __weak crash_map_reserved_pages(void) -{} - -void __weak crash_unmap_reserved_pages(void) -{} - void __weak arch_kexec_protect_crashkres(void) {} -- cgit v1.2.3 From 9fbeb5ab59a2b2a09cca2eb68283e7a090d4b98d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:25:30 -0700 Subject: mm: make vm_mmap killable All the callers of vm_mmap seem to check for the failure already and bail out in one way or another on the error which means that we can change it to use killable version of vm_mmap_pgoff and return -EINTR if the current task gets killed while waiting for mmap_sem. This also means that vm_mmap_pgoff can be killable by default and drop the additional parameter. This will help in the OOM conditions when the oom victim might be stuck waiting for the mmap_sem for write which in turn can block oom_reaper which relies on the mmap_sem for read to make a forward progress and reclaim the address space of the victim. Please note that load_elf_binary is ignoring vm_mmap error for current->personality & MMAP_PAGE_ZERO case but that shouldn't be a problem because the address is not used anywhere and we never return to the userspace if we got killed. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: Oleg Nesterov Cc: Andrea Arcangeli Cc: Al Viro Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/internal.h | 3 +-- mm/mmap.c | 2 +- mm/nommu.c | 2 +- mm/util.c | 13 ++++--------- 5 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b530c99e8e81..d5eb8dddd7c0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2013,7 +2013,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} /* These take the mm semaphore themselves */ extern unsigned long vm_brk(unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); -extern unsigned long vm_mmap(struct file *, unsigned long, +extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/mm/internal.h b/mm/internal.h index bff7fd702331..a37e5b6f9d25 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -444,8 +444,7 @@ extern u32 hwpoison_filter_enable; extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, - bool); + unsigned long, unsigned long); extern void set_pageblock_order(void); unsigned long reclaim_clean_pages_from_list(struct zone *zone, diff --git a/mm/mmap.c b/mm/mmap.c index 11e1f2ca72af..420088682d4a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1333,7 +1333,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: if (file) fput(file); diff --git a/mm/nommu.c b/mm/nommu.c index b74512746aae..c8bd59a03c71 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1446,7 +1446,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); diff --git a/mm/util.c b/mm/util.c index 03b237746850..917e0e3d0f8e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flag, unsigned long pgoff, bool killable) + unsigned long flag, unsigned long pgoff) { unsigned long ret; struct mm_struct *mm = current->mm; @@ -297,12 +297,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, ret = security_mmap_file(file, prot, flag); if (!ret) { - if (killable) { - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; - } else { - down_write(&mm->mmap_sem); - } + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, &populate); up_write(&mm->mmap_sem); @@ -312,7 +308,6 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, return ret; } -/* XXX are all callers checking an error */ unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) @@ -322,7 +317,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, if (unlikely(offset_in_page(offset))) return -EINVAL; - return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT, false); + 
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); } EXPORT_SYMBOL(vm_mmap); -- cgit v1.2.3 From 2d6c928241add2848e4eebfce407e95164229976 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:25:42 -0700 Subject: mm: make vm_brk killable Now that all the callers handle vm_brk failure we can change it to wait for mmap_sem killable, to help oom_reaper to not get blocked just because vm_brk gets blocked behind mmap_sem readers. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Oleg Nesterov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/mmap.c | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d5eb8dddd7c0..2835d598d258 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2011,7 +2011,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* These take the mm semaphore themselves */ -extern unsigned long vm_brk(unsigned long, unsigned long); +extern unsigned long __must_check vm_brk(unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, diff --git a/mm/mmap.c b/mm/mmap.c index ca292a7c2b68..d3d9a94ca031 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2712,12 +2712,9 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) unsigned long ret; bool populate; - /* - * XXX not all users are chcecking the return value, convert - * to down_write_killable after they are able to cope with - * error - */ - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + ret = do_brk(addr, len); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); -- cgit v1.2.3 From b7e7ade34e6188bee2e3b0d42b51d25137d9e2a5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 May 2016 11:19:07 +0200 Subject: sched/core: Fix remote wakeups Commit: b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration") ... introduced a bug: Mike Galbraith found that it introduced a performance regression, while Paul E. McKenney reported lost wakeups and bisected it to this commit. The reason is that I mis-read ttwu_queue() such that I assumed any wakeup that got a remote queue must have had the task migrated. Since this is not so, we need to transfer this information between queueing the wakeup and actually doing the wakeup. Use a new task_struct::sched_flag for this; we already write to sched_contributes_to_load in the wakeup path, so this is a hot and modified cacheline. Reported-by: Paul E. McKenney Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Ben Segall Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Matt Fleming Cc: Morten Rasmussen Cc: Oleg Nesterov Cc: Paul Turner Cc: Pavan Kondeti Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: byungchul.park@lge.com Fixes: b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration") Link: http://lkml.kernel.org/r/20160523091907.GD15728@worktop.ger.corp.intel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6cc0df970f1a..e053517a88b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1533,6 +1533,7 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; + unsigned sched_remote_wakeup:1; unsigned :0; /* force alignment to the next boundary */ /* unserialized, strictly 'current' */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 404c0784b1fc..7f2cae4620c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1768,13 +1768,15 @@ void sched_ttwu_pending(void) cookie = lockdep_pin_lock(&rq->lock); while (llist) { + int wake_flags = 0; + p = llist_entry(llist, struct task_struct, wake_entry); llist = llist_next(llist); - /* - * See ttwu_queue(); we only call ttwu_queue_remote() when - * its a x-cpu wakeup. - */ - ttwu_do_activate(rq, p, WF_MIGRATED, cookie); + + if (p->sched_remote_wakeup) + wake_flags = WF_MIGRATED; + + ttwu_do_activate(rq, p, wake_flags, cookie); } lockdep_unpin_lock(&rq->lock, cookie); @@ -1819,10 +1821,12 @@ void scheduler_ipi(void) irq_exit(); } -static void ttwu_queue_remote(struct task_struct *p, int cpu) +static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { if (!set_nr_if_polling(rq->idle)) smp_send_reschedule(cpu); @@ -1869,7 +1873,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ - ttwu_queue_remote(p, cpu); + ttwu_queue_remote(p, cpu, wake_flags); return; } #endif -- cgit v1.2.3 From 536a6f88c49dd739961ffd53774775afed852c83 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Wed, 18 May 2016 13:26:23 +0200 Subject: KVM: Create debugfs dir and stat files for each VM This patch adds a kvm debugfs subdirectory for each VM, which is named after its pid and file descriptor. The directories contain the same kind of files that are already in the kvm debugfs directory, but the data exported through them is now VM specific. This makes the debugfs kvm data a convenient alternative to the tracepoints which already have per VM data. The debugfs data is easy to read and low overhead. 
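For reference, the per-VM files reuse the offsets recorded in the existing debugfs_entries tables, which each architecture builds with offsetof(); schematically (the entry names are examples, not an exhaustive table):

    struct kvm_stats_debugfs_item debugfs_entries[] = {
            /* KVM_STAT_VM: offset counts from a struct kvm pointer */
            { "remote_tlb_flush",
              offsetof(struct kvm, stat.remote_tlb_flush), KVM_STAT_VM },
            /* KVM_STAT_VCPU: offset counts from a struct kvm_vcpu pointer */
            { "halt_exits",
              offsetof(struct kvm_vcpu, stat.halt_exits), KVM_STAT_VCPU },
            { NULL }
    };

    /* so a per-VM read reduces to pointer-plus-saved-offset: */
    *val = *(u32 *)((void *)stat_data->kvm + stat_data->offset);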
CC: Dan Carpenter [includes fixes by Dan Carpenter] Signed-off-by: Janosch Frank Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 7 ++ virt/kvm/kvm_main.c | 187 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 184 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b1fa8f11c95b..1c9c973a7dd9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -412,6 +412,8 @@ struct kvm { #endif long tlbs_dirty; struct list_head devices; + struct dentry *debugfs_dentry; + struct kvm_stat_data **debugfs_stat_data; }; #define kvm_err(fmt, ...) \ @@ -991,6 +993,11 @@ enum kvm_stat_kind { KVM_STAT_VCPU, }; +struct kvm_stat_data { + int offset; + struct kvm *kvm; +}; + struct kvm_stats_debugfs_item { const char *name; int offset; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dd4ac9d9e8f5..37af23052470 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -63,6 +63,9 @@ #define CREATE_TRACE_POINTS #include +/* Worst case buffer size needed for holding an integer. */ +#define ITOA_MAX_LEN 12 + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -100,6 +103,9 @@ static __read_mostly struct preempt_ops kvm_preempt_ops; struct dentry *kvm_debugfs_dir; EXPORT_SYMBOL_GPL(kvm_debugfs_dir); +static int kvm_debugfs_num_entries; +static const struct file_operations *stat_fops_per_vm[]; + static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #ifdef CONFIG_KVM_COMPAT @@ -542,6 +548,58 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) kvfree(slots); } +static void kvm_destroy_vm_debugfs(struct kvm *kvm) +{ + int i; + + if (!kvm->debugfs_dentry) + return; + + debugfs_remove_recursive(kvm->debugfs_dentry); + + for (i = 0; i < kvm_debugfs_num_entries; i++) + kfree(kvm->debugfs_stat_data[i]); + kfree(kvm->debugfs_stat_data); +} + +static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) +{ + char dir_name[ITOA_MAX_LEN * 2]; + struct kvm_stat_data *stat_data; + struct kvm_stats_debugfs_item *p; + + if (!debugfs_initialized()) + return 0; + + snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); + kvm->debugfs_dentry = debugfs_create_dir(dir_name, + kvm_debugfs_dir); + if (!kvm->debugfs_dentry) + return -ENOMEM; + + kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, + sizeof(*kvm->debugfs_stat_data), + GFP_KERNEL); + if (!kvm->debugfs_stat_data) + return -ENOMEM; + + for (p = debugfs_entries; p->name; p++) { + stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL); + if (!stat_data) + return -ENOMEM; + + stat_data->kvm = kvm; + stat_data->offset = p->offset; + kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; + if (!debugfs_create_file(p->name, 0444, + kvm->debugfs_dentry, + stat_data, + stat_fops_per_vm[p->kind])) + return -ENOMEM; + } + return 0; +} + static struct kvm *kvm_create_vm(unsigned long type) { int r, i; @@ -647,6 +705,7 @@ static void kvm_destroy_vm(struct kvm *kvm) int i; struct mm_struct *mm = kvm->mm; + kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); spin_lock(&kvm_lock); list_del(&kvm->vm_list); @@ -2999,8 +3058,15 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) } #endif r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC); - if (r < 0) + if (r < 0) { kvm_put_kvm(kvm); + return r; + } + + if (kvm_create_vm_debugfs(kvm, r) < 0) { + kvm_put_kvm(kvm); + return -ENOMEM; + } return r; } @@ -3425,15 +3491,114 @@ static struct notifier_block 
kvm_cpu_notifier = { .notifier_call = kvm_cpu_hotplug, }; +static int kvm_debugfs_open(struct inode *inode, struct file *file, + int (*get)(void *, u64 *), int (*set)(void *, u64), + const char *fmt) +{ + struct kvm_stat_data *stat_data = (struct kvm_stat_data *) + inode->i_private; + + /* The debugfs files are a reference to the kvm struct which + * is still valid when kvm_destroy_vm is called. + * To avoid the race between open and the removal of the debugfs + * directory we test against the users count. + */ + if (!atomic_add_unless(&stat_data->kvm->users_count, 1, 0)) + return -ENOENT; + + if (simple_attr_open(inode, file, get, set, fmt)) { + kvm_put_kvm(stat_data->kvm); + return -ENOMEM; + } + + return 0; +} + +static int kvm_debugfs_release(struct inode *inode, struct file *file) +{ + struct kvm_stat_data *stat_data = (struct kvm_stat_data *) + inode->i_private; + + simple_attr_release(inode, file); + kvm_put_kvm(stat_data->kvm); + + return 0; +} + +static int vm_stat_get_per_vm(void *data, u64 *val) +{ + struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; + + *val = *(u32 *)((void *)stat_data->kvm + stat_data->offset); + + return 0; +} + +static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file) +{ + __simple_attr_check_format("%llu\n", 0ull); + return kvm_debugfs_open(inode, file, vm_stat_get_per_vm, + NULL, "%llu\n"); +} + +static const struct file_operations vm_stat_get_per_vm_fops = { + .owner = THIS_MODULE, + .open = vm_stat_get_per_vm_open, + .release = kvm_debugfs_release, + .read = simple_attr_read, + .write = simple_attr_write, + .llseek = generic_file_llseek, +}; + +static int vcpu_stat_get_per_vm(void *data, u64 *val) +{ + int i; + struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; + struct kvm_vcpu *vcpu; + + *val = 0; + + kvm_for_each_vcpu(i, vcpu, stat_data->kvm) + *val += *(u32 *)((void *)vcpu + stat_data->offset); + + return 0; +} + +static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file) +{ + __simple_attr_check_format("%llu\n", 0ull); + return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm, + NULL, "%llu\n"); +} + +static const struct file_operations vcpu_stat_get_per_vm_fops = { + .owner = THIS_MODULE, + .open = vcpu_stat_get_per_vm_open, + .release = kvm_debugfs_release, + .read = simple_attr_read, + .write = simple_attr_write, + .llseek = generic_file_llseek, +}; + +static const struct file_operations *stat_fops_per_vm[] = { + [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops, + [KVM_STAT_VM] = &vm_stat_get_per_vm_fops, +}; + static int vm_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; + struct kvm_stat_data stat_tmp = {.offset = offset}; + u64 tmp_val; *val = 0; spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) - *val += *(u32 *)((void *)kvm + offset); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + *val += tmp_val; + } spin_unlock(&kvm_lock); return 0; } @@ -3444,15 +3609,16 @@ static int vcpu_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; + struct kvm_stat_data stat_tmp = {.offset = offset}; + u64 tmp_val; *val = 0; spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) - kvm_for_each_vcpu(i, vcpu, kvm) - *val += *(u32 *)((void *)vcpu + offset); - + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + *val += 
tmp_val; + } spin_unlock(&kvm_lock); return 0; } @@ -3473,7 +3639,8 @@ static int kvm_init_debug(void) if (kvm_debugfs_dir == NULL) goto out; - for (p = debugfs_entries; p->name; ++p) { + kvm_debugfs_num_entries = 0; + for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { if (!debugfs_create_file(p->name, 0444, kvm_debugfs_dir, (void *)(long)p->offset, stat_fops[p->kind])) -- cgit v1.2.3 From 13d1ad16d05eebb4db977eb955716b9da2c19fbd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 27 Apr 2016 14:15:51 +0200 Subject: libceph: move message allocation out of ceph_osdc_alloc_request() The size of ->r_request and ->r_reply messages depends on the size of the object name (ceph_object_id), while the size of ceph_osd_request is fixed. Move message allocation into a separate function that would have to be called after ceph_object_id and ceph_object_locator (which is also going to become variable in size with RADOS namespaces) have been filled in: req = ceph_osdc_alloc_request(...); <fill in req->r_base_oid> <fill in req->r_base_oloc> ceph_osdc_alloc_messages(req); Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 18 ++++++++- fs/ceph/addr.c | 8 ++++ fs/ceph/file.c | 7 ++++ include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 88 +++++++++++++++++++++++------------------ 5 files changed, 82 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c3089f32a392..bda4deade82e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1954,7 +1954,7 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); if (!osd_req) - return NULL; /* ENOMEM */ + goto fail; if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; @@ -1967,7 +1967,14 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) + goto fail; + return osd_req; + +fail: + ceph_osdc_put_request(osd_req); + return NULL; } /* @@ -2003,7 +2010,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, false, GFP_NOIO); if (!osd_req) - return NULL; /* ENOMEM */ + goto fail; osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; osd_req->r_callback = rbd_osd_req_callback; @@ -2012,7 +2019,14 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) + goto fail; + return osd_req; + +fail: + ceph_osdc_put_request(osd_req); + return NULL; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3e61fc8bb371..6fee7e0b8931 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1762,6 +1762,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) "%llx.00000000", ci->i_vino.ino); rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); + if (err) + goto out_unlock; + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); if (!wr_req) { @@ -1775,6 +1779,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) wr_req->r_base_oloc.pool = pool; wr_req->r_base_oid = rd_req->r_base_oid; +
err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); + if (err) + goto out_unlock; + /* one page should be large enough for STAT data */ pages = ceph_alloc_page_vector(1, GFP_KERNEL); if (IS_ERR(pages)) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a79f9269831e..5d46d106bbb7 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -717,6 +717,13 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_base_oloc = orig_req->r_base_oloc; req->r_base_oid = orig_req->r_base_oid; + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (ret) { + ceph_osdc_put_request(req); + req = orig_req; + goto out; + } + req->r_ops[0] = orig_req->r_ops[0]; osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cbf460927c42..66a1fcd5bff7 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -322,6 +322,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * unsigned int num_ops, bool use_mempool, gfp_t gfp_flags); +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, struct ceph_snap_context *snapc, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccb9539dc780..d66dacc9d0d4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -369,8 +369,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, gfp_t gfp_flags) { struct ceph_osd_request *req; - struct ceph_msg *msg; - size_t msg_size; if (use_mempool) { BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); @@ -407,53 +405,59 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req->r_base_oloc.pool = -1; req->r_target_oloc.pool = -1; - msg_size = OSD_OPREPLY_FRONT_LEN; - if (num_ops > CEPH_OSD_SLAB_OPS) { - /* ceph_osd_op and rval */ - msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * - (sizeof(struct ceph_osd_op) + 4); - } + dout("%s req %p\n", __func__, req); + return req; +} +EXPORT_SYMBOL(ceph_osdc_alloc_request); - /* create reply message */ - if (use_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); - else - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, - gfp_flags, true); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } - req->r_reply = msg; +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_msg *msg; + int msg_size; + /* create request message */ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pgid */ - msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ - msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); + msg_size += 4 + req->r_base_oid.name_len; /* oid */ + msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); msg_size += 8; /* snapid */ msg_size += 8; /* snap_seq */ - msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ + msg_size += 4 + 8 * (req->r_snapc ? 
req->r_snapc->num_snaps : 0); msg_size += 4; /* retry_attempt */ - /* create request message; allow space for oid */ - if (use_mempool) + if (req->r_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true); + if (!msg) + return -ENOMEM; memset(msg->front.iov_base, 0, msg->front.iov_len); - req->r_request = msg; - return req; + /* create reply message */ + msg_size = OSD_OPREPLY_FRONT_LEN; + if (req->r_num_ops > CEPH_OSD_SLAB_OPS) { + /* ceph_osd_op and rval */ + msg_size += (req->r_num_ops - CEPH_OSD_SLAB_OPS) * + (sizeof(struct ceph_osd_op) + 4); + } + + if (req->r_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true); + if (!msg) + return -ENOMEM; + + req->r_reply = msg; + + return 0; } -EXPORT_SYMBOL(ceph_osdc_alloc_request); +EXPORT_SYMBOL(ceph_osdc_alloc_messages); static bool osd_req_opcode_valid(u16 opcode) { @@ -828,17 +832,17 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); - if (!req) - return ERR_PTR(-ENOMEM); + if (!req) { + r = -ENOMEM; + goto fail; + } req->r_flags = flags; /* calculate max write size */ r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); - if (r < 0) { - ceph_osdc_put_request(req); - return ERR_PTR(r); - } + if (r) + goto fail; if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { osd_req_op_init(req, which, opcode, 0); @@ -864,7 +868,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, "%llx.%08llx", vino.ino, objnum); req->r_base_oid.name_len = strlen(req->r_base_oid.name); + r = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (r) + goto fail; + return req; + +fail: + ceph_osdc_put_request(req); + return ERR_PTR(r); } EXPORT_SYMBOL(ceph_osdc_new_request); -- cgit v1.2.3 From d30291b985d1854565d7f2c82a4457869d5265e8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 29 Apr 2016 19:54:20 +0200 Subject: libceph: variable-sized ceph_object_id Currently ceph_object_id can hold object names of up to 100 (CEPH_MAX_OID_NAME_LEN) characters. This is enough for all use cases, except one - long rbd image names: - a format 1 header is named "<imgname>.rbd" - an object that points to a format 2 header is named "rbd_id.<imgname>" We operate on these potentially long-named objects during rbd map, and, for format 1 images, during header refresh. (A format 2 header name is a small system-generated string.) Lift this 100 character limit by making ceph_object_id able to point to an externally-allocated string. Apart from being able to work with almost arbitrarily-long named objects, this allows us to reduce the size of ceph_object_id from >100 bytes to 64 bytes.
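A minimal usage sketch of the helpers introduced below (image_name is hypothetical, error handling trimmed):

    struct ceph_object_id oid;
    int ret;

    ceph_oid_init(&oid);            /* name points at inline_name */
    ret = ceph_oid_aprintf(&oid, GFP_NOIO, "rbd_id.%s", image_name);
    if (ret)
            return ret;             /* didn't fit inline and kmalloc failed */
    /* ... use oid ... */
    ceph_oid_destroy(&oid);         /* kfree()s the name only if external */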
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 8 +++- fs/ceph/addr.c | 6 +-- fs/ceph/file.c | 2 +- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 62 ++++++++++++++++++------------ net/ceph/debugfs.c | 2 +- net/ceph/osd_client.c | 16 +++++--- net/ceph/osdmap.c | 93 ++++++++++++++++++++++++++++++++++++++++++++- 8 files changed, 150 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bda4deade82e..3bf93a2a20f0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1965,7 +1965,9 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_priv = obj_request; osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", + obj_request->object_name)) + goto fail; if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) goto fail; @@ -2017,7 +2019,9 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_priv = obj_request; osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", + obj_request->object_name)) + goto fail; if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) goto fail; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6fee7e0b8931..6f28dd9bacb2 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1758,9 +1758,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) rd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); rd_req->r_base_oloc.pool = pool; - snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), - "%llx.00000000", ci->i_vino.ino); - rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); if (err) @@ -1777,7 +1775,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); wr_req->r_base_oloc.pool = pool; - wr_req->r_base_oid = rd_req->r_base_oid; + ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); if (err) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5d46d106bbb7..9d470397e249 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -715,7 +715,7 @@ static void ceph_aio_retry_work(struct work_struct *work) CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; req->r_base_oloc = orig_req->r_base_oloc; - req->r_base_oid = orig_req->r_base_oid; + ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); ret = ceph_osdc_alloc_messages(req, GFP_NOFS); if (ret) { diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index f851d8d70158..db296709784a 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -213,7 +213,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ceph_ino(inode), dl.object_no); oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); - ceph_oid_set_name(&oid, dl.object_name); + ceph_oid_printf(&oid, "%s", dl.object_name); r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); if (r < 0) { diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index e55c08bc3a96..777a29412706 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -64,11 +64,47 @@ struct 
ceph_object_locator {
 */
 #define CEPH_MAX_OID_NAME_LEN 100

+/*
+ * 51-char inline_name is long enough for all cephfs and all but one
+ * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
+ * arbitrarily long (~PAGE_SIZE).  It's done once during rbd map; all
+ * other rbd requests fit into inline_name.
+ *
+ * Makes ceph_object_id 64 bytes on 64-bit.
+ */
+#define CEPH_OID_INLINE_LEN 52
+
+/*
+ * Both inline and external buffers have space for a NUL-terminator,
+ * which is carried around.  It's not required though - RADOS object
+ * names don't have to be NUL-terminated and may contain NULs.
+ */
 struct ceph_object_id {
- char name[CEPH_MAX_OID_NAME_LEN];
+ char *name;
+ char inline_name[CEPH_OID_INLINE_LEN];
 int name_len;
 };

+static inline void ceph_oid_init(struct ceph_object_id *oid)
+{
+ oid->name = oid->inline_name;
+ oid->name_len = 0;
+}
+
+static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
+{
+ return oid->name == oid->inline_name && !oid->name_len;
+}
+
+void ceph_oid_copy(struct ceph_object_id *dest,
+ const struct ceph_object_id *src);
+__printf(2, 3)
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
+__printf(3, 4)
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, ...);
+void ceph_oid_destroy(struct ceph_object_id *oid);
+
 struct ceph_pg_mapping {
 struct rb_node node;
 struct ceph_pg pgid;
@@ -113,30 +149,6 @@ struct ceph_osdmap {
 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
 };

-static inline void ceph_oid_set_name(struct ceph_object_id *oid,
- const char *name)
-{
- int len;
-
- len = strlen(name);
- if (len > sizeof(oid->name)) {
- WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
- name, len, sizeof(oid->name));
- len = sizeof(oid->name);
- }
-
- memcpy(oid->name, name, len);
- oid->name_len = len;
-}
-
-static inline void ceph_oid_copy(struct ceph_object_id *dest,
- struct ceph_object_id *src)
-{
- BUG_ON(src->name_len > sizeof(dest->name));
- memcpy(dest->name, src->name, src->name_len);
- dest->name_len = src->name_len;
-}
-
 static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
 {
 return osd >= 0 && osd < map->max_osd &&
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index b902fbc7863e..6f8413293d15 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -161,7 +161,7 @@ static int osdc_show(struct seq_file *s, void *pp)
 req->r_osd ? 
req->r_osd->o_osd : -1, req->r_pgid.pool, req->r_pgid.seed); - seq_printf(s, "%.*s", req->r_base_oid.name_len, + seq_printf(s, "%*pE", req->r_base_oid.name_len, req->r_base_oid.name); if (req->r_reassert_version.epoch) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 75e27bd3d372..95910aed8e2e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -334,7 +334,10 @@ static void ceph_osdc_release_request(struct kref *kref) for (which = 0; which < req->r_num_ops; which++) osd_req_op_data_release(req, which); + ceph_oid_destroy(&req->r_base_oid); + ceph_oid_destroy(&req->r_target_oid); ceph_put_snap_context(req->r_snapc); + if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) @@ -401,7 +404,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); + ceph_oid_init(&req->r_base_oid); req->r_base_oloc.pool = -1; + ceph_oid_init(&req->r_target_oid); req->r_target_oloc.pool = -1; dout("%s req %p\n", __func__, req); @@ -415,6 +420,8 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) struct ceph_msg *msg; int msg_size; + WARN_ON(ceph_oid_empty(&req->r_base_oid)); + /* create request message */ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ @@ -859,10 +866,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); - - snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), - "%llx.%08llx", vino.ino, objnum); - req->r_base_oid.name_len = strlen(req->r_base_oid.name); + ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); r = ceph_osdc_alloc_messages(req, GFP_NOFS); if (r) @@ -1410,7 +1414,7 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, req->r_target_oloc = req->r_base_oloc; /* struct */ need_check_tiering = true; } - if (req->r_target_oid.name_len == 0) { + if (ceph_oid_empty(&req->r_target_oid)) { ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); need_check_tiering = true; } @@ -2501,7 +2505,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, /* oid */ ceph_encode_32(&p, req->r_base_oid.name_len); memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); - dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, + dout("oid %*pE len %d\n", req->r_base_oid.name_len, req->r_base_oid.name, req->r_base_oid.name_len); p += req->r_base_oid.name_len; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 243574c8cf33..4668b871ca47 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1381,8 +1381,99 @@ bad: return ERR_PTR(err); } +void ceph_oid_copy(struct ceph_object_id *dest, + const struct ceph_object_id *src) +{ + WARN_ON(!ceph_oid_empty(dest)); + + if (src->name != src->inline_name) { + /* very rare, see ceph_object_id definition */ + dest->name = kmalloc(src->name_len + 1, + GFP_NOIO | __GFP_NOFAIL); + } + memcpy(dest->name, src->name, src->name_len + 1); + dest->name_len = src->name_len; +} +EXPORT_SYMBOL(ceph_oid_copy); +static __printf(2, 0) +int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap) +{ + int len; + + WARN_ON(!ceph_oid_empty(oid)); + + len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap); + if (len >= sizeof(oid->inline_name)) + return len; + + oid->name_len = len; + return 0; +} + +/* + * If oid doesn't fit into inline buffer, BUG. 
+ */ +void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + BUG_ON(oid_printf_vargs(oid, fmt, ap)); + va_end(ap); +} +EXPORT_SYMBOL(ceph_oid_printf); + +static __printf(3, 0) +int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, va_list ap) +{ + va_list aq; + int len; + + va_copy(aq, ap); + len = oid_printf_vargs(oid, fmt, aq); + va_end(aq); + + if (len) { + char *external_name; + + external_name = kmalloc(len + 1, gfp); + if (!external_name) + return -ENOMEM; + + oid->name = external_name; + WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len); + oid->name_len = len; + } + + return 0; +} + +/* + * If oid doesn't fit into inline buffer, allocate. + */ +int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = oid_aprintf_vargs(oid, gfp, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL(ceph_oid_aprintf); + +void ceph_oid_destroy(struct ceph_object_id *oid) +{ + if (oid->name != oid->inline_name) + kfree(oid->name); +} +EXPORT_SYMBOL(ceph_oid_destroy); /* * calculate file layout from given offset, length. @@ -1474,7 +1565,7 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, oid->name_len); - dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, + dout("%s %*pE pgid %llu.%x\n", __func__, oid->name_len, oid->name, pg_out->pool, pg_out->seed); return 0; } -- cgit v1.2.3 From 0c0a8de13f9612a663b050afa955e6668858d1eb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:21 +0200 Subject: libceph: nuke unused fields and functions Either unused or useless: osdmap->mkfs_epoch osd->o_marked_for_keepalive monc->num_generic_requests osdc->map_waiters osdc->last_requested_map osdc->timeout_tid osd_req_op_cls_response_data() osdmap_apply_incremental() @msgr arg Signed-off-by: Ilya Dryomov --- include/linux/ceph/mon_client.h | 1 - include/linux/ceph/osd_client.h | 8 -------- include/linux/ceph/osdmap.h | 6 ++---- net/ceph/mon_client.c | 3 --- net/ceph/osd_client.c | 13 +------------ net/ceph/osdmap.c | 3 +-- 6 files changed, 4 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index e230e7ed60d3..330d045e4092 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -77,7 +77,6 @@ struct ceph_mon_client { /* pending generic requests */ struct rb_root generic_request_tree; - int num_generic_requests; u64 last_tid; /* subs, indexed with CEPH_SUB_* */ diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 66a1fcd5bff7..63854a8df183 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -37,11 +37,9 @@ struct ceph_osd { struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; - int o_marked_for_keepalive; struct list_head o_keepalive_item; }; - #define CEPH_OSD_SLAB_OPS 2 #define CEPH_OSD_MAX_OPS 16 @@ -206,13 +204,10 @@ struct ceph_osd_client { struct ceph_osdmap *osdmap; /* current map */ struct rw_semaphore map_sem; - struct completion map_waiters; - u64 last_requested_map; struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ - u64 timeout_tid; /* tid of timeout triggering rq */ u64 last_tid; /* tid of last request */ struct rb_root requests; /* pending requests */ struct list_head 
req_lru; /* in-flight lru */ @@ -271,9 +266,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, unsigned int which); -extern struct ceph_osd_data *osd_req_op_cls_response_data( - struct ceph_osd_request *osd_req, - unsigned int which); extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, unsigned int which, diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 777a29412706..ce7a41a182d4 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -123,7 +123,6 @@ struct ceph_pg_mapping { struct ceph_osdmap { struct ceph_fsid fsid; u32 epoch; - u32 mkfs_epoch; struct ceph_timespec created, modified; u32 flags; /* CEPH_OSDMAP_* */ @@ -205,9 +204,8 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) } extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); -extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr); +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, + struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map); /* calculate mapping of a file extent to an object */ diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index cf638c009cfa..3dfafdad92aa 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -579,7 +579,6 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, req->tid = tid != 0 ? tid : ++monc->last_tid; req->request->hdr.tid = cpu_to_le64(req->tid); __insert_generic_request(monc, req); - monc->num_generic_requests++; ceph_con_send(&monc->con, ceph_msg_get(req->request)); mutex_unlock(&monc->mutex); @@ -587,7 +586,6 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, mutex_lock(&monc->mutex); rb_erase(&req->node, &monc->generic_request_tree); - monc->num_generic_requests--; if (!err) err = req->result; @@ -914,7 +912,6 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); monc->generic_request_tree = RB_ROOT; - monc->num_generic_requests = 0; monc->last_tid = 0; return 0; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 95910aed8e2e..32ba09be6ee6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -143,14 +143,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_osd_data); -struct ceph_osd_data * -osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, - unsigned int which) -{ - return osd_req_op_data(osd_req, which, cls, response_data); -} -EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? 
*/ - void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, @@ -2166,8 +2158,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) dout("applying incremental map %u len %d\n", epoch, maplen); newmap = osdmap_apply_incremental(&p, next, - osdc->osdmap, - &osdc->client->msgr); + osdc->osdmap); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; @@ -2674,8 +2665,6 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->client = client; osdc->osdmap = NULL; init_rwsem(&osdc->map_sem); - init_completion(&osdc->map_waiters); - osdc->last_requested_map = 0; mutex_init(&osdc->request_mutex); osdc->last_tid = 0; osdc->osds = RB_ROOT; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4668b871ca47..9a0cc072a909 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1204,8 +1204,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * decode and apply an incremental map update. */ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr) + struct ceph_osdmap *map) { struct crush_map *newcrush = NULL; struct ceph_fsid fsid; -- cgit v1.2.3 From fcd00b68bbe2bf5606cb45c2cd4a250a390bcc1f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: libceph: DEFINE_RB_FUNCS macro Given struct foo { u64 id; struct rb_node bar_node; }; generate insert_bar(), erase_bar() and lookup_bar() functions with DEFINE_RB_FUNCS(bar, struct foo, id, bar_node) The key is assumed to be an integer (u64, int, etc), compared with < and >. nodefld has to be initialized with RB_CLEAR_NODE(). Start using it for MDS, MON and OSD requests and OSD sessions. Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 54 ++++++------------------ include/linux/ceph/libceph.h | 57 ++++++++++++++++++++++++++ net/ceph/mon_client.c | 52 ++++-------------------- net/ceph/osd_client.c | 97 +++++--------------------------------------- 4 files changed, 88 insertions(+), 172 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 85b8517f17a0..cff85af425d4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -567,51 +567,23 @@ void ceph_mdsc_release_request(struct kref *kref) kfree(req); } +DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) + /* * lookup session, bump ref if found. * * called under mdsc->mutex. 
*/ -static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, - u64 tid) +static struct ceph_mds_request * +lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) { struct ceph_mds_request *req; - struct rb_node *n = mdsc->request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mds_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else { - ceph_mdsc_get_request(req); - return req; - } - } - return NULL; -} -static void __insert_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *new) -{ - struct rb_node **p = &mdsc->request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mds_request *req = NULL; + req = lookup_request(&mdsc->request_tree, tid); + if (req) + ceph_mdsc_get_request(req); - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mds_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &mdsc->request_tree); + return req; } /* @@ -630,7 +602,7 @@ static void __register_request(struct ceph_mds_client *mdsc, req->r_num_caps); dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); - __insert_request(mdsc, req); + insert_request(&mdsc->request_tree, req); req->r_uid = current_fsuid(); req->r_gid = current_fsgid(); @@ -663,8 +635,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } } - rb_erase(&req->r_node, &mdsc->request_tree); - RB_CLEAR_NODE(&req->r_node); + erase_request(&mdsc->request_tree, req); if (req->r_unsafe_dir && req->r_got_unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); @@ -1722,6 +1693,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; kref_init(&req->r_kref); + RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); @@ -2414,7 +2386,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* get request, session */ tid = le64_to_cpu(msg->hdr.tid); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("handle_reply on unknown tid %llu\n", tid); mutex_unlock(&mdsc->mutex); @@ -2604,7 +2576,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, fwd_seq = ceph_decode_32(&p); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); goto out; /* dup reply? */ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index db92a8d4926e..690985daad1c 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len) (off >> PAGE_SHIFT); } +/* + * These are not meant to be generic - an integer key is assumed. 
+ */ +#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +static void insert_##name(struct rb_root *root, type *t) \ +{ \ + struct rb_node **n = &root->rb_node; \ + struct rb_node *parent = NULL; \ + \ + BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \ + \ + while (*n) { \ + type *cur = rb_entry(*n, type, nodefld); \ + \ + parent = *n; \ + if (t->keyfld < cur->keyfld) \ + n = &(*n)->rb_left; \ + else if (t->keyfld > cur->keyfld) \ + n = &(*n)->rb_right; \ + else \ + BUG(); \ + } \ + \ + rb_link_node(&t->nodefld, parent, n); \ + rb_insert_color(&t->nodefld, root); \ +} \ +static void erase_##name(struct rb_root *root, type *t) \ +{ \ + BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \ + rb_erase(&t->nodefld, root); \ + RB_CLEAR_NODE(&t->nodefld); \ +} + +#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ +static type *lookup_##name(struct rb_root *root, \ + typeof(((type *)0)->keyfld) key) \ +{ \ + struct rb_node *n = root->rb_node; \ + \ + while (n) { \ + type *cur = rb_entry(n, type, nodefld); \ + \ + if (key < cur->keyfld) \ + n = n->rb_left; \ + else if (key > cur->keyfld) \ + n = n->rb_right; \ + else \ + return cur; \ + } \ + \ + return NULL; \ +} + +#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) + extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_cap_cachep; extern struct kmem_cache *ceph_cap_flush_cachep; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 3dfafdad92aa..a426a4b03e75 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -478,45 +478,7 @@ out: /* * generic requests (currently statfs, mon_get_version) */ -static struct ceph_mon_generic_request *__lookup_generic_req( - struct ceph_mon_client *monc, u64 tid) -{ - struct ceph_mon_generic_request *req; - struct rb_node *n = monc->generic_request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mon_generic_request, node); - if (tid < req->tid) - n = n->rb_left; - else if (tid > req->tid) - n = n->rb_right; - else - return req; - } - return NULL; -} - -static void __insert_generic_request(struct ceph_mon_client *monc, - struct ceph_mon_generic_request *new) -{ - struct rb_node **p = &monc->generic_request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mon_generic_request *req = NULL; - - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mon_generic_request, node); - if (new->tid < req->tid) - p = &(*p)->rb_left; - else if (new->tid > req->tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &monc->generic_request_tree); -} +DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node) static void release_generic_request(struct kref *kref) { @@ -551,7 +513,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, struct ceph_msg *m; mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); + req = lookup_generic_request(&monc->generic_request_tree, tid); if (!req) { dout("get_generic_reply %lld dne\n", tid); *skip = 1; @@ -578,14 +540,14 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, /* register request */ req->tid = tid != 0 ? 
tid : ++monc->last_tid; req->request->hdr.tid = cpu_to_le64(req->tid); - __insert_generic_request(monc, req); + insert_generic_request(&monc->generic_request_tree, req); ceph_con_send(&monc->con, ceph_msg_get(req->request)); mutex_unlock(&monc->mutex); err = wait_for_completion_interruptible(&req->completion); mutex_lock(&monc->mutex); - rb_erase(&req->node, &monc->generic_request_tree); + erase_generic_request(&monc->generic_request_tree, req); if (!err) err = req->result; @@ -619,7 +581,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, dout("handle_statfs_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); + req = lookup_generic_request(&monc->generic_request_tree, tid); if (req) { *(struct ceph_statfs *)req->buf = reply->st; req->result = 0; @@ -651,6 +613,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) return -ENOMEM; kref_init(&req->kref); + RB_CLEAR_NODE(&req->node); req->buf = buf; init_completion(&req->completion); @@ -696,7 +659,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, goto bad; mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, handle); + req = lookup_generic_request(&monc->generic_request_tree, handle); if (req) { *(u64 *)req->buf = ceph_decode_64(&p); req->result = 0; @@ -732,6 +695,7 @@ int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, return -ENOMEM; kref_init(&req->kref); + RB_CLEAR_NODE(&req->node); req->buf = newest; init_completion(&req->completion); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index c423e11d6857..8256051ed88f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -875,45 +875,7 @@ EXPORT_SYMBOL(ceph_osdc_new_request); /* * We keep osd requests in an rbtree, sorted by ->r_tid. 
*/ -static void __insert_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *new) -{ - struct rb_node **p = &osdc->requests.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd_request *req = NULL; - - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_osd_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &osdc->requests); -} - -static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, - u64 tid) -{ - struct ceph_osd_request *req; - struct rb_node *n = osdc->requests.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_osd_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else - return req; - } - return NULL; -} +DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) static struct ceph_osd_request * __lookup_request_ge(struct ceph_osd_client *osdc, @@ -1101,6 +1063,8 @@ static void put_osd(struct ceph_osd *osd) } } +DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node) + /* * remove an osd from our map */ @@ -1111,8 +1075,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) WARN_ON(!list_empty(&osd->o_linger_requests)); list_del_init(&osd->o_osd_lru); - rb_erase(&osd->o_node, &osdc->osds); - RB_CLEAR_NODE(&osd->o_node); + erase_osd(&osdc->osds, osd); } static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) @@ -1188,45 +1151,6 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) return 0; } -static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) -{ - struct rb_node **p = &osdc->osds.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd *osd = NULL; - - dout("__insert_osd %p osd%d\n", new, new->o_osd); - while (*p) { - parent = *p; - osd = rb_entry(parent, struct ceph_osd, o_node); - if (new->o_osd < osd->o_osd) - p = &(*p)->rb_left; - else if (new->o_osd > osd->o_osd) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->o_node, parent, p); - rb_insert_color(&new->o_node, &osdc->osds); -} - -static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) -{ - struct ceph_osd *osd; - struct rb_node *n = osdc->osds.rb_node; - - while (n) { - osd = rb_entry(n, struct ceph_osd, o_node); - if (o < osd->o_osd) - n = n->rb_left; - else if (o > osd->o_osd) - n = n->rb_right; - else - return osd; - } - return NULL; -} - static void __schedule_osd_timeout(struct ceph_osd_client *osdc) { schedule_delayed_work(&osdc->timeout_work, @@ -1248,7 +1172,7 @@ static void __register_request(struct ceph_osd_client *osdc, req->r_tid = ++osdc->last_tid; req->r_request->hdr.tid = cpu_to_le64(req->r_tid); dout("__register_request %p tid %lld\n", req, req->r_tid); - __insert_request(osdc, req); + insert_request(&osdc->requests, req); ceph_osdc_get_request(req); osdc->num_requests++; if (osdc->num_requests == 1) { @@ -1270,8 +1194,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, } dout("__unregister_request %p tid %lld\n", req, req->r_tid); - rb_erase(&req->r_node, &osdc->requests); - RB_CLEAR_NODE(&req->r_node); + erase_request(&osdc->requests, req); osdc->num_requests--; if (req->r_osd) { @@ -1482,7 +1405,7 @@ static int __map_request(struct ceph_osd_client *osdc, req->r_osd = NULL; } - req->r_osd = __lookup_osd(osdc, o); + req->r_osd = lookup_osd(&osdc->osds, o); if 
(!req->r_osd && o >= 0) { err = -ENOMEM; req->r_osd = create_osd(osdc, o); @@ -1492,7 +1415,7 @@ static int __map_request(struct ceph_osd_client *osdc, } dout("map_request osd %p is osd%d\n", req->r_osd, o); - __insert_osd(osdc, req->r_osd); + insert_osd(&osdc->osds, req->r_osd); ceph_con_open(&req->r_osd->o_con, CEPH_ENTITY_TYPE_OSD, o, @@ -1822,7 +1745,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) /* lookup */ down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); - req = __lookup_request(osdc, tid); + req = lookup_request(&osdc->requests, tid); if (req == NULL) { dout("handle_reply tid %llu dne\n", tid); goto bad_mutex; @@ -2880,7 +2803,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, tid = le64_to_cpu(hdr->tid); mutex_lock(&osdc->request_mutex); - req = __lookup_request(osdc, tid); + req = lookup_request(&osdc->requests, tid); if (!req) { dout("%s osd%d tid %llu unknown, skipping\n", __func__, osd->o_osd, tid); -- cgit v1.2.3 From 985c1673885b77b2e0167c6478a833817d1e2fe5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: libceph: fix ceph_eversion encoding eversion_t is version+epoch in userspace and is encoded in that order. ceph_eversion is defined as epoch+version in rados.h, yet we memcpy it in __send_request(). Reoder ceph_eversion fields. Signed-off-by: Ilya Dryomov --- include/linux/ceph/rados.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 2f822dca1046..913c87c26d33 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -114,8 +114,8 @@ struct ceph_object_layout { * compound epoch+version, used by storage layer to serialize mutations */ struct ceph_eversion { - __le32 epoch; __le64 version; + __le32 epoch; } __attribute__ ((packed)); /* -- cgit v1.2.3 From d9591f5e28686277d9312d3c7422faf1368b305e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: libceph: rename ceph_oloc_oid_to_pg() Rename ceph_oloc_oid_to_pg() to ceph_object_locator_to_pg(). Emphasise that returned is raw PG and return -ENOENT instead of -EIO if the pool doesn't exist. 
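For reference, a sketch of a call site under the new name (note the
argument order, oid before oloc; osdc and req stand in for whatever the
caller has in scope):

    struct ceph_pg raw_pgid;
    int r;

    r = ceph_object_locator_to_pg(osdc->osdmap, &req->r_target_oid,
                                  &req->r_target_oloc, &raw_pgid);
    if (r)
            return r;   /* now -ENOENT if the pool doesn't exist */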
Signed-off-by: Ilya Dryomov --- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 9 ++++----- net/ceph/osd_client.c | 4 ++-- net/ceph/osdmap.c | 31 ++++++++++++++++--------------- 4 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index db296709784a..cca7fff22725 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -215,7 +215,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); ceph_oid_printf(&oid, "%s", dl.object_name); - r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); + r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); if (r < 0) { up_read(&osdc->map_sem); return r; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index ce7a41a182d4..b70440c05b49 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -213,11 +213,10 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, u64 *bno, u64 *oxoff, u64 *oxlen); -/* calculate mapping of object to a placement group */ -extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_locator *oloc, - struct ceph_object_id *oid, - struct ceph_pg *pg_out); +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid); extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 8256051ed88f..cb9f1953f5fb 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1324,8 +1324,8 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, /* !pi is caught in ceph_oloc_oid_to_pg() */ } - return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, - &req->r_target_oid, pg_out); + return ceph_object_locator_to_pg(osdmap, &req->r_target_oid, + &req->r_target_oloc, pg_out); } static void __enqueue_request(struct ceph_osd_request *req) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9a0cc072a909..6267839cb246 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1545,30 +1545,31 @@ invalid: EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* - * Calculate mapping of a (oloc, oid) pair to a PG. Should only be - * called with target's (oloc, oid), since tiering isn't taken into - * account. + * Map an object into a PG. + * + * Should only be called with target_oid and target_oloc (as opposed to + * base_oid and base_oloc), since tiering isn't taken into account. 
 */
-int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
- struct ceph_object_locator *oloc,
- struct ceph_object_id *oid,
- struct ceph_pg *pg_out)
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid)
 {
 struct ceph_pg_pool_info *pi;

- pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
+ pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
 if (!pi)
- return -EIO;
+ return -ENOENT;

- pg_out->pool = oloc->pool;
- pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
- oid->name_len);
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);

- dout("%s %*pE pgid %llu.%x\n", __func__, oid->name_len, oid->name,
- pg_out->pool, pg_out->seed);
+ dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len,
+ oid->name, raw_pgid->pool, raw_pgid->seed);
 return 0;
 }
-EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
+EXPORT_SYMBOL(ceph_object_locator_to_pg);

 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 int *result, int result_max,
-- 
cgit v1.2.3


From 6f3bfd45cd233eea0b07e3cabc0386b5de9321d2 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Thu, 28 Apr 2016 16:07:22 +0200
Subject: libceph: ceph_osds, ceph_pg_to_up_acting_osds()

Knowing just acting set isn't enough, we need to be able to record up
set as well to detect interval changes.  This means returning (up[],
up_len, up_primary, acting[], acting_len, acting_primary) and passing
it around.  Introduce and switch to ceph_osds to help with that.

Rename ceph_calc_pg_acting() to ceph_pg_to_up_acting_osds() and return
both up and acting sets from it.

Signed-off-by: Ilya Dryomov
---
 include/linux/ceph/osdmap.h |  21 ++-
 net/ceph/osd_client.c       |  36 +++---
 net/ceph/osdmap.c           | 304 ++++++++++++++++++++++++++------------------
 3 files changed, 215 insertions(+), 146 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index b70440c05b49..942189d311e0 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -208,6 +208,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 struct ceph_osdmap *map);
 extern void ceph_osdmap_destroy(struct ceph_osdmap *map);

+struct ceph_osds {
+ int osds[CEPH_PG_MAX_SIZE];
+ int size;
+ int primary; /* id, NOT index */
+};
+
+static inline void ceph_osds_init(struct ceph_osds *set)
+{
+ set->size = 0;
+ set->primary = -1;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
+
 /* calculate mapping of a file extent to an object */
 extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 u64 off, u64 len,
@@ -218,9 +232,10 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
 struct ceph_object_locator *oloc,
 struct ceph_pg *raw_pgid);

-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
- struct ceph_pg pgid,
- int *osds, int *primary);
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *up,
+ struct ceph_osds *acting);
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
 struct ceph_pg pgid);

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index cb9f1953f5fb..0ff400a56cd6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1358,8 +1358,7 @@ static int __map_request(struct ceph_osd_client *osdc,
 struct ceph_osd_request *req, int force_resend)
 {
 struct ceph_pg pgid;
- int acting[CEPH_PG_MAX_SIZE];
- int num, o;
+ struct ceph_osds up, 
acting; int err; bool was_paused; @@ -1372,9 +1371,7 @@ static int __map_request(struct ceph_osd_client *osdc, } req->r_pgid = pgid; - num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); - if (num < 0) - num = 0; + ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); was_paused = req->r_paused; req->r_paused = __req_should_be_paused(osdc, req); @@ -1382,21 +1379,23 @@ static int __map_request(struct ceph_osd_client *osdc, force_resend = 1; if ((!force_resend && - req->r_osd && req->r_osd->o_osd == o && + req->r_osd && req->r_osd->o_osd == acting.primary && req->r_sent >= req->r_osd->o_incarnation && - req->r_num_pg_osds == num && - memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || - (req->r_osd == NULL && o == -1) || + req->r_num_pg_osds == acting.size && + memcmp(req->r_pg_osds, acting.osds, + acting.size * sizeof(acting.osds[0])) == 0) || + (req->r_osd == NULL && acting.primary == -1) || req->r_paused) return 0; /* no change */ dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", - req->r_tid, pgid.pool, pgid.seed, o, + req->r_tid, pgid.pool, pgid.seed, acting.primary, req->r_osd ? req->r_osd->o_osd : -1); /* record full pg acting set */ - memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); - req->r_num_pg_osds = num; + memcpy(req->r_pg_osds, acting.osds, + acting.size * sizeof(acting.osds[0])); + req->r_num_pg_osds = acting.size; if (req->r_osd) { __cancel_request(req); @@ -1405,21 +1404,22 @@ static int __map_request(struct ceph_osd_client *osdc, req->r_osd = NULL; } - req->r_osd = lookup_osd(&osdc->osds, o); - if (!req->r_osd && o >= 0) { + req->r_osd = lookup_osd(&osdc->osds, acting.primary); + if (!req->r_osd && acting.primary >= 0) { err = -ENOMEM; - req->r_osd = create_osd(osdc, o); + req->r_osd = create_osd(osdc, acting.primary); if (!req->r_osd) { list_move(&req->r_req_lru_item, &osdc->req_notarget); goto out; } - dout("map_request osd %p is osd%d\n", req->r_osd, o); + dout("map_request osd %p is osd%d\n", req->r_osd, + acting.primary); insert_osd(&osdc->osds, req->r_osd); ceph_con_open(&req->r_osd->o_con, - CEPH_ENTITY_TYPE_OSD, o, - &osdc->osdmap->osd_addr[o]); + CEPH_ENTITY_TYPE_OSD, acting.primary, + &osdc->osdmap->osd_addr[acting.primary]); } __enqueue_request(req); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 6267839cb246..f5fc8fc63879 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1474,6 +1474,38 @@ void ceph_oid_destroy(struct ceph_object_id *oid) } EXPORT_SYMBOL(ceph_oid_destroy); +static bool osds_valid(const struct ceph_osds *set) +{ + /* non-empty set */ + if (set->size > 0 && set->primary >= 0) + return true; + + /* empty can_shift_osds set */ + if (!set->size && set->primary == -1) + return true; + + /* empty !can_shift_osds set - all NONE */ + if (set->size > 0 && set->primary == -1) { + int i; + + for (i = 0; i < set->size; i++) { + if (set->osds[i] != CRUSH_ITEM_NONE) + break; + } + if (i == set->size) + return true; + } + + return false; +} + +void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) +{ + memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0])); + dest->size = src->size; + dest->primary = src->primary; +} + /* * calculate file layout from given offset, length. * fill in correct oid, logical length, and object extent @@ -1571,6 +1603,46 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, } EXPORT_SYMBOL(ceph_object_locator_to_pg); +/* + * Map a raw PG (full precision ps) into an actual PG. 
+ */ +static void raw_pg_to_pg(struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_pg *pgid) +{ + pgid->pool = raw_pgid->pool; + pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num, + pi->pg_num_mask); +} + +/* + * Map a raw PG (full precision ps) into a placement ps (placement + * seed). Include pool id in that value so that different pools don't + * use the same seeds. + */ +static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid) +{ + if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) { + /* hash pool id and seed so that pool PGs do not overlap */ + return crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(raw_pgid->seed, + pi->pgp_num, + pi->pgp_num_mask), + raw_pgid->pool); + } else { + /* + * legacy behavior: add ps and pool together. this is + * not a great approach because the PGs from each pool + * will overlap on top of each other: 0.5 == 1.4 == + * 2.3 == ... + */ + return ceph_stable_mod(raw_pgid->seed, pi->pgp_num, + pi->pgp_num_mask) + + (unsigned)raw_pgid->pool; + } +} + static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max) @@ -1588,84 +1660,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, } /* - * Calculate raw (crush) set for given pgid. + * Calculate raw set (CRUSH output) for given PG. The result may + * contain nonexistent OSDs. ->primary is undefined for a raw set. * - * Return raw set length, or error. + * Placement seed (CRUSH input) is returned through @ppps. */ -static int pg_to_raw_osds(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, - struct ceph_pg pgid, u32 pps, int *osds) +static void pg_to_raw_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_osds *raw, + u32 *ppps) { + u32 pps = raw_pg_to_pps(pi, raw_pgid); int ruleno; int len; - /* crush */ - ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, - pool->type, pool->size); + ceph_osds_init(raw); + if (ppps) + *ppps = pps; + + ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type, + pi->size); if (ruleno < 0) { pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", - pgid.pool, pool->crush_ruleset, pool->type, - pool->size); - return -ENOENT; + pi->id, pi->crush_ruleset, pi->type, pi->size); + return; } - len = do_crush(osdmap, ruleno, pps, osds, - min_t(int, pool->size, CEPH_PG_MAX_SIZE), + len = do_crush(osdmap, ruleno, pps, raw->osds, + min_t(int, pi->size, ARRAY_SIZE(raw->osds)), osdmap->osd_weight, osdmap->max_osd); if (len < 0) { pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", - len, ruleno, pgid.pool, pool->crush_ruleset, - pool->type, pool->size); - return len; + len, ruleno, pi->id, pi->crush_ruleset, pi->type, + pi->size); + return; } - return len; + raw->size = len; } /* - * Given raw set, calculate up set and up primary. + * Given raw set, calculate up set and up primary. By definition of an + * up set, the result won't contain nonexistent or down OSDs. * - * Return up set length. *primary is set to up primary osd id, or -1 - * if up set is empty. + * This is done in-place - on return @set is the up set. If it's + * empty, ->primary will remain undefined. 
*/ -static int raw_to_up_osds(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, - int *osds, int len, int *primary) +static void raw_to_up_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + struct ceph_osds *set) { - int up_primary = -1; int i; - if (ceph_can_shift_osds(pool)) { + /* ->primary is undefined for a raw set */ + BUG_ON(set->primary != -1); + + if (ceph_can_shift_osds(pi)) { int removed = 0; - for (i = 0; i < len; i++) { - if (ceph_osd_is_down(osdmap, osds[i])) { + /* shift left */ + for (i = 0; i < set->size; i++) { + if (ceph_osd_is_down(osdmap, set->osds[i])) { removed++; continue; } if (removed) - osds[i - removed] = osds[i]; + set->osds[i - removed] = set->osds[i]; } - - len -= removed; - if (len > 0) - up_primary = osds[0]; + set->size -= removed; + if (set->size > 0) + set->primary = set->osds[0]; } else { - for (i = len - 1; i >= 0; i--) { - if (ceph_osd_is_down(osdmap, osds[i])) - osds[i] = CRUSH_ITEM_NONE; + /* set down/dne devices to NONE */ + for (i = set->size - 1; i >= 0; i--) { + if (ceph_osd_is_down(osdmap, set->osds[i])) + set->osds[i] = CRUSH_ITEM_NONE; else - up_primary = osds[i]; + set->primary = set->osds[i]; } } - - *primary = up_primary; - return len; } -static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, - struct ceph_pg_pool_info *pool, - int *osds, int len, int *primary) +static void apply_primary_affinity(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + u32 pps, + struct ceph_osds *up) { int i; int pos = -1; @@ -1677,8 +1757,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, if (!osdmap->osd_primary_affinity) return; - for (i = 0; i < len; i++) { - int osd = osds[i]; + for (i = 0; i < up->size; i++) { + int osd = up->osds[i]; if (osd != CRUSH_ITEM_NONE && osdmap->osd_primary_affinity[osd] != @@ -1686,7 +1766,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, break; } } - if (i == len) + if (i == up->size) return; /* @@ -1694,8 +1774,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, * osd into the hash/rng so that a proportional fraction of an * osd's pgs get rejected as primary. */ - for (i = 0; i < len; i++) { - int osd = osds[i]; + for (i = 0; i < up->size; i++) { + int osd = up->osds[i]; u32 aff; if (osd == CRUSH_ITEM_NONE) @@ -1720,123 +1800,99 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, if (pos < 0) return; - *primary = osds[pos]; + up->primary = up->osds[pos]; - if (ceph_can_shift_osds(pool) && pos > 0) { + if (ceph_can_shift_osds(pi) && pos > 0) { /* move the new primary to the front */ for (i = pos; i > 0; i--) - osds[i] = osds[i - 1]; - osds[0] = *primary; + up->osds[i] = up->osds[i - 1]; + up->osds[0] = up->primary; } } /* - * Given up set, apply pg_temp and primary_temp mappings. + * Get pg_temp and primary_temp mappings for given PG. * - * Return acting set length. *primary is set to acting primary osd id, - * or -1 if acting set is empty. + * Note that a PG may have none, only pg_temp, only primary_temp or + * both pg_temp and primary_temp mappings. This means @temp isn't + * always a valid OSD set on return: in the "only primary_temp" case, + * @temp will have its ->primary >= 0 but ->size == 0. 
*/ -static int apply_temps(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, struct ceph_pg pgid, - int *osds, int len, int *primary) +static void get_temp_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_osds *temp) { + struct ceph_pg pgid; struct ceph_pg_mapping *pg; - int temp_len; - int temp_primary; int i; - /* raw_pg -> pg */ - pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, - pool->pg_num_mask); + raw_pg_to_pg(pi, raw_pgid, &pgid); + ceph_osds_init(temp); /* pg_temp? */ pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { - temp_len = 0; - temp_primary = -1; - for (i = 0; i < pg->pg_temp.len; i++) { if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { - if (ceph_can_shift_osds(pool)) + if (ceph_can_shift_osds(pi)) continue; - else - osds[temp_len++] = CRUSH_ITEM_NONE; + + temp->osds[temp->size++] = CRUSH_ITEM_NONE; } else { - osds[temp_len++] = pg->pg_temp.osds[i]; + temp->osds[temp->size++] = pg->pg_temp.osds[i]; } } /* apply pg_temp's primary */ - for (i = 0; i < temp_len; i++) { - if (osds[i] != CRUSH_ITEM_NONE) { - temp_primary = osds[i]; + for (i = 0; i < temp->size; i++) { + if (temp->osds[i] != CRUSH_ITEM_NONE) { + temp->primary = temp->osds[i]; break; } } - } else { - temp_len = len; - temp_primary = *primary; } /* primary_temp? */ pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); if (pg) - temp_primary = pg->primary_temp.osd; - - *primary = temp_primary; - return temp_len; + temp->primary = pg->primary_temp.osd; } /* - * Calculate acting set for given pgid. + * Map a PG to its acting set as well as its up set. * - * Return acting set length, or error. *primary is set to acting - * primary osd id, or -1 if acting set is empty or on error. + * Acting set is used for data mapping purposes, while up set can be + * recorded for detecting interval changes and deciding whether to + * resend a request. */ -int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds, int *primary) +void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid, + struct ceph_osds *up, + struct ceph_osds *acting) { - struct ceph_pg_pool_info *pool; + struct ceph_pg_pool_info *pi; u32 pps; - int len; - pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) { - *primary = -1; - return -ENOENT; + pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); + if (!pi) { + ceph_osds_init(up); + ceph_osds_init(acting); + goto out; } - if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { - /* hash pool id and seed so that pool PGs do not overlap */ - pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, - ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask), - pgid.pool); - } else { - /* - * legacy behavior: add ps and pool together. this is - * not a great approach because the PGs from each pool - * will overlap on top of each other: 0.5 == 1.4 == - * 2.3 == ... 
- */ - pps = ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask) + - (unsigned)pgid.pool; - } - - len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); - if (len < 0) { - *primary = -1; - return len; + pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); + raw_to_up_osds(osdmap, pi, up); + apply_primary_affinity(osdmap, pi, pps, up); + get_temp_osds(osdmap, pi, raw_pgid, acting); + if (!acting->size) { + memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); + acting->size = up->size; + if (acting->primary == -1) + acting->primary = up->primary; } - - len = raw_to_up_osds(osdmap, pool, osds, len, primary); - - apply_primary_affinity(osdmap, pps, pool, osds, len, primary); - - len = apply_temps(osdmap, pool, pgid, osds, len, primary); - - return len; +out: + WARN_ON(!osds_valid(up) || !osds_valid(acting)); } /* @@ -1844,11 +1900,9 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, */ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) { - int osds[CEPH_PG_MAX_SIZE]; - int primary; - - ceph_calc_pg_acting(osdmap, pgid, osds, &primary); + struct ceph_osds up, acting; - return primary; + ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting); + return acting.primary; } EXPORT_SYMBOL(ceph_calc_pg_primary); -- cgit v1.2.3 From f81f16339a05775df600b2ff75a79be1864975c1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: rename ceph_calc_pg_primary() Rename ceph_calc_pg_primary() to ceph_pg_to_acting_primary() to emphasise that it returns acting primary. Signed-off-by: Ilya Dryomov --- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 4 ++-- net/ceph/osdmap.c | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index cca7fff22725..1831ad6cf066 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -221,7 +221,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return r; } - dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = ceph_osd_addr(osdc->osdmap, dl.osd); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 942189d311e0..3fd978a1639b 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -236,8 +236,8 @@ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid, struct ceph_osds *up, struct ceph_osds *acting); -extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, - struct ceph_pg pgid); +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid); extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f5fc8fc63879..656384a8fd1e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1896,13 +1896,14 @@ out: } /* - * Return primary osd for given pgid, or -1 if none. + * Return acting primary for given PG, or -1 if none. 
*/ -int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid) { struct ceph_osds up, acting; - ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting); + ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting); return acting.primary; } -EXPORT_SYMBOL(ceph_calc_pg_primary); +EXPORT_SYMBOL(ceph_pg_to_acting_primary); -- cgit v1.2.3 From f984cb76cc5fb9fc76d6abb6c4694a5412e3f49b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: make pgid_cmp() global calc_target() code is going to need to know how to compare PGs. Take lhs and rhs pgid by const * while at it. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 2 ++ net/ceph/osdmap.c | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 3fd978a1639b..7783237ab06c 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -24,6 +24,8 @@ struct ceph_pg { uint32_t seed; }; +int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); + #define CEPH_POOL_FLAG_HASHPSPOOL 1 struct ceph_pg_pool_info { diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 656384a8fd1e..3c7dc5e581ab 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -380,23 +380,24 @@ bad: return ERR_PTR(err); } -/* - * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) and primary_temp (explicit primary setting) - */ -static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) +int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) { - if (l.pool < r.pool) + if (lhs->pool < rhs->pool) return -1; - if (l.pool > r.pool) + if (lhs->pool > rhs->pool) return 1; - if (l.seed < r.seed) + if (lhs->seed < rhs->seed) return -1; - if (l.seed > r.seed) + if (lhs->seed > rhs->seed) return 1; + return 0; } +/* + * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid + * to a set of osds) and primary_temp (explicit primary setting) + */ static int __insert_pg_mapping(struct ceph_pg_mapping *new, struct rb_root *root) { @@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new, while (*p) { parent = *p; pg = rb_entry(parent, struct ceph_pg_mapping, node); - c = pgid_cmp(new->pgid, pg->pgid); + c = ceph_pg_compare(&new->pgid, &pg->pgid); if (c < 0) p = &(*p)->rb_left; else if (c > 0) @@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, while (n) { pg = rb_entry(n, struct ceph_pg_mapping, node); - c = pgid_cmp(pgid, pg->pgid); + c = ceph_pg_compare(&pgid, &pg->pgid); if (c < 0) { n = n->rb_left; } else if (c > 0) { -- cgit v1.2.3 From 04812acf572ef41fd51c11e0bf3385f34c0e1b5b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: pi->min_size, pi->last_force_request_resend Add and decode pi->min_size and pi->last_force_request_resend. These are going to be used by calc_target(). 
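For pool encodings older than v7 there is no min_size on the wire, so the
decode path in this patch falls back to deriving it from size; the
arithmetic rounds up to a majority:

    pi->min_size = pi->size - pi->size / 2;
    /* size 1 -> 1, size 2 -> 1, size 3 -> 2, size 4 -> 2, size 5 -> 3 */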
Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 9 ++++++--- net/ceph/debugfs.c | 10 ++++++---- net/ceph/osdmap.c | 48 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 59 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 7783237ab06c..989294d0b8d2 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -26,20 +26,23 @@ struct ceph_pg { int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); -#define CEPH_POOL_FLAG_HASHPSPOOL 1 +#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id + together */ struct ceph_pg_pool_info { struct rb_node node; s64 id; - u8 type; + u8 type; /* CEPH_POOL_TYPE_* */ u8 size; + u8 min_size; u8 crush_ruleset; u8 object_hash; + u32 last_force_request_resend; u32 pg_num, pgp_num; int pg_num_mask, pgp_num_mask; s64 read_tier; s64 write_tier; /* wins for read+write ops */ - u64 flags; + u64 flags; /* CEPH_POOL_FLAG_* */ char *name; }; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 6f8413293d15..7f1cc22c3e8b 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -66,12 +66,14 @@ static int osdmap_show(struct seq_file *s, void *p) (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { - struct ceph_pg_pool_info *pool = + struct ceph_pg_pool_info *pi = rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", - pool->id, pool->pg_num, pool->pg_num_mask, - pool->read_tier, pool->write_tier); + seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n", + pi->id, pi->name, pi->type, pi->size, pi->min_size, + pi->pg_num, pi->pg_num_mask, pi->flags, + pi->last_force_request_resend, pi->read_tier, + pi->write_tier); } for (i = 0; i < map->max_osd; i++) { struct ceph_entity_addr *addr = &map->osd_addr[i]; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 3c7dc5e581ab..66c3ebead92f 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -597,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) *p += 4; /* skip crash_replay_interval */ if (ev >= 7) - *p += 1; /* skip min_size */ + pi->min_size = ceph_decode_8(p); + else + pi->min_size = pi->size - pi->size / 2; if (ev >= 8) *p += 8 + 8; /* skip quota_max_* */ @@ -617,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) pi->write_tier = -1; } + if (ev >= 10) { + /* skip properties */ + num = ceph_decode_32(p); + while (num--) { + len = ceph_decode_32(p); + *p += len; /* key */ + len = ceph_decode_32(p); + *p += len; /* val */ + } + } + + if (ev >= 11) { + /* skip hit_set_params */ + *p += 1 + 1; /* versions */ + len = ceph_decode_32(p); + *p += len; + + *p += 4; /* skip hit_set_period */ + *p += 4; /* skip hit_set_count */ + } + + if (ev >= 12) + *p += 4; /* skip stripe_width */ + + if (ev >= 13) { + *p += 8; /* skip target_max_bytes */ + *p += 8; /* skip target_max_objects */ + *p += 4; /* skip cache_target_dirty_ratio_micro */ + *p += 4; /* skip cache_target_full_ratio_micro */ + *p += 4; /* skip cache_min_flush_age */ + *p += 4; /* skip cache_min_evict_age */ + } + + if (ev >= 14) { + /* skip erasure_code_profile */ + len = ceph_decode_32(p); + *p += len; + } + + if (ev >= 15) + pi->last_force_request_resend = ceph_decode_32(p); + else + pi->last_force_request_resend = 0; + 
/* ignore the rest */ *p = pool_end; -- cgit v1.2.3 From 63244fa123a755e4bbaee03022b68613c71d1332 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: introduce ceph_osd_request_target, calc_target() Introduce ceph_osd_request_target, containing all mapping-related fields of ceph_osd_request and calc_target() for calculating mappings and populating it. Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 +- fs/ceph/file.c | 2 +- include/linux/ceph/osd_client.h | 23 ++++++ include/linux/ceph/osdmap.h | 34 +++++++++ include/linux/ceph/rados.h | 5 ++ net/ceph/osd_client.c | 157 +++++++++++++++++++++++++++++++++++++++- net/ceph/osdmap.c | 121 +++++++++++++++++++++++++++++++ 7 files changed, 340 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6f28dd9bacb2..c5d75486823b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1774,7 +1774,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); - wr_req->r_base_oloc.pool = pool; + ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9d470397e249..36b4a41dfa67 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -714,7 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - req->r_base_oloc = orig_req->r_base_oloc; + ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); ret = ceph_osdc_alloc_messages(req, GFP_NOFS); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 63854a8df183..48806ee4488d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -24,6 +24,8 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, struct ceph_msg *); typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); +#define CEPH_HOMELESS_OSD -1 + /* a given osd we're communicating with */ struct ceph_osd { atomic_t o_ref; @@ -118,6 +120,27 @@ struct ceph_osd_req_op { }; }; +struct ceph_osd_request_target { + struct ceph_object_id base_oid; + struct ceph_object_locator base_oloc; + struct ceph_object_id target_oid; + struct ceph_object_locator target_oloc; + + struct ceph_pg pgid; + u32 pg_num; + u32 pg_num_mask; + struct ceph_osds acting; + struct ceph_osds up; + int size; + int min_size; + bool sort_bitwise; + + unsigned int flags; /* CEPH_OSD_FLAG_* */ + bool paused; + + int osd; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 989294d0b8d2..420bb7968b25 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -28,6 +28,7 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id together */ +#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */ struct ceph_pg_pool_info { struct rb_node node; @@ -62,6 +63,22 @@ struct ceph_object_locator { s64 pool; }; +static inline void ceph_oloc_init(struct ceph_object_locator *oloc) +{ + oloc->pool = -1; +} + 
+static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) +{ + return oloc->pool == -1; +} + +static inline void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src) +{ + dest->pool = src->pool; +} + /* * Maximum supported by kernel client object name length * @@ -227,6 +244,23 @@ static inline void ceph_osds_init(struct ceph_osds *set) void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid); +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change); + /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 913c87c26d33..f28ed864e682 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ /* * The error code to return when an OSD can't handle a write diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0ff400a56cd6..cff3a7e29233 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -298,6 +298,30 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, } } +/* + * Assumes @t is zero-initialized. + */ +static void target_init(struct ceph_osd_request_target *t) +{ + ceph_oid_init(&t->base_oid); + ceph_oloc_init(&t->base_oloc); + ceph_oid_init(&t->target_oid); + ceph_oloc_init(&t->target_oloc); + + ceph_osds_init(&t->acting); + ceph_osds_init(&t->up); + t->size = -1; + t->min_size = -1; + + t->osd = CEPH_HOMELESS_OSD; +} + +static void target_destroy(struct ceph_osd_request_target *t) +{ + ceph_oid_destroy(&t->base_oid); + ceph_oid_destroy(&t->target_oid); +} + /* * requests */ @@ -1273,6 +1297,11 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_set_request_linger); +static bool __pool_full(struct ceph_pg_pool_info *pi) +{ + return pi->flags & CEPH_POOL_FLAG_FULL; +} + /* * Returns whether a request should be blocked from being sent * based on the current osdmap and osd_client settings. 
@@ -1289,6 +1318,20 @@ static bool __req_should_be_paused(struct ceph_osd_client *osdc, (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); } +static bool target_should_be_paused(struct ceph_osd_client *osdc, + const struct ceph_osd_request_target *t, + struct ceph_pg_pool_info *pi) +{ + bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + __pool_full(pi); + + WARN_ON(pi->id != t->base_oloc.pool); + return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || + (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); +} + /* * Calculate mapping of a request to a PG. Takes tiering into account. */ @@ -1328,6 +1371,116 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, &req->r_target_oloc, pg_out); } +enum calc_target_result { + CALC_TARGET_NO_ACTION = 0, + CALC_TARGET_NEED_RESEND, + CALC_TARGET_POOL_DNE, +}; + +static enum calc_target_result calc_target(struct ceph_osd_client *osdc, + struct ceph_osd_request_target *t, + u32 *last_force_resend, + bool any_change) +{ + struct ceph_pg_pool_info *pi; + struct ceph_pg pgid, last_pgid; + struct ceph_osds up, acting; + bool force_resend = false; + bool need_check_tiering = false; + bool need_resend = false; + bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_SORTBITWISE); + enum calc_target_result ct_res; + int ret; + + pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); + if (!pi) { + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } + + if (osdc->osdmap->epoch == pi->last_force_request_resend) { + if (last_force_resend && + *last_force_resend < pi->last_force_request_resend) { + *last_force_resend = pi->last_force_request_resend; + force_resend = true; + } else if (!last_force_resend) { + force_resend = true; + } + } + if (ceph_oid_empty(&t->target_oid) || force_resend) { + ceph_oid_copy(&t->target_oid, &t->base_oid); + need_check_tiering = true; + } + if (ceph_oloc_empty(&t->target_oloc) || force_resend) { + ceph_oloc_copy(&t->target_oloc, &t->base_oloc); + need_check_tiering = true; + } + + if (need_check_tiering && + (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) + t->target_oloc.pool = pi->read_tier; + if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) + t->target_oloc.pool = pi->write_tier; + } + + ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid, + &t->target_oloc, &pgid); + if (ret) { + WARN_ON(ret != -ENOENT); + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } + last_pgid.pool = pgid.pool; + last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); + + ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); + if (any_change && + ceph_is_new_interval(&t->acting, + &acting, + &t->up, + &up, + t->size, + pi->size, + t->min_size, + pi->min_size, + t->pg_num, + pi->pg_num, + t->sort_bitwise, + sort_bitwise, + &last_pgid)) + force_resend = true; + + if (t->paused && !target_should_be_paused(osdc, t, pi)) { + t->paused = false; + need_resend = true; + } + + if (ceph_pg_compare(&t->pgid, &pgid) || + ceph_osds_changed(&t->acting, &acting, any_change) || + force_resend) { + t->pgid = pgid; /* struct */ + ceph_osds_copy(&t->acting, &acting); + ceph_osds_copy(&t->up, &up); + t->size = pi->size; + t->min_size = pi->min_size; + t->pg_num = pi->pg_num; + t->pg_num_mask = pi->pg_num_mask; + t->sort_bitwise = sort_bitwise; + + t->osd = acting.primary; 
+ need_resend = true; + } + + ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION; +out: + dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); + return ct_res; +} + static void __enqueue_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; @@ -1805,12 +1958,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) redir.oloc.pool = -1; } - if (redir.oloc.pool != -1) { + if (!ceph_oloc_empty(&redir.oloc)) { dout("redirect pool %lld\n", redir.oloc.pool); __unregister_request(osdc, req); - req->r_target_oloc = redir.oloc; /* struct */ + ceph_oloc_copy(&req->r_target_oloc, &redir.oloc); /* * Start redirect requests with nofail=true. If diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 66c3ebead92f..7d4a5b43085e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1521,6 +1521,32 @@ void ceph_oid_destroy(struct ceph_object_id *oid) } EXPORT_SYMBOL(ceph_oid_destroy); +/* + * osds only + */ +static bool __osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (lhs->size == rhs->size && + !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0]))) + return true; + + return false; +} + +/* + * osds + primary + */ +static bool osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (__osds_equal(lhs, rhs) && + lhs->primary == rhs->primary) + return true; + + return false; +} + static bool osds_valid(const struct ceph_osds *set) { /* non-empty set */ @@ -1553,6 +1579,101 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) dest->primary = src->primary; } +static bool is_split(const struct ceph_pg *pgid, + u32 old_pg_num, + u32 new_pg_num) +{ + int old_bits = calc_bits_of(old_pg_num); + int old_mask = (1 << old_bits) - 1; + int n; + + WARN_ON(pgid->seed >= old_pg_num); + if (new_pg_num <= old_pg_num) + return false; + + for (n = 1; ; n++) { + int next_bit = n << (old_bits - 1); + u32 s = next_bit | pgid->seed; + + if (s < old_pg_num || s == pgid->seed) + continue; + if (s >= new_pg_num) + break; + + s = ceph_stable_mod(s, old_pg_num, old_mask); + if (s == pgid->seed) + return true; + } + + return false; +} + +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid) +{ + return !osds_equal(old_acting, new_acting) || + !osds_equal(old_up, new_up) || + old_size != new_size || + old_min_size != new_min_size || + is_split(pgid, old_pg_num, new_pg_num) || + old_sort_bitwise != new_sort_bitwise; +} + +static int calc_pg_rank(int osd, const struct ceph_osds *acting) +{ + int i; + + for (i = 0; i < acting->size; i++) { + if (acting->osds[i] == osd) + return i; + } + + return -1; +} + +static bool primary_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting) +{ + if (!old_acting->size && !new_acting->size) + return false; /* both still empty */ + + if (!old_acting->size ^ !new_acting->size) + return true; /* was empty, now not, or vice versa */ + + if (old_acting->primary != new_acting->primary) + return true; /* primary changed */ + + if (calc_pg_rank(old_acting->primary, old_acting) != + calc_pg_rank(new_acting->primary, new_acting)) + return true; + + return false; /* same primary (tho replicas may have changed) 
*/ +} + +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change) +{ + if (primary_changed(old_acting, new_acting)) + return true; + + if (any_change && !__osds_equal(old_acting, new_acting)) + return true; + + return false; +} + /* * calculate file layout from given offset, length. * fill in correct oid, logical length, and object extent -- cgit v1.2.3 From a66dd38309f5d9c66ec9bc7911ff8da8cc37bb9f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: switch to calc_target(), part 1 Replace __calc_request_pg() and most of __map_request() with calc_target() and start using req->r_t. ceph_osdc_build_request() however still encodes base_oid, because it's called before calc_target() is and target_oid is empty at that point in time; a printf in osdc_show() also shows base_oid. This is fixed in "libceph: switch to calc_target(), part 2". Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 15 ++--- net/ceph/debugfs.c | 2 +- net/ceph/osd_client.c | 119 ++++++++-------------------------------- 3 files changed, 29 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 48806ee4488d..03bf9d9e1517 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -150,12 +150,13 @@ struct ceph_osd_request { struct list_head r_linger_item; struct list_head r_linger_osd_item; struct ceph_osd *r_osd; - struct ceph_pg r_pgid; - int r_pg_osds[CEPH_PG_MAX_SIZE]; - int r_num_pg_osds; + + struct ceph_osd_request_target r_t; +#define r_base_oid r_t.base_oid +#define r_base_oloc r_t.base_oloc +#define r_flags r_t.flags struct ceph_msg *r_request, *r_reply; - int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ /* request osd ops array */ @@ -167,7 +168,6 @@ struct ceph_osd_request { __le64 *r_request_pool; void *r_request_pgid; __le32 *r_request_attempts; - bool r_paused; struct ceph_eversion *r_request_reassert_version; int r_result; @@ -186,11 +186,6 @@ struct ceph_osd_request { struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ - struct ceph_object_locator r_base_oloc; - struct ceph_object_id r_base_oid; - struct ceph_object_locator r_target_oloc; - struct ceph_object_id r_target_oid; - u64 r_snapid; unsigned long r_stamp; /* send OR check time */ diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 7f1cc22c3e8b..0c11ab5f8c30 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -161,7 +161,7 @@ static int osdc_show(struct seq_file *s, void *pp) seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, req->r_osd ? 
req->r_osd->o_osd : -1, - req->r_pgid.pool, req->r_pgid.seed); + req->r_t.pgid.pool, req->r_t.pgid.seed); seq_printf(s, "%*pE", req->r_base_oid.name_len, req->r_base_oid.name); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cff3a7e29233..013101598c41 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -350,8 +350,7 @@ static void ceph_osdc_release_request(struct kref *kref) for (which = 0; which < req->r_num_ops; which++) osd_req_op_data_release(req, which); - ceph_oid_destroy(&req->r_base_oid); - ceph_oid_destroy(&req->r_target_oid); + target_destroy(&req->r_t); ceph_put_snap_context(req->r_snapc); if (req->r_mempool) @@ -420,10 +419,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); - ceph_oid_init(&req->r_base_oid); - req->r_base_oloc.pool = -1; - ceph_oid_init(&req->r_target_oid); - req->r_target_oloc.pool = -1; + target_init(&req->r_t); dout("%s req %p\n", __func__, req); return req; @@ -1308,16 +1304,6 @@ static bool __pool_full(struct ceph_pg_pool_info *pi) * * Caller should hold map_sem for read. */ -static bool __req_should_be_paused(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); - return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || - (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); -} - static bool target_should_be_paused(struct ceph_osd_client *osdc, const struct ceph_osd_request_target *t, struct ceph_pg_pool_info *pi) @@ -1332,45 +1318,6 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); } -/* - * Calculate mapping of a request to a PG. Takes tiering into account. 
- */ -static int __calc_request_pg(struct ceph_osdmap *osdmap, - struct ceph_osd_request *req, - struct ceph_pg *pg_out) -{ - bool need_check_tiering; - - need_check_tiering = false; - if (req->r_target_oloc.pool == -1) { - req->r_target_oloc = req->r_base_oloc; /* struct */ - need_check_tiering = true; - } - if (ceph_oid_empty(&req->r_target_oid)) { - ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); - need_check_tiering = true; - } - - if (need_check_tiering && - (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { - struct ceph_pg_pool_info *pi; - - pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); - if (pi) { - if ((req->r_flags & CEPH_OSD_FLAG_READ) && - pi->read_tier >= 0) - req->r_target_oloc.pool = pi->read_tier; - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && - pi->write_tier >= 0) - req->r_target_oloc.pool = pi->write_tier; - } - /* !pi is caught in ceph_oloc_oid_to_pg() */ - } - - return ceph_object_locator_to_pg(osdmap, &req->r_target_oid, - &req->r_target_oloc, pg_out); -} - enum calc_target_result { CALC_TARGET_NO_ACTION = 0, CALC_TARGET_NEED_RESEND, @@ -1510,46 +1457,26 @@ static void __enqueue_request(struct ceph_osd_request *req) static int __map_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, int force_resend) { - struct ceph_pg pgid; - struct ceph_osds up, acting; + enum calc_target_result ct_res; int err; - bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = __calc_request_pg(osdc->osdmap, req, &pgid); - if (err) { + ct_res = calc_target(osdc, &req->r_t, NULL, force_resend); + switch (ct_res) { + case CALC_TARGET_POOL_DNE: list_move(&req->r_req_lru_item, &osdc->req_notarget); - return err; - } - req->r_pgid = pgid; - - ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); - - was_paused = req->r_paused; - req->r_paused = __req_should_be_paused(osdc, req); - if (was_paused && !req->r_paused) - force_resend = 1; - - if ((!force_resend && - req->r_osd && req->r_osd->o_osd == acting.primary && - req->r_sent >= req->r_osd->o_incarnation && - req->r_num_pg_osds == acting.size && - memcmp(req->r_pg_osds, acting.osds, - acting.size * sizeof(acting.osds[0])) == 0) || - (req->r_osd == NULL && acting.primary == -1) || - req->r_paused) + return -EIO; + case CALC_TARGET_NO_ACTION: return 0; /* no change */ + default: + BUG_ON(ct_res != CALC_TARGET_NEED_RESEND); + } dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", - req->r_tid, pgid.pool, pgid.seed, acting.primary, + req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, req->r_t.osd, req->r_osd ? 
req->r_osd->o_osd : -1);
 
-	/* record full pg acting set */
-	memcpy(req->r_pg_osds, acting.osds,
-	       acting.size * sizeof(acting.osds[0]));
-	req->r_num_pg_osds = acting.size;
-
 	if (req->r_osd) {
 		__cancel_request(req);
 		list_del_init(&req->r_osd_item);
@@ -1557,22 +1484,22 @@ static int __map_request(struct ceph_osd_client *osdc,
 		req->r_osd = NULL;
 	}
 
-	req->r_osd = lookup_osd(&osdc->osds, acting.primary);
-	if (!req->r_osd && acting.primary >= 0) {
+	req->r_osd = lookup_osd(&osdc->osds, req->r_t.osd);
+	if (!req->r_osd && req->r_t.osd >= 0) {
 		err = -ENOMEM;
-		req->r_osd = create_osd(osdc, acting.primary);
+		req->r_osd = create_osd(osdc, req->r_t.osd);
 		if (!req->r_osd) {
 			list_move(&req->r_req_lru_item, &osdc->req_notarget);
 			goto out;
 		}
 
 		dout("map_request osd %p is osd%d\n", req->r_osd,
-		     acting.primary);
+		     req->r_osd->o_osd);
 		insert_osd(&osdc->osds, req->r_osd);
 
 		ceph_con_open(&req->r_osd->o_con,
-			      CEPH_ENTITY_TYPE_OSD, acting.primary,
-			      &osdc->osdmap->osd_addr[acting.primary]);
+			      CEPH_ENTITY_TYPE_OSD, req->r_osd->o_osd,
+			      &osdc->osdmap->osd_addr[req->r_osd->o_osd]);
 	}
 
 	__enqueue_request(req);
@@ -1592,15 +1519,15 @@ static void __send_request(struct ceph_osd_client *osdc,
 
 	dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
 	     req, req->r_tid, req->r_osd->o_osd, req->r_flags,
-	     (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
+	     req->r_t.pgid.pool, req->r_t.pgid.seed);
 
 	/* fill in message content that changes each time we send it */
 	put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
 	put_unaligned_le32(req->r_flags, req->r_request_flags);
-	put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
+	put_unaligned_le64(req->r_t.target_oloc.pool, req->r_request_pool);
 	p = req->r_request_pgid;
-	ceph_encode_64(&p, req->r_pgid.pool);
-	ceph_encode_32(&p, req->r_pgid.seed);
+	ceph_encode_64(&p, req->r_t.pgid.pool);
+	ceph_encode_32(&p, req->r_t.pgid.seed);
 	put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
 	memcpy(req->r_request_reassert_version, &req->r_reassert_version,
 	       sizeof(req->r_reassert_version));
@@ -1963,7 +1890,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 
 		__unregister_request(osdc, req);
 
-		ceph_oloc_copy(&req->r_target_oloc, &redir.oloc);
+		ceph_oloc_copy(&req->r_t.target_oloc, &redir.oloc);
 
 		/*
 		 * Start redirect requests with nofail=true. If
-- 
cgit v1.2.3


From bb873b539154ab51893430b4ad6ba4051775276a Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Thu, 26 May 2016 00:29:52 +0200
Subject: libceph: switch to calc_target(), part 2

The crux of this is getting rid of ceph_osdc_build_request(), so that
MOSDOp can be encoded not before but after calc_target() calculates the
actual target. Encoding now happens within ceph_osdc_start_request().

Also nuked is the accompanying bunch of pointers into the encoded
buffer that was used to update fields on each send - instead, the
entire front is re-encoded. If we want to support target->name_len !=
base->name_len in the future, there is no other way, because oid is
surrounded by other fields in the encoded buffer.

Encoding OSD ops and adding data items to the request message were
mixed together in osd_req_encode_op(). While we want to re-encode OSD
ops, we don't want to add duplicate data items to the message when
resending, so all calls to ceph_osdc_msg_data_add() are factored out
into a new setup_request_data().
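For callers, the net effect (visible in the fs/ceph and rbd hunks below) is that nothing is encoded at allocation time anymore: the submitter fills in r_mtime, r_snapid, r_snapc and r_data_offset as needed and lets ceph_osdc_start_request() do the targeting and encoding. A hedged sketch of the new convention, modelled on the ceph_osdc_writepages() hunk below (error handling trimmed; local names as in that function):

	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
				    CEPH_OSD_OP_WRITE,
				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
				    snapc, truncate_seq, truncate_size, true);
	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
					 false, false);

	/* was: ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); */
	req->r_mtime = *mtime;	/* encoded into the MOSDOp front at send time */
	rc = ceph_osdc_start_request(osdc, req, true);	/* calc_target() + encode_request() */
	if (!rc)
		rc = ceph_osdc_wait_request(osdc, req);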
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 18 +- fs/ceph/addr.c | 16 +- fs/ceph/file.c | 16 +- include/linux/ceph/osd_client.h | 29 ++-- include/linux/ceph/rados.h | 7 + net/ceph/debugfs.c | 61 ++++--- net/ceph/osd_client.c | 355 ++++++++++++++++++++-------------------- 7 files changed, 247 insertions(+), 255 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f3ea927f93de..0e598916e048 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1896,27 +1896,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request = obj_request->img_request; struct ceph_osd_request *osd_req = obj_request->osd_req; - u64 snap_id; - - rbd_assert(osd_req != NULL); - snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; - ceph_osdc_build_request(osd_req, obj_request->offset, - NULL, snap_id, NULL); + if (img_request) + osd_req->r_snapid = img_request->snap_id; } static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) { - struct rbd_img_request *img_request = obj_request->img_request; struct ceph_osd_request *osd_req = obj_request->osd_req; - struct ceph_snap_context *snapc; - struct timespec mtime = CURRENT_TIME; - - rbd_assert(osd_req != NULL); - snapc = img_request ? img_request->snapc : NULL; - ceph_osdc_build_request(osd_req, obj_request->offset, - snapc, CEPH_NOSNAP, &mtime); + osd_req->r_mtime = CURRENT_TIME; + osd_req->r_data_offset = obj_request->offset; } /* diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c5d75486823b..59b3c3fbd3bd 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) req->r_callback = finish_read; req->r_inode = inode; - ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -1063,10 +1061,7 @@ new_request: pages = NULL; } - vino = ceph_vino(inode); - ceph_osdc_build_request(req, offset, snapc, vino.snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); req = NULL; @@ -1614,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1657,7 +1652,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out_put; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1790,12 +1785,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); - ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); + wr_req->r_mtime = ci->vfs_inode.i_mtime; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 36b4a41dfa67..52e4b72dd5de 100644 --- 
a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -727,8 +727,8 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_ops[0] = orig_req->r_ops[0]; osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - ceph_osdc_build_request(req, req->r_ops[0].extent.offset, - snapc, CEPH_NOSNAP, &aio_req->mtime); + req->r_mtime = aio_req->mtime; + req->r_data_offset = req->r_ops[0].extent.offset; ceph_osdc_put_request(orig_req); @@ -882,14 +882,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (pos+len) | (PAGE_SIZE - 1)); osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + req->r_mtime = mtime; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, false, false); - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; @@ -1074,9 +1072,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, true); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - + req->r_mtime = mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1532,9 +1528,7 @@ static int ceph_zero_partial_object(struct inode *inode, goto out; } - ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { ret = ceph_osdc_wait_request(&fsc->client->osdc, req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 03bf9d9e1517..67a37d98e0ca 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -104,7 +104,7 @@ struct ceph_osd_req_op { struct ceph_osd_data response_data; __u8 class_len; __u8 method_len; - __u8 argc; + u32 indata_len; } cls; struct { u64 cookie; @@ -162,14 +162,6 @@ struct ceph_osd_request { /* request osd ops array */ unsigned int r_num_ops; - /* these are updated on each send */ - __le32 *r_request_osdmap_epoch; - __le32 *r_request_flags; - __le64 *r_request_pool; - void *r_request_pgid; - __le32 *r_request_attempts; - struct ceph_eversion *r_request_reassert_version; - int r_result; int r_got_reply; int r_linger; @@ -180,16 +172,22 @@ struct ceph_osd_request { struct completion r_completion, r_safe_completion; ceph_osdc_callback_t r_callback; ceph_osdc_unsafe_callback_t r_unsafe_callback; - struct ceph_eversion r_reassert_version; struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ - u64 r_snapid; - unsigned long r_stamp; /* send OR check time */ + /* set by submitter */ + u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */ + struct ceph_snap_context *r_snapc; /* for writes */ + struct timespec r_mtime; /* ditto */ + u64 r_data_offset; /* ditto */ - struct ceph_snap_context *r_snapc; /* snap context for writes */ + /* internal */ + unsigned long r_stamp; /* jiffies, send or check time */ + int r_attempts; + struct ceph_eversion r_replay_version; /* aka reassert_version */ + u32 r_last_force_resend; struct ceph_osd_req_op r_ops[]; }; @@ -334,11 +332,6 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * gfp_t gfp_flags); int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); -extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - struct ceph_snap_context *snapc, - u64 snap_id, - struct timespec 
*mtime); - extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index f28ed864e682..28740a58f32c 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -394,6 +394,13 @@ enum { CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ + CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */ + CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if + pool uses pool snaps */ + CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */ + CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */ + CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */ + CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */ }; enum { diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 0c11ab5f8c30..6d3ff713edeb 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -145,6 +145,43 @@ static int monc_show(struct seq_file *s, void *p) return 0; } +static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) +{ + int i; + + seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed); + for (i = 0; i < t->up.size; i++) + seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]); + seq_printf(s, "]/%d\t[", t->up.primary); + for (i = 0; i < t->acting.size; i++) + seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]); + seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary, + t->target_oid.name_len, t->target_oid.name, t->flags); + if (t->paused) + seq_puts(s, "\tP"); +} + +static void dump_request(struct seq_file *s, struct ceph_osd_request *req) +{ + int i; + + seq_printf(s, "%llu\t", req->r_tid); + dump_target(s, &req->r_t); + + seq_printf(s, "\t%d\t%u'%llu", req->r_attempts, + le32_to_cpu(req->r_replay_version.epoch), + le64_to_cpu(req->r_replay_version.version)); + + for (i = 0; i < req->r_num_ops; i++) { + struct ceph_osd_req_op *op = &req->r_ops[i]; + + seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), + ceph_osd_op_name(op->op)); + } + + seq_putc(s, '\n'); +} + static int osdc_show(struct seq_file *s, void *pp) { struct ceph_client *client = s->private; @@ -154,32 +191,10 @@ static int osdc_show(struct seq_file *s, void *pp) mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { struct ceph_osd_request *req; - unsigned int i; - int opcode; req = rb_entry(p, struct ceph_osd_request, r_node); - seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1, - req->r_t.pgid.pool, req->r_t.pgid.seed); - - seq_printf(s, "%*pE", req->r_base_oid.name_len, - req->r_base_oid.name); - - if (req->r_reassert_version.epoch) - seq_printf(s, "\t%u'%llu", - (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), - le64_to_cpu(req->r_reassert_version.version)); - else - seq_printf(s, "\t"); - - for (i = 0; i < req->r_num_ops; i++) { - opcode = req->r_ops[i].op; - seq_printf(s, "%s%s", (i == 0 ? 
"\t" : ","), - ceph_osd_op_name(opcode)); - } - - seq_printf(s, "\n"); + dump_request(s, req); } mutex_unlock(&osdc->request_mutex); return 0; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 013101598c41..8a008f083283 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -34,8 +34,6 @@ static void __unregister_request(struct ceph_osd_client *osdc, static void __unregister_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); static void __enqueue_request(struct ceph_osd_request *req); -static void __send_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); /* * Implement client access to distributed object storage cluster. @@ -209,6 +207,8 @@ void osd_req_op_cls_request_data_pagelist( osd_data = osd_req_op_data(osd_req, which, cls, request_data); ceph_osd_data_pagelist_init(osd_data, pagelist); + osd_req->r_ops[which].cls.indata_len += pagelist->length; + osd_req->r_ops[which].indata_len += pagelist->length; } EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); @@ -221,6 +221,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, osd_data = osd_req_op_data(osd_req, which, cls, request_data); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); + osd_req->r_ops[which].cls.indata_len += length; + osd_req->r_ops[which].indata_len += length; } EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); @@ -610,8 +612,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); - op->cls.argc = 0; /* currently unused */ - op->indata_len = payload_len; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -709,16 +709,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, } } -static u64 osd_req_encode_op(struct ceph_osd_request *req, - struct ceph_osd_op *dst, unsigned int which) +static u32 osd_req_encode_op(struct ceph_osd_op *dst, + const struct ceph_osd_req_op *src) { - struct ceph_osd_req_op *src; - struct ceph_osd_data *osd_data; - u64 request_data_len = 0; - u64 data_length; - - BUG_ON(which >= req->r_num_ops); - src = &req->r_ops[which]; if (WARN_ON(!osd_req_opcode_valid(src->op))) { pr_err("unrecognized osd opcode %d\n", src->op); @@ -727,49 +720,23 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, switch (src->op) { case CEPH_OSD_OP_STAT: - osd_data = &src->raw_data_in; - ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_ZERO: case CEPH_OSD_OP_TRUNCATE: - if (src->op == CEPH_OSD_OP_WRITE || - src->op == CEPH_OSD_OP_WRITEFULL) - request_data_len = src->extent.length; dst->extent.offset = cpu_to_le64(src->extent.offset); dst->extent.length = cpu_to_le64(src->extent.length); dst->extent.truncate_size = cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); - osd_data = &src->extent.osd_data; - if (src->op == CEPH_OSD_OP_WRITE || - src->op == CEPH_OSD_OP_WRITEFULL) - ceph_osdc_msg_data_add(req->r_request, osd_data); - else - ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_CALL: dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; - osd_data = &src->cls.request_info; - ceph_osdc_msg_data_add(req->r_request, osd_data); - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); - request_data_len = osd_data->pagelist->length; - - osd_data = &src->cls.request_data; - data_length = 
ceph_osd_data_length(osd_data); - if (data_length) { - BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); - dst->cls.indata_len = cpu_to_le32(data_length); - ceph_osdc_msg_data_add(req->r_request, osd_data); - src->indata_len += data_length; - request_data_len += data_length; - } - osd_data = &src->cls.response_data; - ceph_osdc_msg_data_add(req->r_reply, osd_data); + dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); break; case CEPH_OSD_OP_STARTSYNC: break; @@ -791,9 +758,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); dst->xattr.cmp_op = src->xattr.cmp_op; dst->xattr.cmp_mode = src->xattr.cmp_mode; - osd_data = &src->xattr.osd_data; - ceph_osdc_msg_data_add(req->r_request, osd_data); - request_data_len = osd_data->pagelist->length; break; case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_DELETE: @@ -810,7 +774,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->indata_len); - return request_data_len; + return src->indata_len; } /* @@ -852,8 +816,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, goto fail; } - req->r_flags = flags; - /* calculate max write size */ r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); if (r) @@ -877,9 +839,14 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } + req->r_flags = flags; req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); + req->r_snapid = vino.snap; + if (flags & CEPH_OSD_FLAG_WRITE) + req->r_data_offset = off; + r = ceph_osdc_alloc_messages(req, GFP_NOFS); if (r) goto fail; @@ -1509,37 +1476,173 @@ out: return err; } -/* - * caller should hold map_sem (for read) and request_mutex - */ -static void __send_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static void setup_request_data(struct ceph_osd_request *req, + struct ceph_msg *msg) { - void *p; + u32 data_len = 0; + int i; + + if (!list_empty(&msg->data)) + return; - dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", - req, req->r_tid, req->r_osd->o_osd, req->r_flags, - req->r_t.pgid.pool, req->r_t.pgid.seed); + WARN_ON(msg->data_length); + for (i = 0; i < req->r_num_ops; i++) { + struct ceph_osd_req_op *op = &req->r_ops[i]; + + switch (op->op) { + /* request */ + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + WARN_ON(op->indata_len != op->extent.length); + ceph_osdc_msg_data_add(msg, &op->extent.osd_data); + break; + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + WARN_ON(op->indata_len != op->xattr.name_len + + op->xattr.value_len); + ceph_osdc_msg_data_add(msg, &op->xattr.osd_data); + break; + + /* reply */ + case CEPH_OSD_OP_STAT: + ceph_osdc_msg_data_add(req->r_reply, + &op->raw_data_in); + break; + case CEPH_OSD_OP_READ: + ceph_osdc_msg_data_add(req->r_reply, + &op->extent.osd_data); + break; + + /* both */ + case CEPH_OSD_OP_CALL: + WARN_ON(op->indata_len != op->cls.class_len + + op->cls.method_len + + op->cls.indata_len); + ceph_osdc_msg_data_add(msg, &op->cls.request_info); + /* optional, can be NONE */ + ceph_osdc_msg_data_add(msg, &op->cls.request_data); + /* optional, can be NONE */ + ceph_osdc_msg_data_add(req->r_reply, + &op->cls.response_data); + break; + } + + data_len += op->indata_len; + } - /* fill in message content that changes each time we send it */ - 
put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); - put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_t.target_oloc.pool, req->r_request_pool); - p = req->r_request_pgid; + WARN_ON(data_len != msg->data_length); +} + +static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) +{ + void *p = msg->front.iov_base; + void *const end = p + msg->front_alloc_len; + u32 data_len = 0; + int i; + + if (req->r_flags & CEPH_OSD_FLAG_WRITE) { + /* snapshots aren't writeable */ + WARN_ON(req->r_snapid != CEPH_NOSNAP); + } else { + WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec || + req->r_data_offset || req->r_snapc); + } + + setup_request_data(req, msg); + + ceph_encode_32(&p, 1); /* client_inc, always 1 */ + ceph_encode_32(&p, req->r_osdc->osdmap->epoch); + ceph_encode_32(&p, req->r_flags); + ceph_encode_timespec(p, &req->r_mtime); + p += sizeof(struct ceph_timespec); + /* aka reassert_version */ + memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version)); + p += sizeof(req->r_replay_version); + + /* oloc */ + ceph_encode_8(&p, 4); + ceph_encode_8(&p, 4); + ceph_encode_32(&p, 8 + 4 + 4); + ceph_encode_64(&p, req->r_t.target_oloc.pool); + ceph_encode_32(&p, -1); /* preferred */ + ceph_encode_32(&p, 0); /* key len */ + + /* pgid */ + ceph_encode_8(&p, 1); ceph_encode_64(&p, req->r_t.pgid.pool); ceph_encode_32(&p, req->r_t.pgid.seed); - put_unaligned_le64(1, req->r_request_attempts); /* FIXME */ - memcpy(req->r_request_reassert_version, &req->r_reassert_version, - sizeof(req->r_reassert_version)); + ceph_encode_32(&p, -1); /* preferred */ - req->r_stamp = jiffies; - list_move_tail(&req->r_req_lru_item, &osdc->req_lru); + /* oid */ + ceph_encode_32(&p, req->r_t.target_oid.name_len); + memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len); + p += req->r_t.target_oid.name_len; - ceph_msg_get(req->r_request); /* send consumes a ref */ + /* ops, can imply data */ + ceph_encode_16(&p, req->r_num_ops); + for (i = 0; i < req->r_num_ops; i++) { + data_len += osd_req_encode_op(p, &req->r_ops[i]); + p += sizeof(struct ceph_osd_op); + } - req->r_sent = req->r_osd->o_incarnation; + ceph_encode_64(&p, req->r_snapid); /* snapid */ + if (req->r_snapc) { + ceph_encode_64(&p, req->r_snapc->seq); + ceph_encode_32(&p, req->r_snapc->num_snaps); + for (i = 0; i < req->r_snapc->num_snaps; i++) + ceph_encode_64(&p, req->r_snapc->snaps[i]); + } else { + ceph_encode_64(&p, 0); /* snap_seq */ + ceph_encode_32(&p, 0); /* snaps len */ + } + + ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ + + BUG_ON(p > end); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + msg->hdr.data_len = cpu_to_le32(data_len); + /* + * The header "data_off" is a hint to the receiver allowing it + * to align received data into its buffers such that there's no + * need to re-copy it before writing it to disk (direct I/O). + */ + msg->hdr.data_off = cpu_to_le16(req->r_data_offset); - ceph_con_send(&req->r_osd->o_con, req->r_request); + dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__, + req, req->r_t.target_oid.name_len, req->r_t.target_oid.name, + req->r_t.target_oid.name_len, msg->front.iov_len, data_len); +} + +/* + * @req has to be assigned a tid and registered. 
+ */ +static void send_request(struct ceph_osd_request *req) +{ + struct ceph_osd *osd = req->r_osd; + + WARN_ON(osd->o_osd != req->r_t.osd); + + req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR; + if (req->r_attempts) + req->r_flags |= CEPH_OSD_FLAG_RETRY; + else + WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY); + + encode_request(req, req->r_request); + + dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n", + __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, + req->r_t.osd, req->r_flags, req->r_attempts); + + req->r_t.paused = false; + req->r_stamp = jiffies; + req->r_attempts++; + + req->r_sent = osd->o_incarnation; + req->r_request->hdr.tid = cpu_to_le64(req->r_tid); + ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request)); } /* @@ -1550,8 +1653,10 @@ static void __send_queued(struct ceph_osd_client *osdc) struct ceph_osd_request *req, *tmp; dout("__send_queued\n"); - list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) - __send_request(osdc, req); + list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { + list_move_tail(&req->r_req_lru_item, &osdc->req_lru); + send_request(req); + } } /* @@ -1915,8 +2020,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) req->r_result = bytes; /* in case this is a write and we need to replay, */ - req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch); - req->r_reassert_version.version = cpu_to_le64(reassert_version); + req->r_replay_version.epoch = cpu_to_le32(reassert_epoch); + req->r_replay_version.version = cpu_to_le64(reassert_version); req->r_got_reply = 1; } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { @@ -2432,105 +2537,6 @@ bad: pr_err("osdc handle_watch_notify corrupt msg\n"); } -/* - * build new request AND message - * - */ -void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - struct ceph_snap_context *snapc, u64 snap_id, - struct timespec *mtime) -{ - struct ceph_msg *msg = req->r_request; - void *p; - size_t msg_size; - int flags = req->r_flags; - u64 data_len; - unsigned int i; - - req->r_snapid = snap_id; - WARN_ON(snapc != req->r_snapc); - - /* encode request */ - msg->hdr.version = cpu_to_le16(4); - - p = msg->front.iov_base; - ceph_encode_32(&p, 1); /* client_inc is always 1 */ - req->r_request_osdmap_epoch = p; - p += 4; - req->r_request_flags = p; - p += 4; - if (req->r_flags & CEPH_OSD_FLAG_WRITE) - ceph_encode_timespec(p, mtime); - p += sizeof(struct ceph_timespec); - req->r_request_reassert_version = p; - p += sizeof(struct ceph_eversion); /* will get filled in */ - - /* oloc */ - ceph_encode_8(&p, 4); - ceph_encode_8(&p, 4); - ceph_encode_32(&p, 8 + 4 + 4); - req->r_request_pool = p; - p += 8; - ceph_encode_32(&p, -1); /* preferred */ - ceph_encode_32(&p, 0); /* key len */ - - ceph_encode_8(&p, 1); - req->r_request_pgid = p; - p += 8 + 4; - ceph_encode_32(&p, -1); /* preferred */ - - /* oid */ - ceph_encode_32(&p, req->r_base_oid.name_len); - memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); - dout("oid %*pE len %d\n", req->r_base_oid.name_len, - req->r_base_oid.name, req->r_base_oid.name_len); - p += req->r_base_oid.name_len; - - /* ops--can imply data */ - ceph_encode_16(&p, (u16)req->r_num_ops); - data_len = 0; - for (i = 0; i < req->r_num_ops; i++) { - data_len += osd_req_encode_op(req, p, i); - p += sizeof(struct ceph_osd_op); - } - - /* snaps */ - ceph_encode_64(&p, req->r_snapid); - ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); - ceph_encode_32(&p, req->r_snapc ? 
req->r_snapc->num_snaps : 0); - if (req->r_snapc) { - for (i = 0; i < req->r_snapc->num_snaps; i++) { - ceph_encode_64(&p, req->r_snapc->snaps[i]); - } - } - - req->r_request_attempts = p; - p += 4; - - /* data */ - if (flags & CEPH_OSD_FLAG_WRITE) { - u16 data_off; - - /* - * The header "data_off" is a hint to the receiver - * allowing it to align received data into its - * buffers such that there's no need to re-copy - * it before writing it to disk (direct I/O). - */ - data_off = (u16) (off & 0xffff); - req->r_request->hdr.data_off = cpu_to_le16(data_off); - } - req->r_request->hdr.data_len = cpu_to_le32(data_len); - - BUG_ON(p > msg->front.iov_base + msg->front.iov_len); - msg_size = p - msg->front.iov_base; - msg->front.iov_len = msg_size; - msg->hdr.front_len = cpu_to_le32(msg_size); - - dout("build_request msg_size was %d\n", (int)msg_size); -} -EXPORT_SYMBOL(ceph_osdc_build_request); - /* * Register request, send initial attempt. */ @@ -2749,15 +2755,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, return PTR_ERR(req); /* it may be a short read due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, pages, *plen, page_align, false, false); dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", off, *plen, *plen, page_align); - ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); - rc = ceph_osdc_start_request(osdc, req, false); if (!rc) rc = ceph_osdc_wait_request(osdc, req); @@ -2783,7 +2786,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, int rc = 0; int page_align = off & ~PAGE_MASK; - BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, @@ -2797,8 +2799,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, false, false); dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); - ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); - + req->r_mtime = *mtime; rc = ceph_osdc_start_request(osdc, req, true); if (!rc) rc = ceph_osdc_wait_request(osdc, req); -- cgit v1.2.3 From 85e084feb47349d62989efe1713a8723af95f4ea Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:24 +0200 Subject: libceph: drop msg argument from ceph_osdc_callback_t finish_read(), its only user, uses it to get to hdr.data_len, which is what ->r_result is set to on success. This gains us the ability to safely call callbacks from contexts other than reply, e.g. map check. 
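Concretely, the callback can compute both values from r_result alone; a sketch of the before and after in finish_read(), reduced to the result handling (the _old/_new suffixes exist only for this sketch, and the page handling is elided):

	/* before: the result and the data length came from two places */
	static void finish_read_old(struct ceph_osd_request *req,
				    struct ceph_msg *msg)
	{
		int rc = req->r_result;
		int bytes = le32_to_cpu(msg->hdr.data_len);

		/* unlock and mark up to @bytes worth of pages */
	}

	/* after: r_result is hdr.data_len on success, a -errno on failure */
	static void finish_read_new(struct ceph_osd_request *req)
	{
		int rc = req->r_result <= 0 ? req->r_result : 0;
		int bytes = req->r_result >= 0 ? req->r_result : 0;

		/* same page handling, no ceph_msg needed */
	}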
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 5 ++--- fs/ceph/addr.c | 9 ++++----- fs/ceph/file.c | 7 +++---- include/linux/ceph/osd_client.h | 3 +-- net/ceph/osd_client.c | 4 ++-- 5 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0e598916e048..82b03aa509e6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1828,13 +1828,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) obj_request_done_set(obj_request); } -static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, - struct ceph_msg *msg) +static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) { struct rbd_obj_request *obj_request = osd_req->r_priv; u16 opcode; - dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); + dout("%s: osd_req %p\n", __func__, osd_req); rbd_assert(osd_req == obj_request->osd_req); if (obj_request_img_data_test(obj_request)) { rbd_assert(obj_request->img_request); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 59b3c3fbd3bd..a11756a39471 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page) /* * Finish an async read(ahead) op. */ -static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) +static void finish_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_osd_data *osd_data; - int rc = req->r_result; - int bytes = le32_to_cpu(msg->hdr.data_len); + int rc = req->r_result <= 0 ? req->r_result : 0; + int bytes = req->r_result >= 0 ? req->r_result : 0; int num_pages; int i; @@ -598,8 +598,7 @@ static void ceph_release_pages(struct page **pages, int num) * If we get an error, set the mapping error bit, but not the individual * page error bits. 
*/ -static void writepages_finish(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void writepages_finish(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 52e4b72dd5de..e75fd0b028e9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -616,8 +616,7 @@ static void ceph_aio_complete(struct inode *inode, kfree(aio_req); } -static void ceph_aio_complete_req(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_aio_complete_req(struct ceph_osd_request *req) { int rc = req->r_result; struct inode *inode = req->r_inode; @@ -740,7 +739,7 @@ static void ceph_aio_retry_work(struct work_struct *work) out: if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } ceph_put_snap_context(snapc); @@ -961,7 +960,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req, false); if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } } return -EIOCBQUEUED; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 67a37d98e0ca..3bebd60e7f9f 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -20,8 +20,7 @@ struct ceph_osd_client; /* * completion callback for async writepages */ -typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, - struct ceph_msg *); +typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); #define CEPH_HOMELESS_OSD -1 diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 8a008f083283..2a30c0bb3045 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2048,7 +2048,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) req->r_unsafe_callback(req, true); if (req->r_callback) - req->r_callback(req, msg); + req->r_callback(req); else complete_all(&req->r_completion); } @@ -2072,7 +2072,7 @@ bad_put: req->r_result = -EIO; __unregister_request(osdc, req); if (req->r_callback) - req->r_callback(req, msg); + req->r_callback(req); else complete_all(&req->r_completion); complete_request(req); -- cgit v1.2.3 From fe5da05e979830b43b115d8a18ead521d507c783 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:24 +0200 Subject: libceph: redo callbacks and factor out MOSDOpReply decoding If you specify ACK | ONDISK and set ->r_unsafe_callback, both ->r_callback and ->r_unsafe_callback(true) are called on ack. This is very confusing. Redo this so that only one of them is called: ->r_unsafe_callback(true), on ack ->r_unsafe_callback(false), on commit or ->r_callback, on ack|commit Decode everything in decode_MOSDOpReply() to reduce clutter. 
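The resulting dispatch rests on two small helpers, quoted from the hunks below with comments added: done_request() decides whether a given reply finishes the request, and __complete_request() fires the request's terminal callback (or wakes a synchronous waiter):

	static bool done_request(const struct ceph_osd_request *req,
				 const struct MOSDOpReply *m)
	{
		return (m->result < 0 ||			 /* error */
			(m->flags & CEPH_OSD_FLAG_ONDISK) ||	 /* safe reply */
			!(req->r_flags & CEPH_OSD_FLAG_ONDISK)); /* none wanted */
	}

	static void __complete_request(struct ceph_osd_request *req)
	{
		if (req->r_callback)
			req->r_callback(req);
		else
			complete_all(&req->r_completion);
	}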
Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 3 +- fs/ceph/file.c | 2 + include/linux/ceph/osd_client.h | 5 +- net/ceph/osd_client.c | 362 +++++++++++++++++++++++----------------- 4 files changed, 215 insertions(+), 157 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a11756a39471..f47418477629 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1765,8 +1765,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) goto out_unlock; } - wr_req->r_flags = CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e75fd0b028e9..30fd49eb25b4 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -770,6 +770,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) list_add_tail(&req->r_unsafe_item, &ci->i_unsafe_writes); spin_unlock(&ci->i_unsafe_lock); + + complete_all(&req->r_completion); } else { spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_item); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3bebd60e7f9f..2415dc0cb008 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -162,13 +162,14 @@ struct ceph_osd_request { unsigned int r_num_ops; int r_result; - int r_got_reply; + bool r_got_reply; int r_linger; struct ceph_osd_client *r_osdc; struct kref r_kref; bool r_mempool; - struct completion r_completion, r_safe_completion; + struct completion r_completion; + struct completion r_safe_completion; /* fsync waiter */ ceph_osdc_callback_t r_callback; ceph_osdc_unsafe_callback_t r_unsafe_callback; struct list_head r_unsafe_item; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2a30c0bb3045..baf2844b00d6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1693,6 +1693,14 @@ static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, return 0; } +static void __complete_request(struct ceph_osd_request *req) +{ + if (req->r_callback) + req->r_callback(req); + else + complete_all(&req->r_completion); +} + /* * Timeout callback, called every N seconds when 1 or more osd * requests has been active for more than N seconds. When this @@ -1875,107 +1883,76 @@ e_inval: goto out; } -static void complete_request(struct ceph_osd_request *req) -{ - complete_all(&req->r_safe_completion); /* fsync waiter */ -} +struct MOSDOpReply { + struct ceph_pg pgid; + u64 flags; + int result; + u32 epoch; + int num_ops; + u32 outdata_len[CEPH_OSD_MAX_OPS]; + s32 rval[CEPH_OSD_MAX_OPS]; + int retry_attempt; + struct ceph_eversion replay_version; + u64 user_version; + struct ceph_request_redirect redirect; +}; -/* - * handle osd op reply. either call the callback if it is specified, - * or do the completion to wake up the waiting thread. 
- */ -static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) +static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m) { - void *p, *end; - struct ceph_osd_request *req; - struct ceph_request_redirect redir; - u64 tid; - int object_len; - unsigned int numops; - int payload_len, flags; - s32 result; - s32 retry_attempt; - struct ceph_pg pg; - int err; - u32 reassert_epoch; - u64 reassert_version; - u32 osdmap_epoch; - int already_completed; - u32 bytes; + void *p = msg->front.iov_base; + void *const end = p + msg->front.iov_len; + u16 version = le16_to_cpu(msg->hdr.version); + struct ceph_eversion bad_replay_version; u8 decode_redir; - unsigned int i; - - tid = le64_to_cpu(msg->hdr.tid); - dout("handle_reply %p tid %llu\n", msg, tid); - - p = msg->front.iov_base; - end = p + msg->front.iov_len; + u32 len; + int ret; + int i; - ceph_decode_need(&p, end, 4, bad); - object_len = ceph_decode_32(&p); - ceph_decode_need(&p, end, object_len, bad); - p += object_len; + ceph_decode_32_safe(&p, end, len, e_inval); + ceph_decode_need(&p, end, len, e_inval); + p += len; /* skip oid */ - err = ceph_decode_pgid(&p, end, &pg); - if (err) - goto bad; + ret = ceph_decode_pgid(&p, end, &m->pgid); + if (ret) + return ret; - ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); - flags = ceph_decode_64(&p); - result = ceph_decode_32(&p); - reassert_epoch = ceph_decode_32(&p); - reassert_version = ceph_decode_64(&p); - osdmap_epoch = ceph_decode_32(&p); + ceph_decode_64_safe(&p, end, m->flags, e_inval); + ceph_decode_32_safe(&p, end, m->result, e_inval); + ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval); + memcpy(&bad_replay_version, p, sizeof(bad_replay_version)); + p += sizeof(bad_replay_version); + ceph_decode_32_safe(&p, end, m->epoch, e_inval); - /* lookup */ - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - req = lookup_request(&osdc->requests, tid); - if (req == NULL) { - dout("handle_reply tid %llu dne\n", tid); - goto bad_mutex; - } - ceph_osdc_get_request(req); + ceph_decode_32_safe(&p, end, m->num_ops, e_inval); + if (m->num_ops > ARRAY_SIZE(m->outdata_len)) + goto e_inval; - dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, - req, result); - - ceph_decode_need(&p, end, 4, bad_put); - numops = ceph_decode_32(&p); - if (numops > CEPH_OSD_MAX_OPS) - goto bad_put; - if (numops != req->r_num_ops) - goto bad_put; - payload_len = 0; - ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put); - for (i = 0; i < numops; i++) { + ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op), + e_inval); + for (i = 0; i < m->num_ops; i++) { struct ceph_osd_op *op = p; - int len; - len = le32_to_cpu(op->payload_len); - req->r_ops[i].outdata_len = len; - dout(" op %d has %d bytes\n", i, len); - payload_len += len; + m->outdata_len[i] = le32_to_cpu(op->payload_len); p += sizeof(*op); } - bytes = le32_to_cpu(msg->hdr.data_len); - if (payload_len != bytes) { - pr_warn("sum of op payload lens %d != data_len %d\n", - payload_len, bytes); - goto bad_put; - } - ceph_decode_need(&p, end, 4 + numops * 4, bad_put); - retry_attempt = ceph_decode_32(&p); - for (i = 0; i < numops; i++) - req->r_ops[i].rval = ceph_decode_32(&p); + ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval); + for (i = 0; i < m->num_ops; i++) + ceph_decode_32_safe(&p, end, m->rval[i], e_inval); - if (le16_to_cpu(msg->hdr.version) >= 6) { - p += 8 + 4; /* skip replay_version */ - p += 8; /* skip user_version */ + if (version >= 5) { + 
ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval); + memcpy(&m->replay_version, p, sizeof(m->replay_version)); + p += sizeof(m->replay_version); + ceph_decode_64_safe(&p, end, m->user_version, e_inval); + } else { + m->replay_version = bad_replay_version; /* struct */ + m->user_version = le64_to_cpu(m->replay_version.version); + } - if (le16_to_cpu(msg->hdr.version) >= 7) - ceph_decode_8_safe(&p, end, decode_redir, bad_put); + if (version >= 6) { + if (version >= 7) + ceph_decode_8_safe(&p, end, decode_redir, e_inval); else decode_redir = 1; } else { @@ -1983,19 +1960,96 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) } if (decode_redir) { - err = ceph_redirect_decode(&p, end, &redir); - if (err) - goto bad_put; + ret = ceph_redirect_decode(&p, end, &m->redirect); + if (ret) + return ret; } else { - redir.oloc.pool = -1; + ceph_oloc_init(&m->redirect.oloc); } - if (!ceph_oloc_empty(&redir.oloc)) { - dout("redirect pool %lld\n", redir.oloc.pool); + return 0; + +e_inval: + return -EINVAL; +} + +/* + * We are done with @req if + * - @m is a safe reply, or + * - @m is an unsafe reply and we didn't want a safe one + */ +static bool done_request(const struct ceph_osd_request *req, + const struct MOSDOpReply *m) +{ + return (m->result < 0 || + (m->flags & CEPH_OSD_FLAG_ONDISK) || + !(req->r_flags & CEPH_OSD_FLAG_ONDISK)); +} +/* + * handle osd op reply. either call the callback if it is specified, + * or do the completion to wake up the waiting thread. + * + * ->r_unsafe_callback is set? yes no + * + * first reply is OK (needed r_cb/r_completion, r_cb/r_completion, + * any or needed/got safe) r_safe_completion r_safe_completion + * + * first reply is unsafe r_unsafe_cb(true) (nothing) + * + * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, + * r_safe_completion r_safe_completion + */ +static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) +{ + struct ceph_osd_request *req; + struct MOSDOpReply m; + u64 tid = le64_to_cpu(msg->hdr.tid); + u32 data_len = 0; + bool already_acked; + int ret; + int i; + + dout("%s msg %p tid %llu\n", __func__, msg, tid); + + down_read(&osdc->map_sem); + mutex_lock(&osdc->request_mutex); + req = lookup_request(&osdc->requests, tid); + if (!req) { + dout("%s no tid %llu\n", __func__, tid); + goto out_unlock; + } + ceph_osdc_get_request(req); + + ret = decode_MOSDOpReply(msg, &m); + if (ret) { + pr_err("failed to decode MOSDOpReply for tid %llu: %d\n", + req->r_tid, ret); + ceph_msg_dump(msg); + goto fail_request; + } + dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n", + __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed, + m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch), + le64_to_cpu(m.replay_version.version), m.user_version); + + if (m.retry_attempt >= 0) { + if (m.retry_attempt != req->r_attempts - 1) { + dout("req %p tid %llu retry_attempt %d != %d, ignoring\n", + req, req->r_tid, m.retry_attempt, + req->r_attempts - 1); + goto out_put; + } + } else { + WARN_ON(1); /* MOSDOpReply v4 is assumed */ + } + + if (!ceph_oloc_empty(&m.redirect.oloc)) { + dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid, + m.redirect.oloc.pool); __unregister_request(osdc, req); - ceph_oloc_copy(&req->r_t.target_oloc, &redir.oloc); + ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc); /* * Start redirect requests with nofail=true. 
If @@ -2005,85 +2059,85 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) * successfully. In the future we might want to follow * original request's nofail setting here. */ - err = __ceph_osdc_start_request(osdc, req, true); - BUG_ON(err); + ret = __ceph_osdc_start_request(osdc, req, true); + BUG_ON(ret); - goto out_unlock; + goto out_put; } - already_completed = req->r_got_reply; - if (!req->r_got_reply) { - req->r_result = result; - dout("handle_reply result %d bytes %d\n", req->r_result, - bytes); - if (req->r_result == 0) - req->r_result = bytes; - - /* in case this is a write and we need to replay, */ - req->r_replay_version.epoch = cpu_to_le32(reassert_epoch); - req->r_replay_version.version = cpu_to_le64(reassert_version); - - req->r_got_reply = 1; - } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { - dout("handle_reply tid %llu dup ack\n", tid); - goto out_unlock; + if (m.num_ops != req->r_num_ops) { + pr_err("num_ops %d != %d for tid %llu\n", m.num_ops, + req->r_num_ops, req->r_tid); + goto fail_request; } - - dout("handle_reply tid %llu flags %d\n", tid, flags); - - if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) - __register_linger_request(osdc, req); - - /* either this is a read, or we got the safe response */ - if (result < 0 || - (flags & CEPH_OSD_FLAG_ONDISK) || - ((flags & CEPH_OSD_FLAG_WRITE) == 0)) + for (i = 0; i < req->r_num_ops; i++) { + dout(" req %p tid %llu op %d rval %d len %u\n", req, + req->r_tid, i, m.rval[i], m.outdata_len[i]); + req->r_ops[i].rval = m.rval[i]; + req->r_ops[i].outdata_len = m.outdata_len[i]; + data_len += m.outdata_len[i]; + } + if (data_len != le32_to_cpu(msg->hdr.data_len)) { + pr_err("sum of lens %u != %u for tid %llu\n", data_len, + le32_to_cpu(msg->hdr.data_len), req->r_tid); + goto fail_request; + } + dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, + req, req->r_tid, req->r_got_reply, m.result, data_len); + + already_acked = req->r_got_reply; + if (!already_acked) { + req->r_result = m.result ?: data_len; + req->r_replay_version = m.replay_version; /* struct */ + req->r_got_reply = true; + } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { + dout("req %p tid %llu dup ack\n", req, req->r_tid); + goto out_put; + } + + if (done_request(req, &m)) { __unregister_request(osdc, req); + if (req->r_linger) { + WARN_ON(req->r_unsafe_callback); + __register_linger_request(osdc, req); + } + } mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); - if (!already_completed) { - if (req->r_unsafe_callback && - result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) - req->r_unsafe_callback(req, true); - if (req->r_callback) - req->r_callback(req); - else - complete_all(&req->r_completion); - } - - if (flags & CEPH_OSD_FLAG_ONDISK) { - if (req->r_unsafe_callback && already_completed) + if (done_request(req, &m)) { + if (already_acked && req->r_unsafe_callback) { + dout("req %p tid %llu safe-cb\n", req, req->r_tid); req->r_unsafe_callback(req, false); - complete_request(req); + } else { + dout("req %p tid %llu cb\n", req, req->r_tid); + __complete_request(req); + } + } else { + if (req->r_unsafe_callback) { + dout("req %p tid %llu unsafe-cb\n", req, req->r_tid); + req->r_unsafe_callback(req, true); + } else { + WARN_ON(1); + } } + if (m.flags & CEPH_OSD_FLAG_ONDISK) + complete_all(&req->r_safe_completion); -out: - dout("req=%p req->r_linger=%d\n", req, req->r_linger); ceph_osdc_put_request(req); return; -out_unlock: - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); - goto out; -bad_put: 
+fail_request: req->r_result = -EIO; __unregister_request(osdc, req); - if (req->r_callback) - req->r_callback(req); - else - complete_all(&req->r_completion); - complete_request(req); + __complete_request(req); + complete_all(&req->r_safe_completion); +out_put: ceph_osdc_put_request(req); -bad_mutex: +out_unlock: mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); -bad: - pr_err("corrupt osd_op_reply got %d %d\n", - (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); - ceph_msg_dump(msg); } static void reset_changed_osds(struct ceph_osd_client *osdc) @@ -2591,7 +2645,9 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, if (rc < 0) { dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); ceph_osdc_cancel_request(req); - complete_request(req); + + /* kludge - need to to wake ceph_osdc_sync() */ + complete_all(&req->r_safe_completion); return rc; } -- cgit v1.2.3 From e5253a7bde13788d9dc75f42eb47ea119af5609f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: libceph: allocate dummy osdmap in ceph_osdc_init() This leads to a simpler osdmap handling code, particularly when dealing with pi->was_full, which is introduced in a later commit. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 1 + net/ceph/osd_client.c | 22 +++++++++++----------- net/ceph/osdmap.c | 23 ++++++++++++++++++----- 3 files changed, 30 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 420bb7968b25..8468c734d712 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -225,6 +225,7 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) return 0; } +struct ceph_osdmap *ceph_osdmap_alloc(void); extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 41dabce9c9c3..9c35fd84a410 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2255,7 +2255,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) struct ceph_fsid fsid; bool was_full; - dout("handle_map have %u\n", osdc->osdmap ? 
osdc->osdmap->epoch : 0); + dout("handle_map have %u\n", osdc->osdmap->epoch); p = msg->front.iov_base; end = p + msg->front.iov_len; @@ -2278,7 +2278,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) maplen = ceph_decode_32(&p); ceph_decode_need(&p, end, maplen, bad); next = p + maplen; - if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { + if (osdc->osdmap->epoch+1 == epoch) { dout("applying incremental map %u len %d\n", epoch, maplen); newmap = osdmap_apply_incremental(&p, next, @@ -2317,7 +2317,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) if (nr_maps > 1) { dout("skipping non-latest full map %u len %d\n", epoch, maplen); - } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { + } else if (osdc->osdmap->epoch >= epoch) { dout("skipping full map %u len %d, " "older than our %u\n", epoch, maplen, osdc->osdmap->epoch); @@ -2347,8 +2347,6 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) nr_maps--; } - if (!osdc->osdmap) - goto bad; done: downgrade_write(&osdc->map_sem); ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, @@ -2690,7 +2688,6 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) dout("init\n"); osdc->client = client; - osdc->osdmap = NULL; init_rwsem(&osdc->map_sem); mutex_init(&osdc->request_mutex); osdc->last_tid = 0; @@ -2709,10 +2706,14 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->event_count = 0; err = -ENOMEM; + osdc->osdmap = ceph_osdmap_alloc(); + if (!osdc->osdmap) + goto out; + osdc->req_mempool = mempool_create_slab_pool(10, ceph_osd_request_cache); if (!osdc->req_mempool) - goto out; + goto out_map; err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, PAGE_SIZE, 10, true, "osd_op"); @@ -2741,6 +2742,8 @@ out_msgpool: ceph_msgpool_destroy(&osdc->msgpool_op); out_mempool: mempool_destroy(osdc->req_mempool); +out_map: + ceph_osdmap_destroy(osdc->osdmap); out: return err; } @@ -2760,10 +2763,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) } mutex_unlock(&osdc->request_mutex); - if (osdc->osdmap) { - ceph_osdmap_destroy(osdc->osdmap); - osdc->osdmap = NULL; - } + ceph_osdmap_destroy(osdc->osdmap); mempool_destroy(osdc->req_mempool); ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d4a5b43085e..cde52e94732f 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -707,6 +707,23 @@ bad: /* * osd map */ +struct ceph_osdmap *ceph_osdmap_alloc(void) +{ + struct ceph_osdmap *map; + + map = kzalloc(sizeof(*map), GFP_NOIO); + if (!map) + return NULL; + + map->pg_pools = RB_ROOT; + map->pool_max = -1; + map->pg_temp = RB_ROOT; + map->primary_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); + + return map; +} + void ceph_osdmap_destroy(struct ceph_osdmap *map) { dout("osdmap_destroy %p\n", map); @@ -1230,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) struct ceph_osdmap *map; int ret; - map = kzalloc(sizeof(*map), GFP_NOFS); + map = ceph_osdmap_alloc(); if (!map) return ERR_PTR(-ENOMEM); - map->pg_temp = RB_ROOT; - map->primary_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); - ret = osdmap_decode(p, end, map); if (ret) { ceph_osdmap_destroy(map); -- cgit v1.2.3 From 42c1b1240326cbea86f15f5d4ce565d8b54be31f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: libceph: handle_one_map() Separate osdmap handling 
from decoding and iterating over a bag of maps in a fresh MOSDMap message. This sets up the scene for the updated OSD client. Of particular importance here is the addition of pi->was_full, which can be used to answer "did this pool go full -> not-full in this map?". This is the key bit for supporting pool quotas. We won't be able to downgrade map_sem for much longer, so drop downgrade_write(). Signed-off-by: Ilya Dryomov --- include/linux/ceph/mon_client.h | 1 + include/linux/ceph/osdmap.h | 2 + net/ceph/mon_client.c | 8 ++ net/ceph/osd_client.c | 186 ++++++++++++++++++++++++++++------------ 4 files changed, 141 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 330d045e4092..c14e9d861cda 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -115,6 +115,7 @@ extern const char *ceph_sub_str[]; bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, bool continuous); void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); +void ceph_monc_renew_subs(struct ceph_mon_client *monc); extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 8468c734d712..821e16fff39a 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -45,6 +45,8 @@ struct ceph_pg_pool_info { s64 write_tier; /* wins for read+write ops */ u64 flags; /* CEPH_POOL_FLAG_* */ char *name; + + bool was_full; /* for handle_one_map() */ }; static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index a426a4b03e75..98bfbe1f6807 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -376,6 +376,14 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch) } EXPORT_SYMBOL(ceph_monc_got_map); +void ceph_monc_renew_subs(struct ceph_mon_client *monc) +{ + mutex_lock(&monc->mutex); + __send_subscribe(monc); + mutex_unlock(&monc->mutex); +} +EXPORT_SYMBOL(ceph_monc_renew_subs); + /* * Register interest in the next osdmap */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9c35fd84a410..4227c55226c3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1245,6 +1245,21 @@ static bool __pool_full(struct ceph_pg_pool_info *pi) return pi->flags & CEPH_POOL_FLAG_FULL; } +static bool have_pool_full(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + + for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pi = + rb_entry(n, struct ceph_pg_pool_info, node); + + if (__pool_full(pi)) + return true; + } + + return false; +} + /* * Returns whether a request should be blocked from being sent * based on the current osdmap and osd_client settings. 
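The was_full flag added to ceph_pg_pool_info exists to answer "did this
pool just clear its FULL flag?". A hedged sketch of the predicate a
pool-quota consumer could layer on top of __pool_full()
(pool_cleared_full is an assumed name, not introduced by this patch):

	/* hypothetical helper, for illustration only */
	static bool pool_cleared_full(struct ceph_pg_pool_info *pi)
	{
		/* was_full is snapshotted before the new map is applied */
		return pi->was_full && !__pool_full(pi);
	}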
@@ -1639,6 +1654,26 @@ static void __send_queued(struct ceph_osd_client *osdc) } } +static void maybe_request_map(struct ceph_osd_client *osdc) +{ + bool continuous = false; + + WARN_ON(!osdc->osdmap->epoch); + + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { + dout("%s osdc %p continuous\n", __func__, osdc); + continuous = true; + } else { + dout("%s osdc %p onetime\n", __func__, osdc); + } + + if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP, + osdc->osdmap->epoch + 1, continuous)) + ceph_monc_renew_subs(&osdc->client->monc); +} + /* * Caller should hold map_sem for read and request_mutex. */ @@ -2119,6 +2154,18 @@ out_unlock: up_read(&osdc->map_sem); } +static void set_pool_was_full(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + + for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pi = + rb_entry(n, struct ceph_pg_pool_info, node); + + pi->was_full = __pool_full(pi); + } +} + static void reset_changed_osds(struct ceph_osd_client *osdc) { struct rb_node *p, *n; @@ -2237,6 +2284,57 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, } } +static int handle_one_map(struct ceph_osd_client *osdc, + void *p, void *end, bool incremental) +{ + struct ceph_osdmap *newmap; + struct rb_node *n; + bool skipped_map = false; + bool was_full; + + was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + set_pool_was_full(osdc); + + if (incremental) + newmap = osdmap_apply_incremental(&p, end, osdc->osdmap); + else + newmap = ceph_osdmap_decode(&p, end); + if (IS_ERR(newmap)) + return PTR_ERR(newmap); + + if (newmap != osdc->osdmap) { + /* + * Preserve ->was_full before destroying the old map. + * For pools that weren't in the old map, ->was_full + * should be false. + */ + for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pi = + rb_entry(n, struct ceph_pg_pool_info, node); + struct ceph_pg_pool_info *old_pi; + + old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id); + if (old_pi) + pi->was_full = old_pi->was_full; + else + WARN_ON(pi->was_full); + } + + if (osdc->osdmap->epoch && + osdc->osdmap->epoch + 1 < newmap->epoch) { + WARN_ON(incremental); + skipped_map = true; + } + + ceph_osdmap_destroy(osdc->osdmap); + osdc->osdmap = newmap; + } + + was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + kick_requests(osdc, skipped_map, was_full); + + return 0; +} /* * Process updated osd map. 
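 * Each map in the message goes through handle_one_map(): incrementals
 * are applied in order, and of the full maps only the newest is taken.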
@@ -2247,27 +2345,29 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, */ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) { - void *p, *end, *next; + void *p = msg->front.iov_base; + void *const end = p + msg->front.iov_len; u32 nr_maps, maplen; u32 epoch; - struct ceph_osdmap *newmap = NULL, *oldmap; - int err; struct ceph_fsid fsid; - bool was_full; + bool handled_incremental = false; + bool was_pauserd, was_pausewr; + bool pauserd, pausewr; + int err; - dout("handle_map have %u\n", osdc->osdmap->epoch); - p = msg->front.iov_base; - end = p + msg->front.iov_len; + dout("%s have %u\n", __func__, osdc->osdmap->epoch); + down_write(&osdc->map_sem); /* verify fsid */ ceph_decode_need(&p, end, sizeof(fsid), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); if (ceph_check_fsid(osdc->client, &fsid) < 0) - return; - - down_write(&osdc->map_sem); + goto bad; - was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + have_pool_full(osdc); /* incremental maps */ ceph_decode_32_safe(&p, end, nr_maps, bad); @@ -2277,33 +2377,22 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); ceph_decode_need(&p, end, maplen, bad); - next = p + maplen; - if (osdc->osdmap->epoch+1 == epoch) { + if (osdc->osdmap->epoch && + osdc->osdmap->epoch + 1 == epoch) { dout("applying incremental map %u len %d\n", epoch, maplen); - newmap = osdmap_apply_incremental(&p, next, - osdc->osdmap); - if (IS_ERR(newmap)) { - err = PTR_ERR(newmap); + err = handle_one_map(osdc, p, p + maplen, true); + if (err) goto bad; - } - BUG_ON(!newmap); - if (newmap != osdc->osdmap) { - ceph_osdmap_destroy(osdc->osdmap); - osdc->osdmap = newmap; - } - was_full = was_full || - ceph_osdmap_flag(osdc->osdmap, - CEPH_OSDMAP_FULL); - kick_requests(osdc, 0, was_full); + handled_incremental = true; } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); } - p = next; + p += maplen; nr_maps--; } - if (newmap) + if (handled_incremental) goto done; /* full maps */ @@ -2322,50 +2411,35 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) "older than our %u\n", epoch, maplen, osdc->osdmap->epoch); } else { - int skipped_map = 0; - dout("taking full map %u len %d\n", epoch, maplen); - newmap = ceph_osdmap_decode(&p, p+maplen); - if (IS_ERR(newmap)) { - err = PTR_ERR(newmap); + err = handle_one_map(osdc, p, p + maplen, false); + if (err) goto bad; - } - BUG_ON(!newmap); - oldmap = osdc->osdmap; - osdc->osdmap = newmap; - if (oldmap) { - if (oldmap->epoch + 1 < newmap->epoch) - skipped_map = 1; - ceph_osdmap_destroy(oldmap); - } - was_full = was_full || - ceph_osdmap_flag(osdc->osdmap, - CEPH_OSDMAP_FULL); - kick_requests(osdc, skipped_map, was_full); } p += maplen; nr_maps--; } done: - downgrade_write(&osdc->map_sem); - ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, - osdc->osdmap->epoch); - /* * subscribe to subsequent osdmap updates if full to ensure * we find out when we are no longer full and stop returning * ENOSPC. 
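 * The same goes for PAUSERD/PAUSEWR: those flags are likewise only
 * cleared by a newer map, so the subscription is kept continuous
 * while any of them is set.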
*/ - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) - ceph_monc_request_next_osdmap(&osdc->client->monc); + pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + have_pool_full(osdc); + if (was_pauserd || was_pausewr || pauserd || pausewr) + maybe_request_map(osdc); mutex_lock(&osdc->request_mutex); __send_queued(osdc); mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + + ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, + osdc->osdmap->epoch); + up_write(&osdc->map_sem); wake_up_all(&osdc->client->auth_wq); return; -- cgit v1.2.3 From 9dd2845ccb40452d4ac943231ea34aade4a02c68 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:26 +0200 Subject: libceph: protect osdc->osd_lru list with a spinlock OSD client is getting moved from the big per-client lock to a set of per-session locks. The big rwlock would only be held for read most of the time, so a global osdc->osd_lru needs additional protection. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 29 ++++++++++++++++++----------- 2 files changed, 19 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2415dc0cb008..486d681694c4 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -224,6 +224,7 @@ struct ceph_osd_client { struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ + spinlock_t osd_lru_lock; u64 last_tid; /* tid of last request */ struct rb_root requests; /* pending requests */ struct list_head req_lru; /* in-flight lru */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b6950c2c6cc4..d1c8e06f1261 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1101,31 +1101,37 @@ static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) } } -static void __move_osd_to_lru(struct ceph_osd_client *osdc, - struct ceph_osd *osd) +static void __move_osd_to_lru(struct ceph_osd *osd) { - dout("%s %p\n", __func__, osd); + struct ceph_osd_client *osdc = osd->o_osdc; + + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); BUG_ON(!list_empty(&osd->o_osd_lru)); + spin_lock(&osdc->osd_lru_lock); list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); + spin_unlock(&osdc->osd_lru_lock); + osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; } -static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, - struct ceph_osd *osd) +static void maybe_move_osd_to_lru(struct ceph_osd *osd) { - dout("%s %p\n", __func__, osd); - if (list_empty(&osd->o_requests) && list_empty(&osd->o_linger_requests)) - __move_osd_to_lru(osdc, osd); + __move_osd_to_lru(osd); } static void __remove_osd_from_lru(struct ceph_osd *osd) { - dout("__remove_osd_from_lru %p\n", osd); + struct ceph_osd_client *osdc = osd->o_osdc; + + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + spin_lock(&osdc->osd_lru_lock); if (!list_empty(&osd->o_osd_lru)) list_del_init(&osd->o_osd_lru); + spin_unlock(&osdc->osd_lru_lock); } /* @@ -1199,7 +1205,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, ceph_msg_revoke(req->r_request); list_del_init(&req->r_osd_item); - maybe_move_osd_to_lru(osdc, req->r_osd); + 
maybe_move_osd_to_lru(req->r_osd); if (list_empty(&req->r_linger_osd_item)) req->r_osd = NULL; } @@ -1248,7 +1254,7 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, if (req->r_osd) { list_del_init(&req->r_linger_osd_item); - maybe_move_osd_to_lru(osdc, req->r_osd); + maybe_move_osd_to_lru(req->r_osd); if (list_empty(&req->r_osd_item)) req->r_osd = NULL; } @@ -2792,6 +2798,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->last_tid = 0; osdc->osds = RB_ROOT; INIT_LIST_HEAD(&osdc->osd_lru); + spin_lock_init(&osdc->osd_lru_lock); osdc->requests = RB_ROOT; INIT_LIST_HEAD(&osdc->req_lru); INIT_LIST_HEAD(&osdc->req_unsent); -- cgit v1.2.3 From 5aea3dcd50215fa9563270251ad7323e2f2490ee Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:26 +0200 Subject: libceph: a major OSD client update This is a major sync up, up to ~Jewel. The highlights are: - per-session request trees (vs a global per-client tree) - per-session locking (vs a global per-client rwlock) - homeless OSD session - no ad-hoc global per-client lists - support for pool quotas - foundation for watch/notify v2 support - foundation for map check (pool deletion detection) support The switchover is incomplete: lingering requests can be setup and teared down but aren't ever reestablished. This functionality is restored with the introduction of the new lingering infrastructure (ceph_osd_linger_request, linger_work, etc) in a later commit. Signed-off-by: Ilya Dryomov --- fs/ceph/ioctl.c | 8 +- fs/ceph/xattr.c | 8 +- include/linux/ceph/osd_client.h | 18 +- net/ceph/debugfs.c | 34 +- net/ceph/osd_client.c | 1164 +++++++++++++++++++-------------------- 5 files changed, 602 insertions(+), 630 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 1831ad6cf066..be6b1657b1af 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; - down_read(&osdc->map_sem); + down_read(&osdc->lock); r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return -EIO; } dl.file_offset -= dl.object_offset; @@ -217,7 +217,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return r; } @@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) } else { memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); } - up_read(&osdc->map_sem); + up_read(&osdc->lock); /* send result back to user */ if (copy_to_user(arg, &dl, sizeof(dl))) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 9410abdef3ce..5afabc4bf4c7 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -75,7 +75,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, char buf[128]; dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { size_t len = strlen(pool_name); @@ -107,7 +107,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, ret = -ERANGE; } } - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } @@ -141,13 +141,13 @@ static size_t ceph_vxattrcb_layout_pool(struct 
ceph_inode_info *ci, s64 pool = ceph_file_layout_pg_pool(ci->i_layout); const char *pool_name; - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) ret = snprintf(val, size, "%s", pool_name); else ret = snprintf(val, size, "%lld", (unsigned long long)pool); - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 486d681694c4..342f22f1f040 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -33,12 +33,13 @@ struct ceph_osd { int o_incarnation; struct rb_node o_node; struct ceph_connection o_con; - struct list_head o_requests; + struct rb_root o_requests; struct list_head o_linger_requests; struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; struct list_head o_keepalive_item; + struct mutex lock; }; #define CEPH_OSD_SLAB_OPS 2 @@ -144,8 +145,6 @@ struct ceph_osd_request_target { struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; - struct list_head r_req_lru_item; - struct list_head r_osd_item; struct list_head r_linger_item; struct list_head r_linger_osd_item; struct ceph_osd *r_osd; @@ -219,19 +218,16 @@ struct ceph_osd_client { struct ceph_client *client; struct ceph_osdmap *osdmap; /* current map */ - struct rw_semaphore map_sem; + struct rw_semaphore lock; - struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ spinlock_t osd_lru_lock; - u64 last_tid; /* tid of last request */ - struct rb_root requests; /* pending requests */ - struct list_head req_lru; /* in-flight lru */ - struct list_head req_unsent; /* unsent/need-resend queue */ - struct list_head req_notarget; /* map to no osd */ struct list_head req_linger; /* lingering requests */ - int num_requests; + struct ceph_osd homeless_osd; + atomic64_t last_tid; /* tid of last request */ + atomic_t num_requests; + atomic_t num_homeless; struct delayed_work timeout_work; struct delayed_work osds_timeout_work; #ifdef CONFIG_DEBUG_FS diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 6d3ff713edeb..61dbd9de4650 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -182,21 +182,39 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req) seq_putc(s, '\n'); } +static void dump_requests(struct seq_file *s, struct ceph_osd *osd) +{ + struct rb_node *n; + + mutex_lock(&osd->lock); + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + dump_request(s, req); + } + + mutex_unlock(&osd->lock); +} + static int osdc_show(struct seq_file *s, void *pp) { struct ceph_client *client = s->private; struct ceph_osd_client *osdc = &client->osdc; - struct rb_node *p; - - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - struct ceph_osd_request *req; + struct rb_node *n; - req = rb_entry(p, struct ceph_osd_request, r_node); + down_read(&osdc->lock); + seq_printf(s, "REQUESTS %d homeless %d\n", + atomic_read(&osdc->num_requests), + atomic_read(&osdc->num_homeless)); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); - dump_request(s, req); + dump_requests(s, osd); } - mutex_unlock(&osdc->request_mutex); + dump_requests(s, &osdc->homeless_osd); + + up_read(&osdc->lock); return 0; } diff --git a/net/ceph/osd_client.c 
b/net/ceph/osd_client.c index d1c8e06f1261..4c856c87b1a9 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -25,16 +25,6 @@ static struct kmem_cache *ceph_osd_request_cache; static const struct ceph_connection_operations osd_con_ops; -static void __send_queued(struct ceph_osd_client *osdc); -static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); -static void __register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); -static void __unregister_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); -static void __unregister_linger_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); -static void __enqueue_request(struct ceph_osd_request *req); - /* * Implement client access to distributed object storage cluster. * @@ -53,6 +43,43 @@ static void __enqueue_request(struct ceph_osd_request *req); * channel with an OSD is reset. */ +static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req); +static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req); + +#if 1 +static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem) +{ + bool wrlocked = true; + + if (unlikely(down_read_trylock(sem))) { + wrlocked = false; + up_read(sem); + } + + return wrlocked; +} +static inline void verify_osdc_locked(struct ceph_osd_client *osdc) +{ + WARN_ON(!rwsem_is_locked(&osdc->lock)); +} +static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) +{ + WARN_ON(!rwsem_is_wrlocked(&osdc->lock)); +} +static inline void verify_osd_locked(struct ceph_osd *osd) +{ + struct ceph_osd_client *osdc = osd->o_osdc; + + WARN_ON(!(mutex_is_locked(&osd->lock) && + rwsem_is_locked(&osdc->lock)) && + !rwsem_is_wrlocked(&osdc->lock)); +} +#else +static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { } +static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { } +static inline void verify_osd_locked(struct ceph_osd *osd) { } +#endif + /* * calculate the mapping of a file extent onto an object, and fill out the * request accordingly. 
shorten extent as necessary if it crosses an @@ -336,18 +363,14 @@ static void ceph_osdc_release_request(struct kref *kref) dout("%s %p (r_request %p r_reply %p)\n", __func__, req, req->r_request, req->r_reply); WARN_ON(!RB_EMPTY_NODE(&req->r_node)); - WARN_ON(!list_empty(&req->r_req_lru_item)); - WARN_ON(!list_empty(&req->r_osd_item)); WARN_ON(!list_empty(&req->r_linger_item)); WARN_ON(!list_empty(&req->r_linger_osd_item)); WARN_ON(req->r_osd); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_reply) { - ceph_msg_revoke_incoming(req->r_reply); + if (req->r_reply) ceph_msg_put(req->r_reply); - } for (which = 0; which < req->r_num_ops; which++) osd_req_op_data_release(req, which); @@ -418,8 +441,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_osd_item); - INIT_LIST_HEAD(&req->r_req_lru_item); - INIT_LIST_HEAD(&req->r_osd_item); target_init(&req->r_t); @@ -869,141 +890,11 @@ static bool osd_homeless(struct ceph_osd *osd) return osd->o_osd == CEPH_HOMELESS_OSD; } -static struct ceph_osd_request * -__lookup_request_ge(struct ceph_osd_client *osdc, - u64 tid) -{ - struct ceph_osd_request *req; - struct rb_node *n = osdc->requests.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_osd_request, r_node); - if (tid < req->r_tid) { - if (!n->rb_left) - return req; - n = n->rb_left; - } else if (tid > req->r_tid) { - n = n->rb_right; - } else { - return req; - } - } - return NULL; -} - -static void __kick_linger_request(struct ceph_osd_request *req) -{ - struct ceph_osd_client *osdc = req->r_osdc; - struct ceph_osd *osd = req->r_osd; - - /* - * Linger requests need to be resent with a new tid to avoid - * the dup op detection logic on the OSDs. Achieve this with - * a re-register dance instead of open-coding. - */ - ceph_osdc_get_request(req); - if (!list_empty(&req->r_linger_item)) - __unregister_linger_request(osdc, req); - else - __unregister_request(osdc, req); - __register_request(osdc, req); - ceph_osdc_put_request(req); - - /* - * Unless request has been registered as both normal and - * lingering, __unregister{,_linger}_request clears r_osd. - * However, here we need to preserve r_osd to make sure we - * requeue on the same OSD. - */ - WARN_ON(req->r_osd || !osd); - req->r_osd = osd; - - dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid); - __enqueue_request(req); -} - -/* - * Resubmit requests pending on the given osd. - */ -static void __kick_osd_requests(struct ceph_osd_client *osdc, - struct ceph_osd *osd) -{ - struct ceph_osd_request *req, *nreq; - LIST_HEAD(resend); - LIST_HEAD(resend_linger); - int err; - - dout("%s osd%d\n", __func__, osd->o_osd); - err = __reset_osd(osdc, osd); - if (err) - return; - - /* - * Build up a list of requests to resend by traversing the - * osd's list of requests. Requests for a given object are - * sent in tid order, and that is also the order they're - * kept on this list. Therefore all requests that are in - * flight will be found first, followed by all requests that - * have not yet been sent. And to resend requests while - * preserving this order we will want to put any sent - * requests back on the front of the osd client's unsent - * list. - * - * So we build a separate ordered list of already-sent - * requests for the affected osd and splice it onto the - * front of the osd client's unsent list. Once we've seen a - * request that has not yet been sent we're done. 
Those - * requests are already sitting right where they belong. - */ - list_for_each_entry(req, &osd->o_requests, r_osd_item) { - if (!req->r_sent) - break; - - if (!req->r_linger) { - dout("%s requeueing %p tid %llu\n", __func__, req, - req->r_tid); - list_move_tail(&req->r_req_lru_item, &resend); - req->r_flags |= CEPH_OSD_FLAG_RETRY; - } else { - list_move_tail(&req->r_req_lru_item, &resend_linger); - } - } - list_splice(&resend, &osdc->req_unsent); - - /* - * Both registered and not yet registered linger requests are - * enqueued with a new tid on the same OSD. We add/move them - * to req_unsent/o_requests at the end to keep things in tid - * order. - */ - list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, - r_linger_osd_item) { - WARN_ON(!list_empty(&req->r_req_lru_item)); - __kick_linger_request(req); - } - - list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item) - __kick_linger_request(req); -} - -/* - * If the osd connection drops, we need to resubmit all requests. - */ -static void osd_reset(struct ceph_connection *con) +static bool osd_registered(struct ceph_osd *osd) { - struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; + verify_osdc_locked(osd->o_osdc); - if (!osd) - return; - dout("osd_reset osd%d\n", osd->o_osd); - osdc = osd->o_osdc; - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - __kick_osd_requests(osdc, osd); - __send_queued(osdc); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + return !RB_EMPTY_NODE(&osd->o_node); } /* @@ -1013,17 +904,18 @@ static void osd_init(struct ceph_osd *osd) { atomic_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); - INIT_LIST_HEAD(&osd->o_requests); + osd->o_requests = RB_ROOT; INIT_LIST_HEAD(&osd->o_linger_requests); INIT_LIST_HEAD(&osd->o_osd_lru); INIT_LIST_HEAD(&osd->o_keepalive_item); osd->o_incarnation = 1; + mutex_init(&osd->lock); } static void osd_cleanup(struct ceph_osd *osd) { WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); - WARN_ON(!list_empty(&osd->o_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); WARN_ON(!list_empty(&osd->o_linger_requests)); WARN_ON(!list_empty(&osd->o_osd_lru)); WARN_ON(!list_empty(&osd->o_keepalive_item)); @@ -1077,30 +969,6 @@ static void put_osd(struct ceph_osd *osd) DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node) -/* - * remove an osd from our map - */ -static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) -{ - dout("%s %p osd%d\n", __func__, osd, osd->o_osd); - WARN_ON(!list_empty(&osd->o_requests)); - WARN_ON(!list_empty(&osd->o_linger_requests)); - - list_del_init(&osd->o_osd_lru); - erase_osd(&osdc->osds, osd); -} - -static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) -{ - dout("%s %p osd%d\n", __func__, osd, osd->o_osd); - - if (!RB_EMPTY_NODE(&osd->o_node)) { - ceph_con_close(&osd->o_con); - __remove_osd(osdc, osd); - put_osd(osd); - } -} - static void __move_osd_to_lru(struct ceph_osd *osd) { struct ceph_osd_client *osdc = osd->o_osdc; @@ -1117,7 +985,7 @@ static void __move_osd_to_lru(struct ceph_osd *osd) static void maybe_move_osd_to_lru(struct ceph_osd *osd) { - if (list_empty(&osd->o_requests) && + if (RB_EMPTY_ROOT(&osd->o_requests) && list_empty(&osd->o_linger_requests)) __move_osd_to_lru(osd); } @@ -1134,30 +1002,64 @@ static void __remove_osd_from_lru(struct ceph_osd *osd) spin_unlock(&osdc->osd_lru_lock); } +/* + * Close the connection and assign any leftover requests to the + * homeless session. 
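+ * The homeless session simply parks requests that currently map to
+ * no usable OSD; num_homeless tracks how many are waiting there.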
+ */ +static void close_osd(struct ceph_osd *osd) +{ + struct ceph_osd_client *osdc = osd->o_osdc; + struct rb_node *n; + + verify_osdc_wrlocked(osdc); + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + ceph_con_close(&osd->o_con); + + for (n = rb_first(&osd->o_requests); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + n = rb_next(n); /* unlink_request() */ + + dout(" reassigning req %p tid %llu\n", req, req->r_tid); + unlink_request(osd, req); + link_request(&osdc->homeless_osd, req); + } + + __remove_osd_from_lru(osd); + erase_osd(&osdc->osds, osd); + put_osd(osd); +} + /* * reset osd connect */ -static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +static int reopen_osd(struct ceph_osd *osd) { struct ceph_entity_addr *peer_addr; - dout("__reset_osd %p osd%d\n", osd, osd->o_osd); - if (list_empty(&osd->o_requests) && + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + if (RB_EMPTY_ROOT(&osd->o_requests) && list_empty(&osd->o_linger_requests)) { - remove_osd(osdc, osd); + close_osd(osd); return -ENODEV; } - peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; + peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd]; if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && !ceph_con_opened(&osd->o_con)) { - struct ceph_osd_request *req; + struct rb_node *n; dout("osd addr hasn't changed and connection never opened, " "letting msgr retry\n"); /* touch each r_stamp for handle_timeout()'s benfit */ - list_for_each_entry(req, &osd->o_requests, r_osd_item) + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); req->r_stamp = jiffies; + } return -EAGAIN; } @@ -1169,73 +1071,84 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) return 0; } -/* - * Register request, assign tid. If this is the first request, set up - * the timeout event. - */ -static void __register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - req->r_tid = ++osdc->last_tid; - req->r_request->hdr.tid = cpu_to_le64(req->r_tid); - dout("__register_request %p tid %lld\n", req, req->r_tid); - insert_request(&osdc->requests, req); - ceph_osdc_get_request(req); - osdc->num_requests++; -} - -/* - * called under osdc->request_mutex - */ -static void __unregister_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o, + bool wrlocked) { - if (RB_EMPTY_NODE(&req->r_node)) { - dout("__unregister_request %p tid %lld not registered\n", - req, req->r_tid); - return; - } + struct ceph_osd *osd; - dout("__unregister_request %p tid %lld\n", req, req->r_tid); - erase_request(&osdc->requests, req); - osdc->num_requests--; + if (wrlocked) + verify_osdc_wrlocked(osdc); + else + verify_osdc_locked(osdc); - if (req->r_osd) { - /* make sure the original request isn't in flight. 
*/ - ceph_msg_revoke(req->r_request); + if (o != CEPH_HOMELESS_OSD) + osd = lookup_osd(&osdc->osds, o); + else + osd = &osdc->homeless_osd; + if (!osd) { + if (!wrlocked) + return ERR_PTR(-EAGAIN); - list_del_init(&req->r_osd_item); - maybe_move_osd_to_lru(req->r_osd); - if (list_empty(&req->r_linger_osd_item)) - req->r_osd = NULL; + osd = create_osd(osdc, o); + insert_osd(&osdc->osds, osd); + ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, + &osdc->osdmap->osd_addr[osd->o_osd]); } - list_del_init(&req->r_req_lru_item); - ceph_osdc_put_request(req); + dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd); + return osd; } /* - * Cancel a previously queued request message + * Create request <-> OSD session relation. + * + * @req has to be assigned a tid, @osd may be homeless. */ -static void __cancel_request(struct ceph_osd_request *req) +static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req) { - if (req->r_sent && req->r_osd) { - ceph_msg_revoke(req->r_request); - req->r_sent = 0; - } + verify_osd_locked(osd); + WARN_ON(!req->r_tid || req->r_osd); + dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd, + req, req->r_tid); + + if (!osd_homeless(osd)) + __remove_osd_from_lru(osd); + else + atomic_inc(&osd->o_osdc->num_homeless); + + get_osd(osd); + insert_request(&osd->o_requests, req); + req->r_osd = osd; } -static void __register_linger_request(struct ceph_osd_client *osdc, +static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req) +{ + verify_osd_locked(osd); + WARN_ON(req->r_osd != osd); + dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd, + req, req->r_tid); + + req->r_osd = NULL; + erase_request(&osd->o_requests, req); + put_osd(osd); + + if (!osd_homeless(osd)) + maybe_move_osd_to_lru(osd); + else + atomic_dec(&osd->o_osdc->num_homeless); +} + +static void __register_linger_request(struct ceph_osd *osd, struct ceph_osd_request *req) { dout("%s %p tid %llu\n", __func__, req, req->r_tid); WARN_ON(!req->r_linger); ceph_osdc_get_request(req); - list_add_tail(&req->r_linger_item, &osdc->req_linger); - if (req->r_osd) - list_add_tail(&req->r_linger_osd_item, - &req->r_osd->o_linger_requests); + list_add_tail(&req->r_linger_item, &osd->o_osdc->req_linger); + list_add_tail(&req->r_linger_osd_item, &osd->o_linger_requests); + __remove_osd_from_lru(osd); + req->r_osd = osd; } static void __unregister_linger_request(struct ceph_osd_client *osdc, @@ -1255,7 +1168,7 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, if (req->r_osd) { list_del_init(&req->r_linger_osd_item); maybe_move_osd_to_lru(req->r_osd); - if (list_empty(&req->r_osd_item)) + if (RB_EMPTY_ROOT(&req->r_osd->o_requests)) req->r_osd = NULL; } ceph_osdc_put_request(req); @@ -1291,11 +1204,20 @@ static bool have_pool_full(struct ceph_osd_client *osdc) return false; } +static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id) +{ + struct ceph_pg_pool_info *pi; + + pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); + if (!pi) + return false; + + return __pool_full(pi); +} + /* * Returns whether a request should be blocked from being sent * based on the current osdmap and osd_client settings. - * - * Caller should hold map_sem for read. 
*/ static bool target_should_be_paused(struct ceph_osd_client *osdc, const struct ceph_osd_request_target *t, @@ -1421,87 +1343,6 @@ out: return ct_res; } -static void __enqueue_request(struct ceph_osd_request *req) -{ - struct ceph_osd_client *osdc = req->r_osdc; - - dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - - if (req->r_osd) { - __remove_osd_from_lru(req->r_osd); - list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); - list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); - } else { - list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); - } -} - -/* - * Pick an osd (the first 'up' osd in the pg), allocate the osd struct - * (as needed), and set the request r_osd appropriately. If there is - * no up osd, set r_osd to NULL. Move the request to the appropriate list - * (unsent, homeless) or leave on in-flight lru. - * - * Return 0 if unchanged, 1 if changed, or negative on error. - * - * Caller should hold map_sem for read and request_mutex. - */ -static int __map_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, int force_resend) -{ - enum calc_target_result ct_res; - int err; - - dout("map_request %p tid %lld\n", req, req->r_tid); - - ct_res = calc_target(osdc, &req->r_t, NULL, force_resend); - switch (ct_res) { - case CALC_TARGET_POOL_DNE: - list_move(&req->r_req_lru_item, &osdc->req_notarget); - return -EIO; - case CALC_TARGET_NO_ACTION: - return 0; /* no change */ - default: - BUG_ON(ct_res != CALC_TARGET_NEED_RESEND); - } - - dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", - req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, req->r_t.osd, - req->r_osd ? req->r_osd->o_osd : -1); - - if (req->r_osd) { - __cancel_request(req); - list_del_init(&req->r_osd_item); - list_del_init(&req->r_linger_osd_item); - req->r_osd = NULL; - } - - req->r_osd = lookup_osd(&osdc->osds, req->r_t.osd); - if (!req->r_osd && req->r_t.osd >= 0) { - err = -ENOMEM; - req->r_osd = create_osd(osdc, req->r_t.osd); - if (!req->r_osd) { - list_move(&req->r_req_lru_item, &osdc->req_notarget); - goto out; - } - - dout("map_request osd %p is osd%d\n", req->r_osd, - req->r_osd->o_osd); - insert_osd(&osdc->osds, req->r_osd); - - ceph_con_open(&req->r_osd->o_con, - CEPH_ENTITY_TYPE_OSD, req->r_osd->o_osd, - &osdc->osdmap->osd_addr[req->r_osd->o_osd]); - } - - __enqueue_request(req); - err = 1; /* osd or pg changed */ - -out: - return err; -} - static void setup_request_data(struct ceph_osd_request *req, struct ceph_msg *msg) { @@ -1648,8 +1489,16 @@ static void send_request(struct ceph_osd_request *req) { struct ceph_osd *osd = req->r_osd; + verify_osd_locked(osd); WARN_ON(osd->o_osd != req->r_t.osd); + /* + * We may have a previously queued request message hanging + * around. Cancel it to avoid corrupting the msgr. + */ + if (req->r_sent) + ceph_msg_revoke(req->r_request); + req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR; if (req->r_attempts) req->r_flags |= CEPH_OSD_FLAG_RETRY; @@ -1671,24 +1520,11 @@ static void send_request(struct ceph_osd_request *req) ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request)); } -/* - * Send any requests in the queue (req_unsent). 
- */ -static void __send_queued(struct ceph_osd_client *osdc) -{ - struct ceph_osd_request *req, *tmp; - - dout("__send_queued\n"); - list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { - list_move_tail(&req->r_req_lru_item, &osdc->req_lru); - send_request(req); - } -} - static void maybe_request_map(struct ceph_osd_client *osdc) { bool continuous = false; + verify_osdc_locked(osdc); WARN_ON(!osdc->osdmap->epoch); if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || @@ -1705,38 +1541,121 @@ static void maybe_request_map(struct ceph_osd_client *osdc) ceph_monc_renew_subs(&osdc->client->monc); } -/* - * Caller should hold map_sem for read and request_mutex. - */ -static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail) +static void __submit_request(struct ceph_osd_request *req, bool wrlocked) { - int rc; + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd *osd; + bool need_send = false; + bool promoted = false; - __register_request(osdc, req); - req->r_sent = 0; - req->r_got_reply = 0; - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } else { - __unregister_request(osdc, req); - } - return rc; + WARN_ON(req->r_tid || req->r_got_reply); + dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); + +again: + calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); + osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked); + if (IS_ERR(osd)) { + WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked); + goto promote; } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { + dout("req %p pausewr\n", req); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_READ) && + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { + dout("req %p pauserd\n", req); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY | + CEPH_OSD_FLAG_FULL_FORCE)) && + (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + pool_full(osdc, req->r_t.base_oloc.pool))) { + dout("req %p full/pool_full\n", req); + pr_warn_ratelimited("FULL or reached pool quota\n"); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if (!osd_homeless(osd)) { + need_send = true; } else { - __send_queued(osdc); + maybe_request_map(osdc); } - return 0; + mutex_lock(&osd->lock); + /* + * Assign the tid atomically with send_request() to protect + * multiple writes to the same object from racing with each + * other, resulting in out of order ops on the OSDs. 
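+ * Holding osd->lock across both steps guarantees that tid order
+ * matches send order on this session.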
+ */ + req->r_tid = atomic64_inc_return(&osdc->last_tid); + link_request(osd, req); + if (need_send) + send_request(req); + mutex_unlock(&osd->lock); + + if (promoted) + downgrade_write(&osdc->lock); + return; + +promote: + up_read(&osdc->lock); + down_write(&osdc->lock); + wrlocked = true; + promoted = true; + goto again; +} + +static void account_request(struct ceph_osd_request *req) +{ + unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + + if (req->r_flags & CEPH_OSD_FLAG_READ) { + WARN_ON(req->r_flags & mask); + req->r_flags |= CEPH_OSD_FLAG_ACK; + } else if (req->r_flags & CEPH_OSD_FLAG_WRITE) + WARN_ON(!(req->r_flags & mask)); + else + WARN_ON(1); + + WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); + atomic_inc(&req->r_osdc->num_requests); +} + +static void submit_request(struct ceph_osd_request *req, bool wrlocked) +{ + ceph_osdc_get_request(req); + account_request(req); + __submit_request(req, wrlocked); +} + +static void __finish_request(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd *osd = req->r_osd; + + verify_osd_locked(osd); + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + + unlink_request(osd, req); + atomic_dec(&osdc->num_requests); + + /* + * If an OSD has failed or returned and a request has been sent + * twice, it's possible to get a reply and end up here while the + * request message is queued for delivery. We will ignore the + * reply, so not a big deal, but better to try and catch it. + */ + ceph_msg_revoke(req->r_request); + ceph_msg_revoke_incoming(req->r_reply); +} + +static void finish_request(struct ceph_osd_request *req) +{ + __finish_request(req); + ceph_osdc_put_request(req); } static void __complete_request(struct ceph_osd_request *req) @@ -1747,6 +1666,13 @@ static void __complete_request(struct ceph_osd_request *req) complete_all(&req->r_completion); } +static void cancel_request(struct ceph_osd_request *req) +{ + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + + finish_request(req); +} + /* * Timeout callback, called every N seconds. When 1 or more OSD * requests has been active for more than N seconds, we send a keepalive @@ -1758,44 +1684,49 @@ static void handle_timeout(struct work_struct *work) struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, timeout_work.work); struct ceph_options *opts = osdc->client->options; - struct ceph_osd_request *req; - struct ceph_osd *osd; - struct list_head slow_osds; - dout("timeout\n"); - down_read(&osdc->map_sem); - - ceph_monc_request_next_osdmap(&osdc->client->monc); + unsigned long cutoff = jiffies - opts->osd_keepalive_timeout; + LIST_HEAD(slow_osds); + struct rb_node *n, *p; - mutex_lock(&osdc->request_mutex); + dout("%s osdc %p\n", __func__, osdc); + down_write(&osdc->lock); /* * ping osds that are a bit slow. this ensures that if there * is a break in the TCP connection we will notice, and reopen * a connection with that osd (from the fault callback). 
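 * Lagginess is now judged per session: each OSD's request tree is walked and * any request with r_stamp older than the keepalive cutoff marks that OSD slow.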
*/ - INIT_LIST_HEAD(&slow_osds); - list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { - if (time_before(jiffies, - req->r_stamp + opts->osd_keepalive_timeout)) - break; + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + bool found = false; + + for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + if (time_before(req->r_stamp, cutoff)) { + dout(" req %p tid %llu on osd%d is laggy\n", + req, req->r_tid, osd->o_osd); + found = true; + } + } - osd = req->r_osd; - BUG_ON(!osd); - dout(" tid %llu is slow, will send keepalive on osd%d\n", - req->r_tid, osd->o_osd); - list_move_tail(&osd->o_keepalive_item, &slow_osds); + if (found) + list_move_tail(&osd->o_keepalive_item, &slow_osds); } + + if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds)) + maybe_request_map(osdc); + while (!list_empty(&slow_osds)) { - osd = list_entry(slow_osds.next, struct ceph_osd, - o_keepalive_item); + struct ceph_osd *osd = list_first_entry(&slow_osds, + struct ceph_osd, + o_keepalive_item); list_del_init(&osd->o_keepalive_item); ceph_con_keepalive(&osd->o_con); } - __send_queued(osdc); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); - + up_write(&osdc->lock); schedule_delayed_work(&osdc->timeout_work, osdc->client->options->osd_keepalive_timeout); } @@ -1809,18 +1740,17 @@ static void handle_osds_timeout(struct work_struct *work) struct ceph_osd *osd, *nosd; dout("%s osdc %p\n", __func__, osdc); - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - + down_write(&osdc->lock); list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { if (time_before(jiffies, osd->lru_ttl)) break; - remove_osd(osdc, osd); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); + WARN_ON(!list_empty(&osd->o_linger_requests)); + close_osd(osd); } - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + up_write(&osdc->lock); schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(delay)); } @@ -2045,8 +1975,9 @@ static bool done_request(const struct ceph_osd_request *req, * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, * r_safe_completion r_safe_completion */ -static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) +static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) { + struct ceph_osd_client *osdc = osd->o_osdc; struct ceph_osd_request *req; struct MOSDOpReply m; u64 tid = le64_to_cpu(msg->hdr.tid); @@ -2057,14 +1988,19 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) dout("%s msg %p tid %llu\n", __func__, msg, tid); - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - req = lookup_request(&osdc->requests, tid); + down_read(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown\n", __func__, osd->o_osd); + goto out_unlock_osdc; + } + WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num)); + + mutex_lock(&osd->lock); + req = lookup_request(&osd->o_requests, tid); if (!req) { - dout("%s no tid %llu\n", __func__, tid); - goto out_unlock; + dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid); + goto out_unlock_session; } - ceph_osdc_get_request(req); ret = decode_MOSDOpReply(msg, &m); if (ret) { @@ -2083,7 +2019,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) dout("req %p tid %llu retry_attempt %d != %d, ignoring\n", req, req->r_tid, m.retry_attempt, 
req->r_attempts - 1); - goto out_put; + goto out_unlock_session; } } else { WARN_ON(1); /* MOSDOpReply v4 is assumed */ @@ -2092,22 +2028,14 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) if (!ceph_oloc_empty(&m.redirect.oloc)) { dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid, m.redirect.oloc.pool); - __unregister_request(osdc, req); + unlink_request(osd, req); + mutex_unlock(&osd->lock); ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc); - - /* - * Start redirect requests with nofail=true. If - * mapping fails, request will end up on the notarget - * list, waiting for the new osdmap (which can take - * a while), even though the original request mapped - * successfully. In the future we might want to follow - * original request's nofail setting here. - */ - ret = __ceph_osdc_start_request(osdc, req, true); - BUG_ON(ret); - - goto out_put; + req->r_flags |= CEPH_OSD_FLAG_REDIRECTED; + req->r_tid = 0; + __submit_request(req, false); + goto out_unlock_osdc; } if (m.num_ops != req->r_num_ops) { @@ -2137,19 +2065,19 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) req->r_got_reply = true; } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { dout("req %p tid %llu dup ack\n", req, req->r_tid); - goto out_put; + goto out_unlock_session; } if (done_request(req, &m)) { - __unregister_request(osdc, req); + __finish_request(req); if (req->r_linger) { WARN_ON(req->r_unsafe_callback); - __register_linger_request(osdc, req); + __register_linger_request(osd, req); } } - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + mutex_unlock(&osd->lock); + up_read(&osdc->lock); if (done_request(req, &m)) { if (already_acked && req->r_unsafe_callback) { @@ -2175,14 +2103,13 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) fail_request: req->r_result = -EIO; - __unregister_request(osdc, req); + __finish_request(req); __complete_request(req); complete_all(&req->r_safe_completion); -out_put: - ceph_osdc_put_request(req); -out_unlock: - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); +out_unlock_session: + mutex_unlock(&osd->lock); +out_unlock_osdc: + up_read(&osdc->lock); } static void set_pool_was_full(struct ceph_osd_client *osdc) @@ -2197,126 +2124,66 @@ static void set_pool_was_full(struct ceph_osd_client *osdc) } } -static void reset_changed_osds(struct ceph_osd_client *osdc) +static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id) { - struct rb_node *p, *n; + struct ceph_pg_pool_info *pi; - dout("%s %p\n", __func__, osdc); - for (p = rb_first(&osdc->osds); p; p = n) { - struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); + pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); + if (!pi) + return false; - n = rb_next(p); - if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || - memcmp(&osd->o_con.peer_addr, - ceph_osd_addr(osdc->osdmap, - osd->o_osd), - sizeof(struct ceph_entity_addr)) != 0) - __reset_osd(osdc, osd); - } + return pi->was_full && !__pool_full(pi); } /* - * Requeue requests whose mapping to an OSD has changed. If requests map to - * no osd, request a new map. - * - * Caller should hold map_sem for read. + * Requeue requests whose mapping to an OSD has changed. 
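+ * calc_target() decides: requests whose target moved are unlinked from their + * session and collected in @need_resend, for kick_requests() to relink and + * resend under the new map.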
*/ -static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, - bool force_resend_writes) +static void scan_requests(struct ceph_osd *osd, + bool force_resend, + bool cleared_full, + bool check_pool_cleared_full, + struct rb_root *need_resend, + struct list_head *need_resend_linger) { - struct ceph_osd_request *req, *nreq; - struct rb_node *p; - int needmap = 0; - int err; - bool force_resend_req; - - dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", - force_resend_writes ? " (force resend writes)" : ""); - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; ) { - req = rb_entry(p, struct ceph_osd_request, r_node); - p = rb_next(p); - - /* - * For linger requests that have not yet been - * registered, move them to the linger list; they'll - * be sent to the osd in the loop below. Unregister - * the request before re-registering it as a linger - * request to ensure the __map_request() below - * will decide it needs to be sent. - */ - if (req->r_linger && list_empty(&req->r_linger_item)) { - dout("%p tid %llu restart on osd%d\n", - req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - ceph_osdc_get_request(req); - __unregister_request(osdc, req); - __register_linger_request(osdc, req); - ceph_osdc_put_request(req); - continue; - } - - force_resend_req = force_resend || - (force_resend_writes && - req->r_flags & CEPH_OSD_FLAG_WRITE); - err = __map_request(osdc, req, force_resend_req); - if (err < 0) - continue; /* error */ - if (req->r_osd == NULL) { - dout("%p tid %llu maps to no osd\n", req, req->r_tid); - needmap++; /* request a newer map */ - } else if (err > 0) { - if (!req->r_linger) { - dout("%p tid %llu requeued on osd%d\n", req, - req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - req->r_flags |= CEPH_OSD_FLAG_RETRY; - } - } - } - - list_for_each_entry_safe(req, nreq, &osdc->req_linger, - r_linger_item) { - dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); - - err = __map_request(osdc, req, - force_resend || force_resend_writes); - dout("__map_request returned %d\n", err); - if (err < 0) - continue; /* hrm! */ - if (req->r_osd == NULL || err > 0) { - if (req->r_osd == NULL) { - dout("lingering %p tid %llu maps to no osd\n", - req, req->r_tid); - /* - * A homeless lingering request makes - * no sense, as it's job is to keep - * a particular OSD connection open. - * Request a newer map and kick the - * request, knowing that it won't be - * resent until we actually get a map - * that can tell us where to send it. - */ - needmap++; - } - - dout("kicking lingering %p tid %llu osd%d\n", req, - req->r_tid, req->r_osd ? 
req->r_osd->o_osd : -1); - __register_request(osdc, req); - __unregister_linger_request(osdc, req); + struct ceph_osd_client *osdc = osd->o_osdc; + struct rb_node *n; + bool force_resend_writes; + + for (n = rb_first(&osd->o_requests); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + enum calc_target_result ct_res; + + n = rb_next(n); /* unlink_request() */ + + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + ct_res = calc_target(osdc, &req->r_t, + &req->r_last_force_resend, false); + switch (ct_res) { + case CALC_TARGET_NO_ACTION: + force_resend_writes = cleared_full || + (check_pool_cleared_full && + pool_cleared_full(osdc, req->r_t.base_oloc.pool)); + if (!force_resend && + (!(req->r_flags & CEPH_OSD_FLAG_WRITE) || + !force_resend_writes)) + break; + + /* fall through */ + case CALC_TARGET_NEED_RESEND: + unlink_request(osd, req); + insert_request(need_resend, req); + break; + case CALC_TARGET_POOL_DNE: + break; } } - reset_changed_osds(osdc); - mutex_unlock(&osdc->request_mutex); - - if (needmap) { - dout("%d requests for down osds, need new map\n", needmap); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } } static int handle_one_map(struct ceph_osd_client *osdc, - void *p, void *end, bool incremental) + void *p, void *end, bool incremental, + struct rb_root *need_resend, + struct list_head *need_resend_linger) { struct ceph_osdmap *newmap; struct rb_node *n; @@ -2362,11 +2229,51 @@ static int handle_one_map(struct ceph_osd_client *osdc, } was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); - kick_requests(osdc, skipped_map, was_full); + scan_requests(&osdc->homeless_osd, skipped_map, was_full, true, + need_resend, need_resend_linger); + + for (n = rb_first(&osdc->osds); n; ) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + n = rb_next(n); /* close_osd() */ + + scan_requests(osd, skipped_map, was_full, true, need_resend, + need_resend_linger); + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || + memcmp(&osd->o_con.peer_addr, + ceph_osd_addr(osdc->osdmap, osd->o_osd), + sizeof(struct ceph_entity_addr))) + close_osd(osd); + } return 0; } +static void kick_requests(struct ceph_osd_client *osdc, + struct rb_root *need_resend, + struct list_head *need_resend_linger) +{ + struct rb_node *n; + + for (n = rb_first(need_resend); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + struct ceph_osd *osd; + + n = rb_next(n); + erase_request(need_resend, req); /* before link_request() */ + + WARN_ON(req->r_osd); + calc_target(osdc, &req->r_t, NULL, false); + osd = lookup_create_osd(osdc, req->r_t.osd, true); + link_request(osd, req); + if (!req->r_linger) { + if (!osd_homeless(osd) && !req->r_t.paused) + send_request(req); + } + } +} + /* * Process updated osd map. 
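 * All of this now runs under osdc->lock held for write: handle_one_map() * installs each map and scan_requests() collects the requests whose target * changed; kick_requests() then relinks and resends them.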
* @@ -2381,13 +2288,15 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) u32 nr_maps, maplen; u32 epoch; struct ceph_fsid fsid; + struct rb_root need_resend = RB_ROOT; + LIST_HEAD(need_resend_linger); bool handled_incremental = false; bool was_pauserd, was_pausewr; bool pauserd, pausewr; int err; dout("%s have %u\n", __func__, osdc->osdmap->epoch); - down_write(&osdc->map_sem); + down_write(&osdc->lock); /* verify fsid */ ceph_decode_need(&p, end, sizeof(fsid), bad); @@ -2412,7 +2321,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) osdc->osdmap->epoch + 1 == epoch) { dout("applying incremental map %u len %d\n", epoch, maplen); - err = handle_one_map(osdc, p, p + maplen, true); + err = handle_one_map(osdc, p, p + maplen, true, + &need_resend, &need_resend_linger); if (err) goto bad; handled_incremental = true; @@ -2443,7 +2353,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) osdc->osdmap->epoch); } else { dout("taking full map %u len %d\n", epoch, maplen); - err = handle_one_map(osdc, p, p + maplen, false); + err = handle_one_map(osdc, p, p + maplen, false, + &need_resend, &need_resend_linger); if (err) goto bad; } @@ -2464,20 +2375,60 @@ done: if (was_pauserd || was_pausewr || pauserd || pausewr) maybe_request_map(osdc); - mutex_lock(&osdc->request_mutex); - __send_queued(osdc); - mutex_unlock(&osdc->request_mutex); + kick_requests(osdc, &need_resend, &need_resend_linger); ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, osdc->osdmap->epoch); - up_write(&osdc->map_sem); + up_write(&osdc->lock); wake_up_all(&osdc->client->auth_wq); return; bad: pr_err("osdc handle_map corrupt msg\n"); ceph_msg_dump(msg); - up_write(&osdc->map_sem); + up_write(&osdc->lock); +} + +/* + * Resubmit requests pending on the given osd. + */ +static void kick_osd_requests(struct ceph_osd *osd) +{ + struct rb_node *n; + + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + if (!req->r_linger) { + if (!req->r_t.paused) + send_request(req); + } + } +} + +/* + * If the osd connection drops, we need to resubmit all requests. 
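+ * If reopen_osd() manages to reset the session in place, kick_osd_requests() + * resends everything still linked to it; a session with nothing left to do + * is closed instead.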
+ */ +static void osd_fault(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc = osd->o_osdc; + + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + down_write(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown\n", __func__, osd->o_osd); + goto out_unlock; + } + + if (!reopen_osd(osd)) + kick_osd_requests(osd); + maybe_request_map(osdc); + +out_unlock: + up_write(&osdc->lock); } /* @@ -2680,17 +2631,11 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail) { - int rc; - - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - - rc = __ceph_osdc_start_request(osdc, req, nofail); + down_read(&osdc->lock); + submit_request(req, false); + up_read(&osdc->lock); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); - - return rc; + return 0; } EXPORT_SYMBOL(ceph_osdc_start_request); @@ -2703,13 +2648,12 @@ void ceph_osdc_cancel_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; - mutex_lock(&osdc->request_mutex); + down_write(&osdc->lock); if (req->r_linger) __unregister_linger_request(osdc, req); - __unregister_request(osdc, req); - mutex_unlock(&osdc->request_mutex); - - dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); + if (req->r_osd) + cancel_request(req); + up_write(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_cancel_request); @@ -2744,32 +2688,40 @@ EXPORT_SYMBOL(ceph_osdc_wait_request); */ void ceph_osdc_sync(struct ceph_osd_client *osdc) { - struct ceph_osd_request *req; - u64 last_tid, next_tid = 0; + struct rb_node *n, *p; + u64 last_tid = atomic64_read(&osdc->last_tid); - mutex_lock(&osdc->request_mutex); - last_tid = osdc->last_tid; - while (1) { - req = __lookup_request_ge(osdc, next_tid); - if (!req) - break; - if (req->r_tid > last_tid) - break; +again: + down_read(&osdc->lock); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + mutex_lock(&osd->lock); + for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + if (req->r_tid > last_tid) + break; + + if (!(req->r_flags & CEPH_OSD_FLAG_WRITE)) + continue; - next_tid = req->r_tid + 1; - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) - continue; + ceph_osdc_get_request(req); + mutex_unlock(&osd->lock); + up_read(&osdc->lock); + dout("%s waiting on req %p tid %llu last_tid %llu\n", + __func__, req, req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + ceph_osdc_put_request(req); + goto again; + } - ceph_osdc_get_request(req); - mutex_unlock(&osdc->request_mutex); - dout("sync waiting on tid %llu (last is %llu)\n", - req->r_tid, last_tid); - wait_for_completion(&req->r_safe_completion); - mutex_lock(&osdc->request_mutex); - ceph_osdc_put_request(req); + mutex_unlock(&osd->lock); } - mutex_unlock(&osdc->request_mutex); - dout("sync done (thru tid %llu)\n", last_tid); + + up_read(&osdc->lock); + dout("%s done last_tid %llu\n", __func__, last_tid); } EXPORT_SYMBOL(ceph_osdc_sync); @@ -2793,18 +2745,14 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) dout("init\n"); osdc->client = client; - init_rwsem(&osdc->map_sem); - mutex_init(&osdc->request_mutex); - osdc->last_tid = 0; + init_rwsem(&osdc->lock); osdc->osds = RB_ROOT; INIT_LIST_HEAD(&osdc->osd_lru); spin_lock_init(&osdc->osd_lru_lock); - osdc->requests = RB_ROOT; - INIT_LIST_HEAD(&osdc->req_lru); - 
INIT_LIST_HEAD(&osdc->req_unsent); - INIT_LIST_HEAD(&osdc->req_notarget); INIT_LIST_HEAD(&osdc->req_linger); - osdc->num_requests = 0; + osd_init(&osdc->homeless_osd); + osdc->homeless_osd.o_osdc = osdc; + osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); spin_lock_init(&osdc->event_lock); @@ -2861,13 +2809,19 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); - mutex_lock(&osdc->request_mutex); + down_write(&osdc->lock); while (!RB_EMPTY_ROOT(&osdc->osds)) { struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), struct ceph_osd, o_node); - remove_osd(osdc, osd); + close_osd(osd); } - mutex_unlock(&osdc->request_mutex); + up_write(&osdc->lock); + WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1); + osd_cleanup(&osdc->homeless_osd); + + WARN_ON(!list_empty(&osdc->osd_lru)); + WARN_ON(atomic_read(&osdc->num_requests)); + WARN_ON(atomic_read(&osdc->num_homeless)); ceph_osdmap_destroy(osdc->osdmap); mempool_destroy(osdc->req_mempool); @@ -2982,19 +2936,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup); static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; + struct ceph_osd_client *osdc = osd->o_osdc; int type = le16_to_cpu(msg->hdr.type); - if (!osd) - goto out; - osdc = osd->o_osdc; - switch (type) { case CEPH_MSG_OSD_MAP: ceph_osdc_handle_map(osdc, msg); break; case CEPH_MSG_OSD_OPREPLY: - handle_reply(osdc, msg); + handle_reply(osd, msg); break; case CEPH_MSG_WATCH_NOTIFY: handle_watch_notify(osdc, msg); @@ -3004,7 +2954,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } -out: + ceph_msg_put(msg); } @@ -3019,21 +2969,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; - struct ceph_msg *m; + struct ceph_msg *m = NULL; struct ceph_osd_request *req; int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); - u64 tid; + u64 tid = le64_to_cpu(hdr->tid); - tid = le64_to_cpu(hdr->tid); - mutex_lock(&osdc->request_mutex); - req = lookup_request(&osdc->requests, tid); + down_read(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd); + *skip = 1; + goto out_unlock_osdc; + } + WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num)); + + mutex_lock(&osd->lock); + req = lookup_request(&osd->o_requests, tid); if (!req) { dout("%s osd%d tid %llu unknown, skipping\n", __func__, osd->o_osd, tid); - m = NULL; *skip = 1; - goto out; + goto out_unlock_session; } ceph_msg_revoke_incoming(req->r_reply); @@ -3045,7 +3001,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, false); if (!m) - goto out; + goto out_unlock_session; ceph_msg_put(req->r_reply); req->r_reply = m; } @@ -3056,14 +3012,16 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply->data_length); m = NULL; *skip = 1; - goto out; + goto out_unlock_session; } m = ceph_msg_get(req->r_reply); dout("get_reply tid %lld %p\n", tid, m); -out: - mutex_unlock(&osdc->request_mutex); +out_unlock_session: + mutex_unlock(&osd->lock); +out_unlock_osdc: + up_read(&osdc->lock); return m; } @@ 
-3083,8 +3041,8 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: - pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, - osd->o_osd); + pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__, + osd->o_osd, type); *skip = 1; return NULL; } @@ -3188,5 +3146,5 @@ static const struct ceph_connection_operations osd_con_ops = { .alloc_msg = alloc_msg, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, - .fault = osd_reset, + .fault = osd_fault, }; -- cgit v1.2.3 From 922dab6134178cae317ae00de86376cba59f3147 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 May 2016 01:15:02 +0200 Subject: libceph, rbd: ceph_osd_linger_request, watch/notify v2 This adds support and switches rbd to a new, more reliable version of watch/notify protocol. As with the OSD client update, this is mostly about getting the right structures linked into the right places so that reconnects are properly sent when needed. watch/notify v2 also requires sending regular pings to the OSDs - send_linger_ping(). A major change from the old watch/notify implementation is the introduction of ceph_osd_linger_request - linger requests no longer piggy back on ceph_osd_request. ceph_osd_event has been merged into ceph_osd_linger_request. All the details are now hidden within libceph, the interface consists of a simple pair of watch/unwatch functions and ceph_osdc_notify_ack(). ceph_osdc_watch() does return ceph_osd_linger_request, but only to keep the lifetime management simple. ceph_osdc_notify_ack() accepts an optional data payload, which is relayed back to the notifier. Portions of this patch are loosely based on work by Douglas Fuller and Mike Christie . 
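For reference, a minimal consumer of the new interface looks roughly like this (my_device, my_watch_cb and my_watch_errcb are illustrative names only; the rbd conversion below is the real example):

    static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
                            u64 notifier_id, void *data, size_t data_len)
    {
            struct my_device *dev = arg;

            /* react to the notify, then always ack it */
            ceph_osdc_notify_ack(dev->osdc, &dev->oid, &dev->oloc,
                                 notify_id, cookie, NULL, 0);
    }

    static void my_watch_errcb(void *arg, u64 cookie, int err)
    {
            /* watch is broken - unwatch, rewatch and refresh to recover */
    }

    handle = ceph_osdc_watch(osdc, &oid, &oloc, my_watch_cb,
                             my_watch_errcb, dev);
    if (IS_ERR(handle))
            return PTR_ERR(handle);
    ...
    ceph_osdc_unwatch(osdc, handle);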
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 179 ++---- include/linux/ceph/ceph_fs.h | 5 +- include/linux/ceph/osd_client.h | 97 ++-- include/linux/ceph/rados.h | 17 +- net/ceph/ceph_strings.c | 16 + net/ceph/debugfs.c | 36 ++ net/ceph/osd_client.c | 1148 ++++++++++++++++++++++++++++++--------- 7 files changed, 1067 insertions(+), 431 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fce23dc908e3..d0834c477f96 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -351,11 +351,11 @@ struct rbd_device { struct rbd_options *opts; struct ceph_object_id header_oid; + struct ceph_object_locator header_oloc; struct ceph_file_layout layout; - struct ceph_osd_event *watch_event; - struct rbd_obj_request *watch_request; + struct ceph_osd_linger_request *watch_handle; struct rbd_spec *parent_spec; u64 parent_overlap; @@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) return __rbd_obj_request_wait(obj_request, 0); } -static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request, - unsigned long timeout) -{ - return __rbd_obj_request_wait(obj_request, timeout); -} - static void rbd_img_request_complete(struct rbd_img_request *img_request) { @@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) complete_all(&obj_request->completion); } -static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p\n", __func__, obj_request); - obj_request_done_set(obj_request); -} - static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request = NULL; @@ -1877,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) case CEPH_OSD_OP_CALL: rbd_osd_call_callback(obj_request); break; - case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_WATCH: - rbd_osd_trivial_callback(obj_request); - break; default: rbd_warn(NULL, "%s: unsupported op %hu", obj_request->object_name, (unsigned short) opcode); @@ -3100,45 +3084,18 @@ out_err: obj_request_done_set(obj_request); } -static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) -{ - struct rbd_obj_request *obj_request; - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - int ret; - - obj_request = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0, - OBJ_REQUEST_NODATA); - if (!obj_request) - return -ENOMEM; +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev); +static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev); - ret = -ENOMEM; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, - obj_request); - if (!obj_request->osd_req) - goto out; - - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, - notify_id, 0, 0); - rbd_osd_req_format_read(obj_request); - - ret = rbd_obj_request_submit(osdc, obj_request); - if (ret) - goto out; - ret = rbd_obj_request_wait(obj_request); -out: - rbd_obj_request_put(obj_request); - - return ret; -} - -static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) +static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, + u64 notifier_id, void *data, size_t data_len) { - struct rbd_device *rbd_dev = (struct rbd_device *)data; + struct rbd_device *rbd_dev = arg; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; int ret; - dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, - rbd_dev->header_oid.name, (unsigned long long)notify_id, - (unsigned 
int)opcode); + dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev, + cookie, notify_id); /* * Until adequate refresh error handling is in place, there is @@ -3150,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) if (ret) rbd_warn(rbd_dev, "refresh failed: %d", ret); - ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); + ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, notify_id, cookie, + NULL, 0); if (ret) rbd_warn(rbd_dev, "notify_ack ret %d", ret); } -/* - * Send a (un)watch request and wait for the ack. Return a request - * with a ref held on success or error. - */ -static struct rbd_obj_request *rbd_obj_watch_request_helper( - struct rbd_device *rbd_dev, - bool watch) +static void rbd_watch_errcb(void *arg, u64 cookie, int err) { - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - struct ceph_options *opts = osdc->client->options; - struct rbd_obj_request *obj_request; + struct rbd_device *rbd_dev = arg; int ret; - obj_request = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0, - OBJ_REQUEST_NODATA); - if (!obj_request) - return ERR_PTR(-ENOMEM); - - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1, - obj_request); - if (!obj_request->osd_req) { - ret = -ENOMEM; - goto out; - } - - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, 0, watch); - rbd_osd_req_format_write(obj_request); - - if (watch) - ceph_osdc_set_request_linger(osdc, obj_request->osd_req); - - ret = rbd_obj_request_submit(osdc, obj_request); - if (ret) - goto out; + rbd_warn(rbd_dev, "encountered watch error: %d", err); - ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout); - if (ret) - goto out; + __rbd_dev_header_unwatch_sync(rbd_dev); - ret = obj_request->result; + ret = rbd_dev_header_watch_sync(rbd_dev); if (ret) { - if (watch) - rbd_obj_request_end(obj_request); - goto out; + rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); + return; } - return obj_request; - -out: - rbd_obj_request_put(obj_request); - return ERR_PTR(ret); + ret = rbd_dev_refresh(rbd_dev); + if (ret) + rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret); } /* @@ -3215,57 +3140,33 @@ out: static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - struct rbd_obj_request *obj_request; - int ret; + struct ceph_osd_linger_request *handle; - rbd_assert(!rbd_dev->watch_event); - rbd_assert(!rbd_dev->watch_request); - - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, - &rbd_dev->watch_event); - if (ret < 0) - return ret; - - obj_request = rbd_obj_watch_request_helper(rbd_dev, true); - if (IS_ERR(obj_request)) { - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; - return PTR_ERR(obj_request); - } + rbd_assert(!rbd_dev->watch_handle); - /* - * A watch request is set to linger, so the underlying osd - * request won't go away until we unregister it. We retain - * a pointer to the object request during that time (in - * rbd_dev->watch_request), so we'll keep a reference to it. - * We'll drop that reference after we've unregistered it in - * rbd_dev_header_unwatch_sync(). 
- */ - rbd_dev->watch_request = obj_request; + handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, rbd_watch_cb, + rbd_watch_errcb, rbd_dev); + if (IS_ERR(handle)) + return PTR_ERR(handle); + rbd_dev->watch_handle = handle; return 0; } static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) { - struct rbd_obj_request *obj_request; - - rbd_assert(rbd_dev->watch_event); - rbd_assert(rbd_dev->watch_request); + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + int ret; - rbd_obj_request_end(rbd_dev->watch_request); - rbd_obj_request_put(rbd_dev->watch_request); - rbd_dev->watch_request = NULL; + if (!rbd_dev->watch_handle) + return; - obj_request = rbd_obj_watch_request_helper(rbd_dev, false); - if (!IS_ERR(obj_request)) - rbd_obj_request_put(obj_request); - else - rbd_warn(rbd_dev, "unable to tear down watch request (%ld)", - PTR_ERR(obj_request)); + ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); + if (ret) + rbd_warn(rbd_dev, "failed to unwatch: %d", ret); - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; + rbd_dev->watch_handle = NULL; } /* @@ -4081,6 +3982,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, init_rwsem(&rbd_dev->header_rwsem); ceph_oid_init(&rbd_dev->header_oid); + ceph_oloc_init(&rbd_dev->header_oloc); rbd_dev->dev.bus = &rbd_bus_type; rbd_dev->dev.type = &rbd_device_type; @@ -5285,6 +5187,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); if (rbd_dev->image_format == 1) ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", spec->image_name, RBD_SUFFIX); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 37f28bf55ce4..3b911ff889dd 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -153,8 +153,9 @@ struct ceph_dir_layout { /* watch-notify operations */ enum { - WATCH_NOTIFY = 1, /* notifying watcher */ - WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */ + CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */ }; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 342f22f1f040..cd2dcb8939de 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -34,7 +34,7 @@ struct ceph_osd { struct rb_node o_node; struct ceph_connection o_con; struct rb_root o_requests; - struct list_head o_linger_requests; + struct rb_root o_linger_requests; struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; @@ -108,11 +108,12 @@ struct ceph_osd_req_op { } cls; struct { u64 cookie; - u64 ver; - u32 prot_ver; - u32 timeout; - __u8 flag; + __u8 op; /* CEPH_OSD_WATCH_OP_ */ + u32 gen; } watch; + struct { + struct ceph_osd_data request_data; + } notify_ack; struct { u64 expected_object_size; u64 expected_write_size; @@ -145,8 +146,6 @@ struct ceph_osd_request_target { struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; - struct list_head r_linger_item; - struct list_head r_linger_osd_item; struct ceph_osd *r_osd; struct ceph_osd_request_target r_t; @@ -162,7 +161,6 @@ struct ceph_osd_request { int r_result; bool r_got_reply; - int r_linger; struct ceph_osd_client *r_osdc; struct kref r_kref; @@ -181,6 +179,7 @@ 
struct ceph_osd_request { struct ceph_snap_context *r_snapc; /* for writes */ struct timespec r_mtime; /* ditto */ u64 r_data_offset; /* ditto */ + bool r_linger; /* don't resend on failure */ /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ @@ -195,23 +194,40 @@ struct ceph_request_redirect { struct ceph_object_locator oloc; }; -struct ceph_osd_event { - u64 cookie; - int one_shot; +typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie, + u64 notifier_id, void *data, size_t data_len); +typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err); + +struct ceph_osd_linger_request { struct ceph_osd_client *osdc; - void (*cb)(u64, u64, u8, void *); - void *data; - struct rb_node node; - struct list_head osd_node; + u64 linger_id; + bool committed; + + struct ceph_osd *osd; + struct ceph_osd_request *reg_req; + struct ceph_osd_request *ping_req; + unsigned long ping_sent; + + struct ceph_osd_request_target t; + u32 last_force_resend; + + struct timespec mtime; + struct kref kref; -}; + struct mutex lock; + struct rb_node node; /* osd */ + struct rb_node osdc_node; /* osdc */ + struct list_head scan_item; + + struct completion reg_commit_wait; + int reg_commit_error; + int last_error; + + u32 register_gen; -struct ceph_osd_event_work { - struct work_struct work; - struct ceph_osd_event *event; - u64 ver; - u64 notify_id; - u8 opcode; + rados_watchcb2_t wcb; + rados_watcherrcb_t errcb; + void *data; }; struct ceph_osd_client { @@ -223,9 +239,10 @@ struct ceph_osd_client { struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ spinlock_t osd_lru_lock; - struct list_head req_linger; /* lingering requests */ struct ceph_osd homeless_osd; atomic64_t last_tid; /* tid of last request */ + u64 last_linger_id; + struct rb_root linger_requests; /* lingering requests */ atomic_t num_requests; atomic_t num_homeless; struct delayed_work timeout_work; @@ -239,10 +256,6 @@ struct ceph_osd_client { struct ceph_msgpool msgpool_op; struct ceph_msgpool msgpool_op_reply; - spinlock_t event_lock; - struct rb_root event_tree; - u64 event_count; - struct workqueue_struct *notify_wq; }; @@ -314,9 +327,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *name, const void *value, size_t size, u8 cmp_op, u8 cmp_mode); -extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, - u64 cookie, u64 version, int flag); extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, u64 expected_object_size, @@ -339,9 +349,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, u32 truncate_seq, u64 truncate_size, bool use_mempool); -extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); - extern void ceph_osdc_get_request(struct ceph_osd_request *req); extern void ceph_osdc_put_request(struct ceph_osd_request *req); @@ -372,11 +379,23 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct timespec *mtime, struct page **pages, int nr_pages); -/* watch/notify events */ -extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, - void (*event_cb)(u64, u64, u8, void *), - void *data, struct ceph_osd_event **pevent); -extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); -extern void ceph_osdc_put_event(struct ceph_osd_event *event); +/* watch/notify */ 
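+/* ceph_osdc_watch() hands back the linger handle (or an ERR_PTR on failure); + * it stays valid until passed to ceph_osdc_unwatch(). */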
+struct ceph_osd_linger_request * +ceph_osdc_watch(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + rados_watchcb2_t wcb, + rados_watcherrcb_t errcb, + void *data); +int ceph_osdc_unwatch(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq); + +int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + u64 notify_id, + u64 cookie, + void *payload, + size_t payload_len); #endif diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 28740a58f32c..204c8c944703 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -427,7 +427,17 @@ enum { CEPH_OSD_CMPXATTR_MODE_U64 = 2 }; -#define RADOS_NOTIFY_VER 1 +enum { + CEPH_OSD_WATCH_OP_UNWATCH = 0, + CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, + /* note: use only ODD ids to prevent pre-giant code from + interpreting the op as UNWATCH */ + CEPH_OSD_WATCH_OP_WATCH = 3, + CEPH_OSD_WATCH_OP_RECONNECT = 5, + CEPH_OSD_WATCH_OP_PING = 7, +}; + +const char *ceph_osd_watch_op_name(int o); /* * an individual object operation. each may be accompanied by some data @@ -462,8 +472,9 @@ struct ceph_osd_op { } __attribute__ ((packed)) snap; struct { __le64 cookie; - __le64 ver; - __u8 flag; /* 0 = unwatch, 1 = watch */ + __le64 ver; /* no longer used */ + __u8 op; /* CEPH_OSD_WATCH_OP_* */ + __le32 gen; /* registration generation */ } __attribute__ ((packed)) watch; struct { __le64 offset, length; diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 139a9cb19b0c..3773a4fa11e3 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE) } } +const char *ceph_osd_watch_op_name(int o) +{ + switch (o) { + case CEPH_OSD_WATCH_OP_UNWATCH: + return "unwatch"; + case CEPH_OSD_WATCH_OP_WATCH: + return "watch"; + case CEPH_OSD_WATCH_OP_RECONNECT: + return "reconnect"; + case CEPH_OSD_WATCH_OP_PING: + return "ping"; + default: + return "???"; + } +} + const char *ceph_osd_state_name(int s) { switch (s) { diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 61dbd9de4650..e64cb8583533 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -177,6 +177,9 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req) seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), ceph_osd_op_name(op->op)); + if (op->op == CEPH_OSD_OP_WATCH) + seq_printf(s, "-%s", + ceph_osd_watch_op_name(op->watch.op)); } seq_putc(s, '\n'); @@ -197,6 +200,31 @@ static void dump_requests(struct seq_file *s, struct ceph_osd *osd) mutex_unlock(&osd->lock); } +static void dump_linger_request(struct seq_file *s, + struct ceph_osd_linger_request *lreq) +{ + seq_printf(s, "%llu\t", lreq->linger_id); + dump_target(s, &lreq->t); + + seq_printf(s, "\t%u\t%s/%d\n", lreq->register_gen, + lreq->committed ? 
"C" : "", lreq->last_error); +} + +static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd) +{ + struct rb_node *n; + + mutex_lock(&osd->lock); + for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + + dump_linger_request(s, lreq); + } + + mutex_unlock(&osd->lock); +} + static int osdc_show(struct seq_file *s, void *pp) { struct ceph_client *client = s->private; @@ -214,6 +242,14 @@ static int osdc_show(struct seq_file *s, void *pp) } dump_requests(s, &osdc->homeless_osd); + seq_puts(s, "LINGER REQUESTS\n"); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + dump_linger_requests(s, osd); + } + dump_linger_requests(s, &osdc->homeless_osd); + up_read(&osdc->lock); return 0; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ef1bcbe9af2d..ca0a7b58ba4f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -45,6 +45,10 @@ static const struct ceph_connection_operations osd_con_ops; static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req); static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req); +static void link_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq); +static void unlink_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq); #if 1 static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem) @@ -74,10 +78,15 @@ static inline void verify_osd_locked(struct ceph_osd *osd) rwsem_is_locked(&osdc->lock)) && !rwsem_is_wrlocked(&osdc->lock)); } +static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) +{ + WARN_ON(!mutex_is_locked(&lreq->lock)); +} #else static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { } static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { } static inline void verify_osd_locked(struct ceph_osd *osd) { } +static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { } #endif /* @@ -322,6 +331,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_STAT: ceph_osd_data_release(&op->raw_data_in); break; + case CEPH_OSD_OP_NOTIFY_ACK: + ceph_osd_data_release(&op->notify_ack.request_data); + break; default: break; } @@ -345,6 +357,29 @@ static void target_init(struct ceph_osd_request_target *t) t->osd = CEPH_HOMELESS_OSD; } +static void target_copy(struct ceph_osd_request_target *dest, + const struct ceph_osd_request_target *src) +{ + ceph_oid_copy(&dest->base_oid, &src->base_oid); + ceph_oloc_copy(&dest->base_oloc, &src->base_oloc); + ceph_oid_copy(&dest->target_oid, &src->target_oid); + ceph_oloc_copy(&dest->target_oloc, &src->target_oloc); + + dest->pgid = src->pgid; /* struct */ + dest->pg_num = src->pg_num; + dest->pg_num_mask = src->pg_num_mask; + ceph_osds_copy(&dest->acting, &src->acting); + ceph_osds_copy(&dest->up, &src->up); + dest->size = src->size; + dest->min_size = src->min_size; + dest->sort_bitwise = src->sort_bitwise; + + dest->flags = src->flags; + dest->paused = src->paused; + + dest->osd = src->osd; +} + static void target_destroy(struct ceph_osd_request_target *t) { ceph_oid_destroy(&t->base_oid); @@ -357,8 +392,6 @@ static void target_destroy(struct ceph_osd_request_target *t) static void request_release_checks(struct ceph_osd_request *req) { WARN_ON(!RB_EMPTY_NODE(&req->r_node)); - WARN_ON(!list_empty(&req->r_linger_item)); - 
WARN_ON(!list_empty(&req->r_linger_osd_item)); WARN_ON(!list_empty(&req->r_unsafe_item)); WARN_ON(req->r_osd); } @@ -419,13 +452,48 @@ static void request_init(struct ceph_osd_request *req) init_completion(&req->r_completion); init_completion(&req->r_safe_completion); RB_CLEAR_NODE(&req->r_node); - INIT_LIST_HEAD(&req->r_linger_item); - INIT_LIST_HEAD(&req->r_linger_osd_item); INIT_LIST_HEAD(&req->r_unsafe_item); target_init(&req->r_t); } +/* + * This is ugly, but it allows us to reuse linger registration and ping + * requests, keeping the structure of the code around send_linger{_ping}() + * reasonable. Setting up a min_nr=2 mempool for each linger request + * and dealing with copying ops (this blasts req only, watch op remains + * intact) isn't any better. + */ +static void request_reinit(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + bool mempool = req->r_mempool; + unsigned int num_ops = req->r_num_ops; + u64 snapid = req->r_snapid; + struct ceph_snap_context *snapc = req->r_snapc; + bool linger = req->r_linger; + struct ceph_msg *request_msg = req->r_request; + struct ceph_msg *reply_msg = req->r_reply; + + dout("%s req %p\n", __func__, req); + WARN_ON(atomic_read(&req->r_kref.refcount) != 1); + request_release_checks(req); + + WARN_ON(atomic_read(&request_msg->kref.refcount) != 1); + WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1); + target_destroy(&req->r_t); + + request_init(req); + req->r_osdc = osdc; + req->r_mempool = mempool; + req->r_num_ops = num_ops; + req->r_snapid = snapid; + req->r_snapc = snapc; + req->r_linger = linger; + req->r_request = request_msg; + req->r_reply = reply_msg; +} + struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_ops, @@ -681,21 +749,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, } EXPORT_SYMBOL(osd_req_op_xattr_init); -void osd_req_op_watch_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, - u64 cookie, u64 version, int flag) +/* + * @watch_opcode: CEPH_OSD_WATCH_OP_* + */ +static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, + u64 cookie, u8 watch_opcode) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - opcode, 0); - - BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + struct ceph_osd_req_op *op; + op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); op->watch.cookie = cookie; - op->watch.ver = version; - if (opcode == CEPH_OSD_OP_WATCH && flag) - op->watch.flag = (u8)1; + op->watch.op = watch_opcode; + op->watch.gen = 0; } -EXPORT_SYMBOL(osd_req_op_watch_init); void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, @@ -771,11 +837,13 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, break; case CEPH_OSD_OP_STARTSYNC: break; - case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_WATCH: dst->watch.cookie = cpu_to_le64(src->watch.cookie); - dst->watch.ver = cpu_to_le64(src->watch.ver); - dst->watch.flag = src->watch.flag; + dst->watch.ver = cpu_to_le64(0); + dst->watch.op = src->watch.op; + dst->watch.gen = cpu_to_le32(src->watch.gen); + break; + case CEPH_OSD_OP_NOTIFY_ACK: break; case CEPH_OSD_OP_SETALLOCHINT: dst->alloc_hint.expected_object_size = @@ -915,7 +983,7 @@ static void osd_init(struct ceph_osd *osd) atomic_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); osd->o_requests = RB_ROOT; - INIT_LIST_HEAD(&osd->o_linger_requests); + osd->o_linger_requests = RB_ROOT; 
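+ /* linger requests now live in a per-session rbtree keyed by linger_id */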
INIT_LIST_HEAD(&osd->o_osd_lru); INIT_LIST_HEAD(&osd->o_keepalive_item); osd->o_incarnation = 1; @@ -926,7 +994,7 @@ static void osd_cleanup(struct ceph_osd *osd) { WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); - WARN_ON(!list_empty(&osd->o_linger_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); WARN_ON(!list_empty(&osd->o_osd_lru)); WARN_ON(!list_empty(&osd->o_keepalive_item)); @@ -996,7 +1064,7 @@ static void __move_osd_to_lru(struct ceph_osd *osd) static void maybe_move_osd_to_lru(struct ceph_osd *osd) { if (RB_EMPTY_ROOT(&osd->o_requests) && - list_empty(&osd->o_linger_requests)) + RB_EMPTY_ROOT(&osd->o_linger_requests)) __move_osd_to_lru(osd); } @@ -1036,6 +1104,17 @@ static void close_osd(struct ceph_osd *osd) unlink_request(osd, req); link_request(&osdc->homeless_osd, req); } + for (n = rb_first(&osd->o_linger_requests); n; ) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + + n = rb_next(n); /* unlink_linger() */ + + dout(" reassigning lreq %p linger_id %llu\n", lreq, + lreq->linger_id); + unlink_linger(osd, lreq); + link_linger(&osdc->homeless_osd, lreq); + } __remove_osd_from_lru(osd); erase_osd(&osdc->osds, osd); @@ -1052,7 +1131,7 @@ static int reopen_osd(struct ceph_osd *osd) dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); if (RB_EMPTY_ROOT(&osd->o_requests) && - list_empty(&osd->o_linger_requests)) { + RB_EMPTY_ROOT(&osd->o_linger_requests)) { close_osd(osd); return -ENODEV; } @@ -1148,52 +1227,6 @@ static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req) atomic_dec(&osd->o_osdc->num_homeless); } -static void __register_linger_request(struct ceph_osd *osd, - struct ceph_osd_request *req) -{ - dout("%s %p tid %llu\n", __func__, req, req->r_tid); - WARN_ON(!req->r_linger); - - ceph_osdc_get_request(req); - list_add_tail(&req->r_linger_item, &osd->o_osdc->req_linger); - list_add_tail(&req->r_linger_osd_item, &osd->o_linger_requests); - __remove_osd_from_lru(osd); - req->r_osd = osd; -} - -static void __unregister_linger_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - WARN_ON(!req->r_linger); - - if (list_empty(&req->r_linger_item)) { - dout("%s %p tid %llu not registered\n", __func__, req, - req->r_tid); - return; - } - - dout("%s %p tid %llu\n", __func__, req, req->r_tid); - list_del_init(&req->r_linger_item); - - if (req->r_osd) { - list_del_init(&req->r_linger_osd_item); - maybe_move_osd_to_lru(req->r_osd); - if (RB_EMPTY_ROOT(&req->r_osd->o_requests)) - req->r_osd = NULL; - } - ceph_osdc_put_request(req); -} - -void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - if (!req->r_linger) { - dout("set_request_linger %p\n", req); - req->r_linger = 1; - } -} -EXPORT_SYMBOL(ceph_osdc_set_request_linger); - static bool __pool_full(struct ceph_pg_pool_info *pi) { return pi->flags & CEPH_POOL_FLAG_FULL; @@ -1379,6 +1412,10 @@ static void setup_request_data(struct ceph_osd_request *req, op->xattr.value_len); ceph_osdc_msg_data_add(msg, &op->xattr.osd_data); break; + case CEPH_OSD_OP_NOTIFY_ACK: + ceph_osdc_msg_data_add(msg, + &op->notify_ack.request_data); + break; /* reply */ case CEPH_OSD_OP_STAT: @@ -1683,6 +1720,460 @@ static void cancel_request(struct ceph_osd_request *req) finish_request(req); } +/* + * lingering requests, watch/notify v2 infrastructure + */ +static void linger_release(struct kref *kref) +{ + struct ceph_osd_linger_request *lreq = + container_of(kref, struct 
ceph_osd_linger_request, kref); + + dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq, + lreq->reg_req, lreq->ping_req); + WARN_ON(!RB_EMPTY_NODE(&lreq->node)); + WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node)); + WARN_ON(!list_empty(&lreq->scan_item)); + WARN_ON(lreq->osd); + + if (lreq->reg_req) + ceph_osdc_put_request(lreq->reg_req); + if (lreq->ping_req) + ceph_osdc_put_request(lreq->ping_req); + target_destroy(&lreq->t); + kfree(lreq); +} + +static void linger_put(struct ceph_osd_linger_request *lreq) +{ + if (lreq) + kref_put(&lreq->kref, linger_release); +} + +static struct ceph_osd_linger_request * +linger_get(struct ceph_osd_linger_request *lreq) +{ + kref_get(&lreq->kref); + return lreq; +} + +static struct ceph_osd_linger_request * +linger_alloc(struct ceph_osd_client *osdc) +{ + struct ceph_osd_linger_request *lreq; + + lreq = kzalloc(sizeof(*lreq), GFP_NOIO); + if (!lreq) + return NULL; + + kref_init(&lreq->kref); + mutex_init(&lreq->lock); + RB_CLEAR_NODE(&lreq->node); + RB_CLEAR_NODE(&lreq->osdc_node); + INIT_LIST_HEAD(&lreq->scan_item); + init_completion(&lreq->reg_commit_wait); + + lreq->osdc = osdc; + target_init(&lreq->t); + + dout("%s lreq %p\n", __func__, lreq); + return lreq; +} + +DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node) +DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node) + +/* + * Create linger request <-> OSD session relation. + * + * @lreq has to be registered, @osd may be homeless. + */ +static void link_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq) +{ + verify_osd_locked(osd); + WARN_ON(!lreq->linger_id || lreq->osd); + dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, + osd->o_osd, lreq, lreq->linger_id); + + if (!osd_homeless(osd)) + __remove_osd_from_lru(osd); + else + atomic_inc(&osd->o_osdc->num_homeless); + + get_osd(osd); + insert_linger(&osd->o_linger_requests, lreq); + lreq->osd = osd; +} + +static void unlink_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq) +{ + verify_osd_locked(osd); + WARN_ON(lreq->osd != osd); + dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, + osd->o_osd, lreq, lreq->linger_id); + + lreq->osd = NULL; + erase_linger(&osd->o_linger_requests, lreq); + put_osd(osd); + + if (!osd_homeless(osd)) + maybe_move_osd_to_lru(osd); + else + atomic_dec(&osd->o_osdc->num_homeless); +} + +static bool __linger_registered(struct ceph_osd_linger_request *lreq) +{ + verify_osdc_locked(lreq->osdc); + + return !RB_EMPTY_NODE(&lreq->osdc_node); +} + +static bool linger_registered(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + bool registered; + + down_read(&osdc->lock); + registered = __linger_registered(lreq); + up_read(&osdc->lock); + + return registered; +} + +static void linger_register(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + + verify_osdc_wrlocked(osdc); + WARN_ON(lreq->linger_id); + + linger_get(lreq); + lreq->linger_id = ++osdc->last_linger_id; + insert_linger_osdc(&osdc->linger_requests, lreq); +} + +static void linger_unregister(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + + verify_osdc_wrlocked(osdc); + + erase_linger_osdc(&osdc->linger_requests, lreq); + linger_put(lreq); +} + +static void cancel_linger_request(struct ceph_osd_request *req) +{ + struct ceph_osd_linger_request *lreq = req->r_priv; + + WARN_ON(!req->r_linger); + cancel_request(req); + linger_put(lreq); 
+} + +struct linger_work { + struct work_struct work; + struct ceph_osd_linger_request *lreq; + + union { + struct { + u64 notify_id; + u64 notifier_id; + void *payload; /* points into @msg front */ + size_t payload_len; + + struct ceph_msg *msg; /* for ceph_msg_put() */ + } notify; + struct { + int err; + } error; + }; +}; + +static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq, + work_func_t workfn) +{ + struct linger_work *lwork; + + lwork = kzalloc(sizeof(*lwork), GFP_NOIO); + if (!lwork) + return NULL; + + INIT_WORK(&lwork->work, workfn); + lwork->lreq = linger_get(lreq); + + return lwork; +} + +static void lwork_free(struct linger_work *lwork) +{ + struct ceph_osd_linger_request *lreq = lwork->lreq; + + linger_put(lreq); + kfree(lwork); +} + +static void lwork_queue(struct linger_work *lwork) +{ + struct ceph_osd_linger_request *lreq = lwork->lreq; + struct ceph_osd_client *osdc = lreq->osdc; + + verify_lreq_locked(lreq); + queue_work(osdc->notify_wq, &lwork->work); +} + +static void do_watch_notify(struct work_struct *w) +{ + struct linger_work *lwork = container_of(w, struct linger_work, work); + struct ceph_osd_linger_request *lreq = lwork->lreq; + + if (!linger_registered(lreq)) { + dout("%s lreq %p not registered\n", __func__, lreq); + goto out; + } + + dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n", + __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id, + lwork->notify.payload_len); + lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id, + lwork->notify.notifier_id, lwork->notify.payload, + lwork->notify.payload_len); + +out: + ceph_msg_put(lwork->notify.msg); + lwork_free(lwork); +} + +static void do_watch_error(struct work_struct *w) +{ + struct linger_work *lwork = container_of(w, struct linger_work, work); + struct ceph_osd_linger_request *lreq = lwork->lreq; + + if (!linger_registered(lreq)) { + dout("%s lreq %p not registered\n", __func__, lreq); + goto out; + } + + dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err); + lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err); + +out: + lwork_free(lwork); +} + +static void queue_watch_error(struct ceph_osd_linger_request *lreq) +{ + struct linger_work *lwork; + + lwork = lwork_alloc(lreq, do_watch_error); + if (!lwork) { + pr_err("failed to allocate error-lwork\n"); + return; + } + + lwork->error.err = lreq->last_error; + lwork_queue(lwork); +} + +static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq, + int result) +{ + if (!completion_done(&lreq->reg_commit_wait)) { + lreq->reg_commit_error = (result <= 0 ? result : 0); + complete_all(&lreq->reg_commit_wait); + } +} + +static void linger_commit_cb(struct ceph_osd_request *req) +{ + struct ceph_osd_linger_request *lreq = req->r_priv; + + mutex_lock(&lreq->lock); + dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, + lreq->linger_id, req->r_result); + WARN_ON(!__linger_registered(lreq)); + linger_reg_commit_complete(lreq, req->r_result); + lreq->committed = true; + + mutex_unlock(&lreq->lock); + linger_put(lreq); +} + +static int normalize_watch_error(int err) +{ + /* + * Translate ENOENT -> ENOTCONN so that a delete->disconnection + * notification and a failure to reconnect because we raced with + * the delete appear the same to the user. 
+ */ + if (err == -ENOENT) + err = -ENOTCONN; + + return err; +} + +static void linger_reconnect_cb(struct ceph_osd_request *req) +{ + struct ceph_osd_linger_request *lreq = req->r_priv; + + mutex_lock(&lreq->lock); + dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__, + lreq, lreq->linger_id, req->r_result, lreq->last_error); + if (req->r_result < 0) { + if (!lreq->last_error) { + lreq->last_error = normalize_watch_error(req->r_result); + queue_watch_error(lreq); + } + } + + mutex_unlock(&lreq->lock); + linger_put(lreq); +} + +static void send_linger(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_request *req = lreq->reg_req; + struct ceph_osd_req_op *op = &req->r_ops[0]; + + verify_osdc_wrlocked(req->r_osdc); + dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); + + if (req->r_osd) + cancel_linger_request(req); + + request_reinit(req); + ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); + ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); + req->r_flags = lreq->t.flags; + req->r_mtime = lreq->mtime; + + mutex_lock(&lreq->lock); + if (lreq->committed) { + WARN_ON(op->op != CEPH_OSD_OP_WATCH || + op->watch.cookie != lreq->linger_id); + op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT; + op->watch.gen = ++lreq->register_gen; + dout("lreq %p reconnect register_gen %u\n", lreq, + op->watch.gen); + req->r_callback = linger_reconnect_cb; + } else { + WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH); + dout("lreq %p register\n", lreq); + req->r_callback = linger_commit_cb; + } + mutex_unlock(&lreq->lock); + + req->r_priv = linger_get(lreq); + req->r_linger = true; + + submit_request(req, true); +} + +static void linger_ping_cb(struct ceph_osd_request *req) +{ + struct ceph_osd_linger_request *lreq = req->r_priv; + + mutex_lock(&lreq->lock); + dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n", + __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent, + lreq->last_error); + if (lreq->register_gen == req->r_ops[0].watch.gen) { + if (req->r_result && !lreq->last_error) { + lreq->last_error = normalize_watch_error(req->r_result); + queue_watch_error(lreq); + } + } else { + dout("lreq %p register_gen %u ignoring old pong %u\n", lreq, + lreq->register_gen, req->r_ops[0].watch.gen); + } + + mutex_unlock(&lreq->lock); + linger_put(lreq); +} + +static void send_linger_ping(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_request *req = lreq->ping_req; + struct ceph_osd_req_op *op = &req->r_ops[0]; + + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { + dout("%s PAUSERD\n", __func__); + return; + } + + lreq->ping_sent = jiffies; + dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n", + __func__, lreq, lreq->linger_id, lreq->ping_sent, + lreq->register_gen); + + if (req->r_osd) + cancel_linger_request(req); + + request_reinit(req); + target_copy(&req->r_t, &lreq->t); + + WARN_ON(op->op != CEPH_OSD_OP_WATCH || + op->watch.cookie != lreq->linger_id || + op->watch.op != CEPH_OSD_WATCH_OP_PING); + op->watch.gen = lreq->register_gen; + req->r_callback = linger_ping_cb; + req->r_priv = linger_get(lreq); + req->r_linger = true; + + ceph_osdc_get_request(req); + account_request(req); + req->r_tid = atomic64_inc_return(&osdc->last_tid); + link_request(lreq->osd, req); + send_request(req); +} + +static void linger_submit(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd *osd; + + calc_target(osdc, &lreq->t, 
&lreq->last_force_resend, false); + osd = lookup_create_osd(osdc, lreq->t.osd, true); + link_linger(osd, lreq); + + send_linger(lreq); +} + +/* + * @lreq has to be both registered and linked. + */ +static void __linger_cancel(struct ceph_osd_linger_request *lreq) +{ + if (lreq->ping_req->r_osd) + cancel_linger_request(lreq->ping_req); + if (lreq->reg_req->r_osd) + cancel_linger_request(lreq->reg_req); + unlink_linger(lreq->osd, lreq); + linger_unregister(lreq); +} + +static void linger_cancel(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + + down_write(&osdc->lock); + if (__linger_registered(lreq)) + __linger_cancel(lreq); + up_write(&osdc->lock); +} + +static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) +{ + int ret; + + dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); + ret = wait_for_completion_interruptible(&lreq->reg_commit_wait); + return ret ?: lreq->reg_commit_error; +} + /* * Timeout callback, called every N seconds. When 1 or more OSD * requests has been active for more than N seconds, we send a keepalive @@ -1720,6 +2211,19 @@ static void handle_timeout(struct work_struct *work) found = true; } } + for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) { + struct ceph_osd_linger_request *lreq = + rb_entry(p, struct ceph_osd_linger_request, node); + + dout(" lreq %p linger_id %llu is served by osd%d\n", + lreq, lreq->linger_id, osd->o_osd); + found = true; + + mutex_lock(&lreq->lock); + if (lreq->committed && !lreq->last_error) + send_linger_ping(lreq); + mutex_unlock(&lreq->lock); + } if (found) list_move_tail(&osd->o_keepalive_item, &slow_osds); @@ -1756,7 +2260,7 @@ static void handle_osds_timeout(struct work_struct *work) break; WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); - WARN_ON(!list_empty(&osd->o_linger_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); close_osd(osd); } @@ -2082,7 +2586,8 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) __finish_request(req); if (req->r_linger) { WARN_ON(req->r_unsafe_callback); - __register_linger_request(osd, req); + dout("req %p tid %llu cb (locked)\n", req, req->r_tid); + __complete_request(req); } } @@ -2093,7 +2598,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) if (already_acked && req->r_unsafe_callback) { dout("req %p tid %llu safe-cb\n", req, req->r_tid); req->r_unsafe_callback(req, false); - } else { + } else if (!req->r_linger) { dout("req %p tid %llu cb\n", req, req->r_tid); __complete_request(req); } @@ -2145,6 +2650,26 @@ static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id) return pi->was_full && !__pool_full(pi); } +static enum calc_target_result +recalc_linger_target(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + enum calc_target_result ct_res; + + ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true); + if (ct_res == CALC_TARGET_NEED_RESEND) { + struct ceph_osd *osd; + + osd = lookup_create_osd(osdc, lreq->t.osd, true); + if (osd != lreq->osd) { + unlink_linger(lreq->osd, lreq); + link_linger(osd, lreq); + } + } + + return ct_res; +} + /* * Requeue requests whose mapping to an OSD has changed. 
*/ @@ -2159,6 +2684,39 @@ static void scan_requests(struct ceph_osd *osd, struct rb_node *n; bool force_resend_writes; + for (n = rb_first(&osd->o_linger_requests); n; ) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + enum calc_target_result ct_res; + + n = rb_next(n); /* recalc_linger_target() */ + + dout("%s lreq %p linger_id %llu\n", __func__, lreq, + lreq->linger_id); + ct_res = recalc_linger_target(lreq); + switch (ct_res) { + case CALC_TARGET_NO_ACTION: + force_resend_writes = cleared_full || + (check_pool_cleared_full && + pool_cleared_full(osdc, lreq->t.base_oloc.pool)); + if (!force_resend && !force_resend_writes) + break; + + /* fall through */ + case CALC_TARGET_NEED_RESEND: + /* + * scan_requests() for the previous epoch(s) + * may have already added it to the list, since + * it's not unlinked here. + */ + if (list_empty(&lreq->scan_item)) + list_add_tail(&lreq->scan_item, need_resend_linger); + break; + case CALC_TARGET_POOL_DNE: + break; + } + } + for (n = rb_first(&osd->o_requests); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); @@ -2263,6 +2821,7 @@ static void kick_requests(struct ceph_osd_client *osdc, struct rb_root *need_resend, struct list_head *need_resend_linger) { + struct ceph_osd_linger_request *lreq, *nlreq; struct rb_node *n; for (n = rb_first(need_resend); n; ) { @@ -2280,8 +2839,17 @@ static void kick_requests(struct ceph_osd_client *osdc, if (!req->r_linger) { if (!osd_homeless(osd) && !req->r_t.paused) send_request(req); + } else { + cancel_linger_request(req); } } + + list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) { + if (!osd_homeless(lreq->osd)) + send_linger(lreq); + + list_del_init(&lreq->scan_item); + } } /* @@ -2406,15 +2974,25 @@ static void kick_osd_requests(struct ceph_osd *osd) { struct rb_node *n; - for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + for (n = rb_first(&osd->o_requests); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); + n = rb_next(n); /* cancel_linger_request() */ + if (!req->r_linger) { if (!req->r_t.paused) send_request(req); + } else { + cancel_linger_request(req); } } + for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + + send_linger(lreq); + } } /* @@ -2441,193 +3019,77 @@ out_unlock: up_write(&osdc->lock); } -/* - * watch/notify callback event infrastructure - * - * These callbacks are used both for watch and notify operations. 
- */ -static void __release_event(struct kref *kref) -{ - struct ceph_osd_event *event = - container_of(kref, struct ceph_osd_event, kref); - - dout("__release_event %p\n", event); - kfree(event); -} - -static void get_event(struct ceph_osd_event *event) -{ - kref_get(&event->kref); -} - -void ceph_osdc_put_event(struct ceph_osd_event *event) -{ - kref_put(&event->kref, __release_event); -} -EXPORT_SYMBOL(ceph_osdc_put_event); - -static void __insert_event(struct ceph_osd_client *osdc, - struct ceph_osd_event *new) -{ - struct rb_node **p = &osdc->event_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd_event *event = NULL; - - while (*p) { - parent = *p; - event = rb_entry(parent, struct ceph_osd_event, node); - if (new->cookie < event->cookie) - p = &(*p)->rb_left; - else if (new->cookie > event->cookie) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &osdc->event_tree); -} - -static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, - u64 cookie) -{ - struct rb_node **p = &osdc->event_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd_event *event = NULL; - - while (*p) { - parent = *p; - event = rb_entry(parent, struct ceph_osd_event, node); - if (cookie < event->cookie) - p = &(*p)->rb_left; - else if (cookie > event->cookie) - p = &(*p)->rb_right; - else - return event; - } - return NULL; -} - -static void __remove_event(struct ceph_osd_event *event) -{ - struct ceph_osd_client *osdc = event->osdc; - - if (!RB_EMPTY_NODE(&event->node)) { - dout("__remove_event removed %p\n", event); - rb_erase(&event->node, &osdc->event_tree); - ceph_osdc_put_event(event); - } else { - dout("__remove_event didn't remove %p\n", event); - } -} - -int ceph_osdc_create_event(struct ceph_osd_client *osdc, - void (*event_cb)(u64, u64, u8, void *), - void *data, struct ceph_osd_event **pevent) -{ - struct ceph_osd_event *event; - - event = kmalloc(sizeof(*event), GFP_NOIO); - if (!event) - return -ENOMEM; - - dout("create_event %p\n", event); - event->cb = event_cb; - event->one_shot = 0; - event->data = data; - event->osdc = osdc; - INIT_LIST_HEAD(&event->osd_node); - RB_CLEAR_NODE(&event->node); - kref_init(&event->kref); /* one ref for us */ - kref_get(&event->kref); /* one ref for the caller */ - - spin_lock(&osdc->event_lock); - event->cookie = ++osdc->event_count; - __insert_event(osdc, event); - spin_unlock(&osdc->event_lock); - - *pevent = event; - return 0; -} -EXPORT_SYMBOL(ceph_osdc_create_event); - -void ceph_osdc_cancel_event(struct ceph_osd_event *event) -{ - struct ceph_osd_client *osdc = event->osdc; - - dout("cancel_event %p\n", event); - spin_lock(&osdc->event_lock); - __remove_event(event); - spin_unlock(&osdc->event_lock); - ceph_osdc_put_event(event); /* caller's */ -} -EXPORT_SYMBOL(ceph_osdc_cancel_event); - - -static void do_event_work(struct work_struct *work) -{ - struct ceph_osd_event_work *event_work = - container_of(work, struct ceph_osd_event_work, work); - struct ceph_osd_event *event = event_work->event; - u64 ver = event_work->ver; - u64 notify_id = event_work->notify_id; - u8 opcode = event_work->opcode; - - dout("do_event_work completing %p\n", event); - event->cb(ver, notify_id, opcode, event->data); - dout("do_event_work completed %p\n", event); - ceph_osdc_put_event(event); - kfree(event_work); -} - - /* * Process osd watch notifications */ static void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) { - void *p, *end; - u8 proto_ver; - u64 
cookie, ver, notify_id; - u8 opcode; - struct ceph_osd_event *event; - struct ceph_osd_event_work *event_work; - - p = msg->front.iov_base; - end = p + msg->front.iov_len; + void *p = msg->front.iov_base; + void *const end = p + msg->front.iov_len; + struct ceph_osd_linger_request *lreq; + struct linger_work *lwork; + u8 proto_ver, opcode; + u64 cookie, notify_id; + u64 notifier_id = 0; + void *payload = NULL; + u32 payload_len = 0; ceph_decode_8_safe(&p, end, proto_ver, bad); ceph_decode_8_safe(&p, end, opcode, bad); ceph_decode_64_safe(&p, end, cookie, bad); - ceph_decode_64_safe(&p, end, ver, bad); + p += 8; /* skip ver */ ceph_decode_64_safe(&p, end, notify_id, bad); - spin_lock(&osdc->event_lock); - event = __find_event(osdc, cookie); - if (event) { - BUG_ON(event->one_shot); - get_event(event); - } - spin_unlock(&osdc->event_lock); - dout("handle_watch_notify cookie %lld ver %lld event %p\n", - cookie, ver, event); - if (event) { - event_work = kmalloc(sizeof(*event_work), GFP_NOIO); - if (!event_work) { - pr_err("couldn't allocate event_work\n"); - ceph_osdc_put_event(event); - return; + if (proto_ver >= 1) { + ceph_decode_32_safe(&p, end, payload_len, bad); + ceph_decode_need(&p, end, payload_len, bad); + payload = p; + p += payload_len; + } + + if (le16_to_cpu(msg->hdr.version) >= 2) + p += 4; /* skip return_code */ + + if (le16_to_cpu(msg->hdr.version) >= 3) + ceph_decode_64_safe(&p, end, notifier_id, bad); + + down_read(&osdc->lock); + lreq = lookup_linger_osdc(&osdc->linger_requests, cookie); + if (!lreq) { + dout("%s opcode %d cookie %llu dne\n", __func__, opcode, + cookie); + goto out_unlock_osdc; + } + + mutex_lock(&lreq->lock); + dout("%s opcode %d cookie %llu lreq %p\n", __func__, opcode, cookie, + lreq); + if (opcode == CEPH_WATCH_EVENT_DISCONNECT) { + if (!lreq->last_error) { + lreq->last_error = -ENOTCONN; + queue_watch_error(lreq); + } + } else { + /* CEPH_WATCH_EVENT_NOTIFY */ + lwork = lwork_alloc(lreq, do_watch_notify); + if (!lwork) { + pr_err("failed to allocate notify-lwork\n"); + goto out_unlock_lreq; } - INIT_WORK(&event_work->work, do_event_work); - event_work->event = event; - event_work->ver = ver; - event_work->notify_id = notify_id; - event_work->opcode = opcode; - queue_work(osdc->notify_wq, &event_work->work); + lwork->notify.notify_id = notify_id; + lwork->notify.notifier_id = notifier_id; + lwork->notify.payload = payload; + lwork->notify.payload_len = payload_len; + lwork->notify.msg = ceph_msg_get(msg); + lwork_queue(lwork); } +out_unlock_lreq: + mutex_unlock(&lreq->lock); +out_unlock_osdc: + up_read(&osdc->lock); return; bad: @@ -2659,8 +3121,6 @@ void ceph_osdc_cancel_request(struct ceph_osd_request *req) struct ceph_osd_client *osdc = req->r_osdc; down_write(&osdc->lock); - if (req->r_linger) - __unregister_linger_request(osdc, req); if (req->r_osd) cancel_request(req); up_write(&osdc->lock); @@ -2743,6 +3203,198 @@ again: } EXPORT_SYMBOL(ceph_osdc_sync); +static struct ceph_osd_request * +alloc_linger_request(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_request *req; + + req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO); + if (!req) + return NULL; + + ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); + ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); + + if (ceph_osdc_alloc_messages(req, GFP_NOIO)) { + ceph_osdc_put_request(req); + return NULL; + } + + return req; +} + +/* + * Returns a handle, caller owns a ref. 
+ */ +struct ceph_osd_linger_request * +ceph_osdc_watch(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + rados_watchcb2_t wcb, + rados_watcherrcb_t errcb, + void *data) +{ + struct ceph_osd_linger_request *lreq; + int ret; + + lreq = linger_alloc(osdc); + if (!lreq) + return ERR_PTR(-ENOMEM); + + lreq->wcb = wcb; + lreq->errcb = errcb; + lreq->data = data; + + ceph_oid_copy(&lreq->t.base_oid, oid); + ceph_oloc_copy(&lreq->t.base_oloc, oloc); + lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + lreq->mtime = CURRENT_TIME; + + lreq->reg_req = alloc_linger_request(lreq); + if (!lreq->reg_req) { + ret = -ENOMEM; + goto err_put_lreq; + } + + lreq->ping_req = alloc_linger_request(lreq); + if (!lreq->ping_req) { + ret = -ENOMEM; + goto err_put_lreq; + } + + down_write(&osdc->lock); + linger_register(lreq); /* before osd_req_op_* */ + osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id, + CEPH_OSD_WATCH_OP_WATCH); + osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id, + CEPH_OSD_WATCH_OP_PING); + linger_submit(lreq); + up_write(&osdc->lock); + + ret = linger_reg_commit_wait(lreq); + if (ret) { + linger_cancel(lreq); + goto err_put_lreq; + } + + return lreq; + +err_put_lreq: + linger_put(lreq); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ceph_osdc_watch); + +/* + * Releases a ref. + * + * Times out after mount_timeout to preserve rbd unmap behaviour + * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap + * with mount_timeout"). + */ +int ceph_osdc_unwatch(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq) +{ + struct ceph_options *opts = osdc->client->options; + struct ceph_osd_request *req; + int ret; + + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); + if (!req) + return -ENOMEM; + + ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); + ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); + req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + req->r_mtime = CURRENT_TIME; + osd_req_op_watch_init(req, 0, lreq->linger_id, + CEPH_OSD_WATCH_OP_UNWATCH); + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; + + ceph_osdc_start_request(osdc, req, false); + linger_cancel(lreq); + linger_put(lreq); + ret = wait_request_timeout(req, opts->mount_timeout); + +out_put_req: + ceph_osdc_put_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_unwatch); + +static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, + u64 notify_id, u64 cookie, void *payload, + size_t payload_len) +{ + struct ceph_osd_req_op *op; + struct ceph_pagelist *pl; + int ret; + + op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); + + pl = kmalloc(sizeof(*pl), GFP_NOIO); + if (!pl) + return -ENOMEM; + + ceph_pagelist_init(pl); + ret = ceph_pagelist_encode_64(pl, notify_id); + ret |= ceph_pagelist_encode_64(pl, cookie); + if (payload) { + ret |= ceph_pagelist_encode_32(pl, payload_len); + ret |= ceph_pagelist_append(pl, payload, payload_len); + } else { + ret |= ceph_pagelist_encode_32(pl, 0); + } + if (ret) { + ceph_pagelist_release(pl); + return -ENOMEM; + } + + ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl); + op->indata_len = pl->length; + return 0; +} + +int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + u64 notify_id, + u64 cookie, + void *payload, + size_t payload_len) +{ + struct ceph_osd_request *req; + int ret; + + req = ceph_osdc_alloc_request(osdc, NULL, 1, 
false, GFP_NOIO); + if (!req) + return -ENOMEM; + + ceph_oid_copy(&req->r_base_oid, oid); + ceph_oloc_copy(&req->r_base_oloc, oloc); + req->r_flags = CEPH_OSD_FLAG_READ; + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; + + ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload, + payload_len); + if (ret) + goto out_put_req; + + ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_wait_request(osdc, req); + +out_put_req: + ceph_osdc_put_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_notify_ack); + /* * Call all pending notify callbacks - for use after a watch is * unregistered, to make sure no more callbacks for it will be invoked @@ -2767,15 +3419,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->osds = RB_ROOT; INIT_LIST_HEAD(&osdc->osd_lru); spin_lock_init(&osdc->osd_lru_lock); - INIT_LIST_HEAD(&osdc->req_linger); osd_init(&osdc->homeless_osd); osdc->homeless_osd.o_osdc = osdc; osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; + osdc->linger_requests = RB_ROOT; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); - spin_lock_init(&osdc->event_lock); - osdc->event_tree = RB_ROOT; - osdc->event_count = 0; err = -ENOMEM; osdc->osdmap = ceph_osdmap_alloc(); @@ -2838,6 +3487,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) osd_cleanup(&osdc->homeless_osd); WARN_ON(!list_empty(&osdc->osd_lru)); + WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests)); WARN_ON(atomic_read(&osdc->num_requests)); WARN_ON(atomic_read(&osdc->num_homeless)); -- cgit v1.2.3 From 1907920324f1f3ebb6618344417c03a2863bba01 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: libceph: support for sending notifies Implement ceph_osdc_notify() for sending notifies. Due to the fact that the current messenger can't do read-in into pagelists (it can only do write-out from them), I had to go with a page vector for a NOTIFY_COMPLETE payload, for now. 
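For illustration only, here is a minimal caller sketch (the helper name is
made up; only ceph_osdc_notify(), ceph_release_page_vector() and
calc_pages_for() come from this patch and libceph) showing the intended
calling convention, in particular the reply page vector release the caller
is responsible for:

    /* hypothetical example, not part of this patch */
    static int example_notify(struct ceph_osd_client *osdc,
                              struct ceph_object_id *oid,
                              struct ceph_object_locator *oloc)
    {
            struct page **reply_pages = NULL;
            size_t reply_len = 0;
            char payload[] = "ping";
            int ret;

            /* timeout is in seconds; libceph allocates the reply pages */
            ret = ceph_osdc_notify(osdc, oid, oloc, payload,
                                   sizeof(payload), 10,
                                   &reply_pages, &reply_len);

            /* preply_{pages,len} are initialized on success and error */
            if (reply_pages)
                    ceph_release_page_vector(reply_pages,
                                             calc_pages_for(0, reply_len));
            return ret;
    }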
Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 20 ++++ include/linux/ceph/rados.h | 3 + net/ceph/debugfs.c | 5 +- net/ceph/osd_client.c | 232 ++++++++++++++++++++++++++++++++++++++-- 4 files changed, 249 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cd2dcb8939de..63054fae4f15 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -114,6 +114,11 @@ struct ceph_osd_req_op { struct { struct ceph_osd_data request_data; } notify_ack; + struct { + u64 cookie; + struct ceph_osd_data request_data; + struct ceph_osd_data response_data; + } notify; struct { u64 expected_object_size; u64 expected_write_size; @@ -202,6 +207,7 @@ struct ceph_osd_linger_request { struct ceph_osd_client *osdc; u64 linger_id; bool committed; + bool is_watch; /* watch or notify */ struct ceph_osd *osd; struct ceph_osd_request *reg_req; @@ -220,14 +226,20 @@ struct ceph_osd_linger_request { struct list_head scan_item; struct completion reg_commit_wait; + struct completion notify_finish_wait; int reg_commit_error; + int notify_finish_error; int last_error; u32 register_gen; + u64 notify_id; rados_watchcb2_t wcb; rados_watcherrcb_t errcb; void *data; + + struct page ***preply_pages; + size_t *preply_len; }; struct ceph_osd_client { @@ -397,5 +409,13 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, u64 cookie, void *payload, size_t payload_len); +int ceph_osdc_notify(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + void *payload, + size_t payload_len, + u32 timeout, + struct page ***preply_pages, + size_t *preply_len); #endif diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 204c8c944703..5c0da61cb763 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -476,6 +476,9 @@ struct ceph_osd_op { __u8 op; /* CEPH_OSD_WATCH_OP_* */ __le32 gen; /* registration generation */ } __attribute__ ((packed)) watch; + struct { + __le64 cookie; + } __attribute__ ((packed)) notify; struct { __le64 offset, length; __le64 src_offset; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index e64cb8583533..39f91c7250f6 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -206,8 +206,9 @@ static void dump_linger_request(struct seq_file *s, seq_printf(s, "%llu\t", lreq->linger_id); dump_target(s, &lreq->t); - seq_printf(s, "\t%u\t%s/%d\n", lreq->register_gen, - lreq->committed ? "C" : "", lreq->last_error); + seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen, + lreq->is_watch ? "W" : "N", lreq->committed ? 
"C" : "", + lreq->last_error); } static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ca0a7b58ba4f..e6e3ab4223db 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -334,6 +334,10 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_NOTIFY_ACK: ceph_osd_data_release(&op->notify_ack.request_data); break; + case CEPH_OSD_OP_NOTIFY: + ceph_osd_data_release(&op->notify.request_data); + ceph_osd_data_release(&op->notify.response_data); + break; default: break; } @@ -845,6 +849,9 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, break; case CEPH_OSD_OP_NOTIFY_ACK: break; + case CEPH_OSD_OP_NOTIFY: + dst->notify.cookie = cpu_to_le64(src->notify.cookie); + break; case CEPH_OSD_OP_SETALLOCHINT: dst->alloc_hint.expected_object_size = cpu_to_le64(src->alloc_hint.expected_object_size); @@ -1439,6 +1446,12 @@ static void setup_request_data(struct ceph_osd_request *req, ceph_osdc_msg_data_add(req->r_reply, &op->cls.response_data); break; + case CEPH_OSD_OP_NOTIFY: + ceph_osdc_msg_data_add(msg, + &op->notify.request_data); + ceph_osdc_msg_data_add(req->r_reply, + &op->notify.response_data); + break; } data_len += op->indata_len; @@ -1771,6 +1784,7 @@ linger_alloc(struct ceph_osd_client *osdc) RB_CLEAR_NODE(&lreq->osdc_node); INIT_LIST_HEAD(&lreq->scan_item); init_completion(&lreq->reg_commit_wait); + init_completion(&lreq->notify_finish_wait); lreq->osdc = osdc; target_init(&lreq->t); @@ -1934,6 +1948,7 @@ static void do_watch_notify(struct work_struct *w) goto out; } + WARN_ON(!lreq->is_watch); dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n", __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id, lwork->notify.payload_len); @@ -1997,6 +2012,24 @@ static void linger_commit_cb(struct ceph_osd_request *req) linger_reg_commit_complete(lreq, req->r_result); lreq->committed = true; + if (!lreq->is_watch) { + struct ceph_osd_data *osd_data = + osd_req_op_data(req, 0, notify, response_data); + void *p = page_address(osd_data->pages[0]); + + WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY || + osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + + /* make note of the notify_id */ + if (req->r_ops[0].outdata_len >= sizeof(u64)) { + lreq->notify_id = ceph_decode_64(&p); + dout("lreq %p notify_id %llu\n", lreq, + lreq->notify_id); + } else { + dout("lreq %p no notify_id\n", lreq); + } + } + mutex_unlock(&lreq->lock); linger_put(lreq); } @@ -2050,7 +2083,7 @@ static void send_linger(struct ceph_osd_linger_request *lreq) req->r_mtime = lreq->mtime; mutex_lock(&lreq->lock); - if (lreq->committed) { + if (lreq->is_watch && lreq->committed) { WARN_ON(op->op != CEPH_OSD_OP_WATCH || op->watch.cookie != lreq->linger_id); op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT; @@ -2059,7 +2092,10 @@ static void send_linger(struct ceph_osd_linger_request *lreq) op->watch.gen); req->r_callback = linger_reconnect_cb; } else { - WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH); + if (!lreq->is_watch) + lreq->notify_id = 0; + else + WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH); dout("lreq %p register\n", lreq); req->r_callback = linger_commit_cb; } @@ -2147,7 +2183,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) */ static void __linger_cancel(struct ceph_osd_linger_request *lreq) { - if (lreq->ping_req->r_osd) + if (lreq->is_watch && lreq->ping_req->r_osd) cancel_linger_request(lreq->ping_req); if (lreq->reg_req->r_osd) 
cancel_linger_request(lreq->reg_req); @@ -2174,6 +2210,15 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) return ret ?: lreq->reg_commit_error; } +static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq) +{ + int ret; + + dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); + ret = wait_for_completion_interruptible(&lreq->notify_finish_wait); + return ret ?: lreq->notify_finish_error; +} + /* * Timeout callback, called every N seconds. When 1 or more OSD * requests has been active for more than N seconds, we send a keepalive @@ -2220,7 +2265,7 @@ static void handle_timeout(struct work_struct *work) found = true; mutex_lock(&lreq->lock); - if (lreq->committed && !lreq->last_error) + if (lreq->is_watch && lreq->committed && !lreq->last_error) send_linger_ping(lreq); mutex_unlock(&lreq->lock); } @@ -3032,6 +3077,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, u8 proto_ver, opcode; u64 cookie, notify_id; u64 notifier_id = 0; + s32 return_code = 0; void *payload = NULL; u32 payload_len = 0; @@ -3049,7 +3095,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, } if (le16_to_cpu(msg->hdr.version) >= 2) - p += 4; /* skip return_code */ + ceph_decode_32_safe(&p, end, return_code, bad); if (le16_to_cpu(msg->hdr.version) >= 3) ceph_decode_64_safe(&p, end, notifier_id, bad); @@ -3063,13 +3109,38 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, } mutex_lock(&lreq->lock); - dout("%s opcode %d cookie %llu lreq %p\n", __func__, opcode, cookie, - lreq); + dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__, + opcode, cookie, lreq, lreq->is_watch); if (opcode == CEPH_WATCH_EVENT_DISCONNECT) { if (!lreq->last_error) { lreq->last_error = -ENOTCONN; queue_watch_error(lreq); } + } else if (!lreq->is_watch) { + /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */ + if (lreq->notify_id && lreq->notify_id != notify_id) { + dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq, + lreq->notify_id, notify_id); + } else if (!completion_done(&lreq->notify_finish_wait)) { + struct ceph_msg_data *data = + list_first_entry_or_null(&msg->data, + struct ceph_msg_data, + links); + + if (data) { + if (lreq->preply_pages) { + WARN_ON(data->type != + CEPH_MSG_DATA_PAGES); + *lreq->preply_pages = data->pages; + *lreq->preply_len = data->length; + } else { + ceph_release_page_vector(data->pages, + calc_pages_for(0, data->length)); + } + } + lreq->notify_finish_error = return_code; + complete_all(&lreq->notify_finish_wait); + } } else { /* CEPH_WATCH_EVENT_NOTIFY */ lwork = lwork_alloc(lreq, do_watch_notify); @@ -3241,6 +3312,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, if (!lreq) return ERR_PTR(-ENOMEM); + lreq->is_watch = true; lreq->wcb = wcb; lreq->errcb = errcb; lreq->data = data; @@ -3395,6 +3467,116 @@ out_put_req: } EXPORT_SYMBOL(ceph_osdc_notify_ack); +static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, + u64 cookie, u32 prot_ver, u32 timeout, + void *payload, size_t payload_len) +{ + struct ceph_osd_req_op *op; + struct ceph_pagelist *pl; + int ret; + + op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); + op->notify.cookie = cookie; + + pl = kmalloc(sizeof(*pl), GFP_NOIO); + if (!pl) + return -ENOMEM; + + ceph_pagelist_init(pl); + ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */ + ret |= ceph_pagelist_encode_32(pl, timeout); + ret |= ceph_pagelist_encode_32(pl, payload_len); + ret |= ceph_pagelist_append(pl, payload, payload_len); + if (ret) { + ceph_pagelist_release(pl); 
+ return -ENOMEM; + } + + ceph_osd_data_pagelist_init(&op->notify.request_data, pl); + op->indata_len = pl->length; + return 0; +} + +/* + * @timeout: in seconds + * + * @preply_{pages,len} are initialized both on success and error. + * The caller is responsible for: + * + * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)) + */ +int ceph_osdc_notify(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + void *payload, + size_t payload_len, + u32 timeout, + struct page ***preply_pages, + size_t *preply_len) +{ + struct ceph_osd_linger_request *lreq; + struct page **pages; + int ret; + + WARN_ON(!timeout); + if (preply_pages) { + *preply_pages = NULL; + *preply_len = 0; + } + + lreq = linger_alloc(osdc); + if (!lreq) + return -ENOMEM; + + lreq->preply_pages = preply_pages; + lreq->preply_len = preply_len; + + ceph_oid_copy(&lreq->t.base_oid, oid); + ceph_oloc_copy(&lreq->t.base_oloc, oloc); + lreq->t.flags = CEPH_OSD_FLAG_READ; + + lreq->reg_req = alloc_linger_request(lreq); + if (!lreq->reg_req) { + ret = -ENOMEM; + goto out_put_lreq; + } + + /* for notify_id */ + pages = ceph_alloc_page_vector(1, GFP_NOIO); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out_put_lreq; + } + + down_write(&osdc->lock); + linger_register(lreq); /* before osd_req_op_* */ + ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1, + timeout, payload, payload_len); + if (ret) { + linger_unregister(lreq); + up_write(&osdc->lock); + ceph_release_page_vector(pages, 1); + goto out_put_lreq; + } + ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify, + response_data), + pages, PAGE_SIZE, 0, false, true); + linger_submit(lreq); + up_write(&osdc->lock); + + ret = linger_reg_commit_wait(lreq); + if (!ret) + ret = linger_notify_finish_wait(lreq); + else + dout("lreq %p failed to initiate notify %d\n", lreq, ret); + + linger_cancel(lreq); +out_put_lreq: + linger_put(lreq); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_notify); + /* * Call all pending notify callbacks - for use after a watch is * unregistered, to make sure no more callbacks for it will be invoked @@ -3693,19 +3875,51 @@ out_unlock_osdc: return m; } +/* + * TODO: switch to a msg-owned pagelist + */ +static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) +{ + struct ceph_msg *m; + int type = le16_to_cpu(hdr->type); + u32 front_len = le32_to_cpu(hdr->front_len); + u32 data_len = le32_to_cpu(hdr->data_len); + + m = ceph_msg_new(type, front_len, GFP_NOIO, false); + if (!m) + return NULL; + + if (data_len) { + struct page **pages; + struct ceph_osd_data osd_data; + + pages = ceph_alloc_page_vector(calc_pages_for(0, data_len), + GFP_NOIO); + if (!pages) { + ceph_msg_put(m); + return NULL; + } + + ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false, + false); + ceph_osdc_msg_data_add(m, &osd_data); + } + + return m; +} + static struct ceph_msg *alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { struct ceph_osd *osd = con->private; int type = le16_to_cpu(hdr->type); - int front = le32_to_cpu(hdr->front_len); *skip = 0; switch (type) { case CEPH_MSG_OSD_MAP: case CEPH_MSG_WATCH_NOTIFY: - return ceph_msg_new(type, front, GFP_NOFS, false); + return alloc_msg_with_page_vector(hdr); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: -- cgit v1.2.3 From b07d3c4bd7270c74e2b6803af8ac8a00cb3e5ed2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: libceph: support for 
checking on status of watch Implement ceph_osdc_watch_check() to be able to check on status of watch. Note that the time it takes for a watch/notify event to get delivered through the notify_wq is taken into account. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 4 ++++ net/ceph/osd_client.c | 52 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 63054fae4f15..2ae7cfd82ec9 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -213,6 +213,8 @@ struct ceph_osd_linger_request { struct ceph_osd_request *reg_req; struct ceph_osd_request *ping_req; unsigned long ping_sent; + unsigned long watch_valid_thru; + struct list_head pending_lworks; struct ceph_osd_request_target t; u32 last_force_resend; @@ -417,5 +419,7 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, u32 timeout, struct page ***preply_pages, size_t *preply_len); +int ceph_osdc_watch_check(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq); #endif diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e6e3ab4223db..5ac6dce74f07 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1746,6 +1746,7 @@ static void linger_release(struct kref *kref) WARN_ON(!RB_EMPTY_NODE(&lreq->node)); WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node)); WARN_ON(!list_empty(&lreq->scan_item)); + WARN_ON(!list_empty(&lreq->pending_lworks)); WARN_ON(lreq->osd); if (lreq->reg_req) @@ -1783,6 +1784,7 @@ linger_alloc(struct ceph_osd_client *osdc) RB_CLEAR_NODE(&lreq->node); RB_CLEAR_NODE(&lreq->osdc_node); INIT_LIST_HEAD(&lreq->scan_item); + INIT_LIST_HEAD(&lreq->pending_lworks); init_completion(&lreq->reg_commit_wait); init_completion(&lreq->notify_finish_wait); @@ -1890,6 +1892,8 @@ static void cancel_linger_request(struct ceph_osd_request *req) struct linger_work { struct work_struct work; struct ceph_osd_linger_request *lreq; + struct list_head pending_item; + unsigned long queued_stamp; union { struct { @@ -1916,6 +1920,7 @@ static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq, return NULL; INIT_WORK(&lwork->work, workfn); + INIT_LIST_HEAD(&lwork->pending_item); lwork->lreq = linger_get(lreq); return lwork; @@ -1925,6 +1930,10 @@ static void lwork_free(struct linger_work *lwork) { struct ceph_osd_linger_request *lreq = lwork->lreq; + mutex_lock(&lreq->lock); + list_del(&lwork->pending_item); + mutex_unlock(&lreq->lock); + linger_put(lreq); kfree(lwork); } @@ -1935,6 +1944,10 @@ static void lwork_queue(struct linger_work *lwork) struct ceph_osd_client *osdc = lreq->osdc; verify_lreq_locked(lreq); + WARN_ON(!list_empty(&lwork->pending_item)); + + lwork->queued_stamp = jiffies; + list_add_tail(&lwork->pending_item, &lreq->pending_lworks); queue_work(osdc->notify_wq, &lwork->work); } @@ -2116,7 +2129,9 @@ static void linger_ping_cb(struct ceph_osd_request *req) __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent, lreq->last_error); if (lreq->register_gen == req->r_ops[0].watch.gen) { - if (req->r_result && !lreq->last_error) { + if (!req->r_result) { + lreq->watch_valid_thru = lreq->ping_sent; + } else if (!lreq->last_error) { lreq->last_error = normalize_watch_error(req->r_result); queue_watch_error(lreq); } @@ -3316,6 +3331,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, lreq->wcb = wcb; lreq->errcb = errcb; lreq->data = data; + lreq->watch_valid_thru = jiffies; 
ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); @@ -3577,6 +3593,40 @@ out_put_lreq: } EXPORT_SYMBOL(ceph_osdc_notify); +/* + * Return the number of milliseconds since the watch was last + * confirmed, or an error. If there is an error, the watch is no + * longer valid, and should be destroyed with ceph_osdc_unwatch(). + */ +int ceph_osdc_watch_check(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq) +{ + unsigned long stamp, age; + int ret; + + down_read(&osdc->lock); + mutex_lock(&lreq->lock); + stamp = lreq->watch_valid_thru; + if (!list_empty(&lreq->pending_lworks)) { + struct linger_work *lwork = + list_first_entry(&lreq->pending_lworks, + struct linger_work, + pending_item); + + if (time_before(lwork->queued_stamp, stamp)) + stamp = lwork->queued_stamp; + } + age = jiffies - stamp; + dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__, + lreq, lreq->linger_id, age, lreq->last_error); + /* we are truncating to msecs, so return a safe upper bound */ + ret = lreq->last_error ?: 1 + jiffies_to_msecs(age); + + mutex_unlock(&lreq->lock); + up_read(&osdc->lock); + return ret; +} + /* * Call all pending notify callbacks - for use after a watch is * unregistered, to make sure no more callbacks for it will be invoked -- cgit v1.2.3 From d0b19705e99939f5ae5aa9b57bfe41dd4777d951 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: libceph: async MON client generic requests For map check, we are going to need to send CEPH_MSG_MON_GET_VERSION messages asynchronously and get a callback on completion. Refactor MON client to allow firing off generic requests asynchronously and add an async variant of ceph_monc_get_version(). ceph_monc_do_statfs() is switched over and remains sync. 
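As an aside, a minimal sketch of the new async path (the callback and
wrapper names here are hypothetical; the API is the one added by this
patch). The callback runs when the MMonGetVersion reply arrives, with the
result and the caller's tag stashed in the generic request:

    /* hypothetical example, not part of this patch */
    static void osdmap_version_cb(struct ceph_mon_generic_request *greq)
    {
            if (greq->result)
                    pr_err("osdmap version check failed: %d\n",
                           greq->result);
            else
                    pr_info("tag %llu: newest osdmap epoch %llu\n",
                            greq->private_data, greq->u.newest);
    }

    static int example_check_osdmap(struct ceph_mon_client *monc)
    {
            /* returns once the request is fired off; no waiting */
            return ceph_monc_get_version_async(monc, "osdmap",
                                               osdmap_version_cb, 0);
    }

The sync variant keeps the old behaviour under the new
ceph_monc_get_version() name.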
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 4 +- include/linux/ceph/mon_client.h | 19 ++- net/ceph/mon_client.c | 316 ++++++++++++++++++++++++++-------------- 3 files changed, 228 insertions(+), 111 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d0834c477f96..8eae6f56194d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4896,8 +4896,8 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) again: ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); if (ret == -ENOENT && tries++ < 1) { - ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", - &newest_epoch); + ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", + &newest_epoch); if (ret < 0) return ret; diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index c14e9d861cda..19800d9b45f3 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -39,20 +39,31 @@ struct ceph_mon_request { ceph_monc_request_func_t do_request; }; +typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *); + /* * ceph_mon_generic_request is being used for the statfs and * mon_get_version requests which are being done a bit differently * because we need to get data back to the caller */ struct ceph_mon_generic_request { + struct ceph_mon_client *monc; struct kref kref; u64 tid; struct rb_node node; int result; - void *buf; + struct completion completion; + ceph_monc_callback_t complete_cb; + u64 private_data; /* r_tid/linger_id */ + struct ceph_msg *request; /* original request */ struct ceph_msg *reply; /* and reply */ + + union { + struct ceph_statfs *st; + u64 newest; + } u; }; struct ceph_mon_client { @@ -124,8 +135,10 @@ extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf); -extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, - const char *what, u64 *newest); +int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, + u64 *newest); +int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, + ceph_monc_callback_t cb, u64 private_data); extern int ceph_monc_open_session(struct ceph_mon_client *monc); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 98bfbe1f6807..4e49b2296920 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -493,6 +493,10 @@ static void release_generic_request(struct kref *kref) struct ceph_mon_generic_request *req = container_of(kref, struct ceph_mon_generic_request, kref); + dout("%s greq %p request %p reply %p\n", __func__, req, req->request, + req->reply); + WARN_ON(!RB_EMPTY_NODE(&req->node)); + if (req->reply) ceph_msg_put(req->reply); if (req->request) @@ -503,7 +507,8 @@ static void release_generic_request(struct kref *kref) static void put_generic_request(struct ceph_mon_generic_request *req) { - kref_put(&req->kref, release_generic_request); + if (req) + kref_put(&req->kref, release_generic_request); } static void get_generic_request(struct ceph_mon_generic_request *req) @@ -511,6 +516,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req) kref_get(&req->kref); } +static struct ceph_mon_generic_request * +alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp) +{ + struct ceph_mon_generic_request *req; + + req = kzalloc(sizeof(*req), gfp); + if (!req) + return NULL; + + req->monc = monc; + kref_init(&req->kref); + 
RB_CLEAR_NODE(&req->node); + init_completion(&req->completion); + + dout("%s greq %p\n", __func__, req); + return req; +} + +static void register_generic_request(struct ceph_mon_generic_request *req) +{ + struct ceph_mon_client *monc = req->monc; + + WARN_ON(req->tid); + + get_generic_request(req); + req->tid = ++monc->last_tid; + insert_generic_request(&monc->generic_request_tree, req); +} + +static void send_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *req) +{ + WARN_ON(!req->tid); + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + req->request->hdr.tid = cpu_to_le64(req->tid); + ceph_con_send(&monc->con, ceph_msg_get(req->request)); +} + +static void __finish_generic_request(struct ceph_mon_generic_request *req) +{ + struct ceph_mon_client *monc = req->monc; + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + erase_generic_request(&monc->generic_request_tree, req); + + ceph_msg_revoke(req->request); + ceph_msg_revoke_incoming(req->reply); +} + +static void finish_generic_request(struct ceph_mon_generic_request *req) +{ + __finish_generic_request(req); + put_generic_request(req); +} + +static void complete_generic_request(struct ceph_mon_generic_request *req) +{ + if (req->complete_cb) + req->complete_cb(req); + else + complete_all(&req->completion); + put_generic_request(req); +} + +void cancel_generic_request(struct ceph_mon_generic_request *req) +{ + struct ceph_mon_client *monc = req->monc; + struct ceph_mon_generic_request *lookup_req; + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + + mutex_lock(&monc->mutex); + lookup_req = lookup_generic_request(&monc->generic_request_tree, + req->tid); + if (lookup_req) { + WARN_ON(lookup_req != req); + finish_generic_request(req); + } + + mutex_unlock(&monc->mutex); +} + +static int wait_generic_request(struct ceph_mon_generic_request *req) +{ + int ret; + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + ret = wait_for_completion_interruptible(&req->completion); + if (ret) + cancel_generic_request(req); + else + ret = req->result; /* completed */ + + return ret; +} + static struct ceph_msg *get_generic_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) @@ -540,40 +642,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, return m; } -static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, - struct ceph_mon_generic_request *req) -{ - int err; - - /* register request */ - req->tid = tid != 0 ? 
tid : ++monc->last_tid; - req->request->hdr.tid = cpu_to_le64(req->tid); - insert_generic_request(&monc->generic_request_tree, req); - ceph_con_send(&monc->con, ceph_msg_get(req->request)); - mutex_unlock(&monc->mutex); - - err = wait_for_completion_interruptible(&req->completion); - - mutex_lock(&monc->mutex); - erase_generic_request(&monc->generic_request_tree, req); - - if (!err) - err = req->result; - return err; -} - -static int do_generic_request(struct ceph_mon_client *monc, - struct ceph_mon_generic_request *req) -{ - int err; - - mutex_lock(&monc->mutex); - err = __do_generic_request(monc, 0, req); - mutex_unlock(&monc->mutex); - - return err; -} - /* * statfs */ @@ -584,22 +652,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, struct ceph_mon_statfs_reply *reply = msg->front.iov_base; u64 tid = le64_to_cpu(msg->hdr.tid); + dout("%s msg %p tid %llu\n", __func__, msg, tid); + if (msg->front.iov_len != sizeof(*reply)) goto bad; - dout("handle_statfs_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); req = lookup_generic_request(&monc->generic_request_tree, tid); - if (req) { - *(struct ceph_statfs *)req->buf = reply->st; - req->result = 0; - get_generic_request(req); + if (!req) { + mutex_unlock(&monc->mutex); + return; } + + req->result = 0; + *req->u.st = reply->st; /* struct */ + __finish_generic_request(req); mutex_unlock(&monc->mutex); - if (req) { - complete_all(&req->completion); - put_generic_request(req); - } + + complete_generic_request(req); return; bad: @@ -614,39 +684,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) { struct ceph_mon_generic_request *req; struct ceph_mon_statfs *h; - int err; + int ret = -ENOMEM; - req = kzalloc(sizeof(*req), GFP_NOFS); + req = alloc_generic_request(monc, GFP_NOFS); if (!req) - return -ENOMEM; - - kref_init(&req->kref); - RB_CLEAR_NODE(&req->node); - req->buf = buf; - init_completion(&req->completion); + goto out; - err = -ENOMEM; req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, true); if (!req->request) goto out; - req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, - true); + + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true); if (!req->reply) goto out; + req->u.st = buf; + + mutex_lock(&monc->mutex); + register_generic_request(req); /* fill out request */ h = req->request->front.iov_base; h->monhdr.have_version = 0; h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; + send_generic_request(monc, req); + mutex_unlock(&monc->mutex); - err = do_generic_request(monc, req); - + ret = wait_generic_request(req); out: put_generic_request(req); - return err; + return ret; } EXPORT_SYMBOL(ceph_monc_do_statfs); @@ -659,7 +728,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, void *end = p + msg->front_alloc_len; u64 handle; - dout("%s %p tid %llu\n", __func__, msg, tid); + dout("%s msg %p tid %llu\n", __func__, msg, tid); ceph_decode_need(&p, end, 2*sizeof(u64), bad); handle = ceph_decode_64(&p); @@ -668,77 +737,110 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); req = lookup_generic_request(&monc->generic_request_tree, handle); - if (req) { - *(u64 *)req->buf = ceph_decode_64(&p); - req->result = 0; - get_generic_request(req); + if (!req) { + mutex_unlock(&monc->mutex); + return; } + + req->result = 0; + req->u.newest = ceph_decode_64(&p); + __finish_generic_request(req); mutex_unlock(&monc->mutex); - if (req) { - 
complete_all(&req->completion); - put_generic_request(req); - } + complete_generic_request(req); return; + bad: pr_err("corrupt mon_get_version reply, tid %llu\n", tid); ceph_msg_dump(msg); } -/* - * Send MMonGetVersion and wait for the reply. - * - * @what: one of "mdsmap", "osdmap" or "monmap" - */ -int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, - u64 *newest) +static struct ceph_mon_generic_request * +__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, + ceph_monc_callback_t cb, u64 private_data) { struct ceph_mon_generic_request *req; - void *p, *end; - u64 tid; - int err; - req = kzalloc(sizeof(*req), GFP_NOFS); + req = alloc_generic_request(monc, GFP_NOIO); if (!req) - return -ENOMEM; - - kref_init(&req->kref); - RB_CLEAR_NODE(&req->node); - req->buf = newest; - init_completion(&req->completion); + goto err_put_req; req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, sizeof(u64) + sizeof(u32) + strlen(what), - GFP_NOFS, true); - if (!req->request) { - err = -ENOMEM; - goto out; - } + GFP_NOIO, true); + if (!req->request) + goto err_put_req; - req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, - GFP_NOFS, true); - if (!req->reply) { - err = -ENOMEM; - goto out; - } + req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO, + true); + if (!req->reply) + goto err_put_req; - p = req->request->front.iov_base; - end = p + req->request->front_alloc_len; + req->complete_cb = cb; + req->private_data = private_data; - /* fill out request */ mutex_lock(&monc->mutex); - tid = ++monc->last_tid; - ceph_encode_64(&p, tid); /* handle */ - ceph_encode_string(&p, end, what, strlen(what)); + register_generic_request(req); + { + void *p = req->request->front.iov_base; + void *const end = p + req->request->front_alloc_len; + + ceph_encode_64(&p, req->tid); /* handle */ + ceph_encode_string(&p, end, what, strlen(what)); + WARN_ON(p != end); + } + send_generic_request(monc, req); + mutex_unlock(&monc->mutex); - err = __do_generic_request(monc, tid, req); + return req; - mutex_unlock(&monc->mutex); -out: +err_put_req: put_generic_request(req); - return err; + return ERR_PTR(-ENOMEM); +} + +/* + * Send MMonGetVersion and wait for the reply. + * + * @what: one of "mdsmap", "osdmap" or "monmap" + */ +int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, + u64 *newest) +{ + struct ceph_mon_generic_request *req; + int ret; + + req = __ceph_monc_get_version(monc, what, NULL, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = wait_generic_request(req); + if (!ret) + *newest = req->u.newest; + + put_generic_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_monc_get_version); + +/* + * Send MMonGetVersion, + * + * @what: one of "mdsmap", "osdmap" or "monmap" + */ +int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, + ceph_monc_callback_t cb, u64 private_data) +{ + struct ceph_mon_generic_request *req; + + req = __ceph_monc_get_version(monc, what, cb, private_data); + if (IS_ERR(req)) + return PTR_ERR(req); + + put_generic_request(req); + return 0; } -EXPORT_SYMBOL(ceph_monc_do_get_version); +EXPORT_SYMBOL(ceph_monc_get_version_async); /* * Resend pending generic requests. 
@@ -923,6 +1025,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ceph_auth_destroy(monc->auth); + WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree)); + ceph_msg_put(monc->m_auth); ceph_msg_put(monc->m_auth_reply); ceph_msg_put(monc->m_subscribe); -- cgit v1.2.3 From 4609245e2670e3698b083bcd9cc69a65b2b6f9a6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: libceph: pool deletion detection This adds the "map check" infrastructure for sending osdmap version checks on CALC_TARGET_POOL_DNE and completing in-flight requests with -ENOENT if the target pool doesn't exist or has just been deleted. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 6 + net/ceph/osd_client.c | 248 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 248 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2ae7cfd82ec9..3e7bf72e4078 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -151,6 +151,7 @@ struct ceph_osd_request_target { struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; + struct rb_node r_mc_node; /* map check */ struct ceph_osd *r_osd; struct ceph_osd_request_target r_t; @@ -191,6 +192,7 @@ struct ceph_osd_request { int r_attempts; struct ceph_eversion r_replay_version; /* aka reassert_version */ u32 r_last_force_resend; + u32 r_map_dne_bound; struct ceph_osd_req_op r_ops[]; }; @@ -218,6 +220,7 @@ struct ceph_osd_linger_request { struct ceph_osd_request_target t; u32 last_force_resend; + u32 map_dne_bound; struct timespec mtime; @@ -225,6 +228,7 @@ struct ceph_osd_linger_request { struct mutex lock; struct rb_node node; /* osd */ struct rb_node osdc_node; /* osdc */ + struct rb_node mc_node; /* map check */ struct list_head scan_item; struct completion reg_commit_wait; @@ -257,6 +261,8 @@ struct ceph_osd_client { atomic64_t last_tid; /* tid of last request */ u64 last_linger_id; struct rb_root linger_requests; /* lingering requests */ + struct rb_root map_checks; + struct rb_root linger_map_checks; atomic_t num_requests; atomic_t num_homeless; struct delayed_work timeout_work; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5ac6dce74f07..ece2d10a1208 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -396,6 +396,7 @@ static void target_destroy(struct ceph_osd_request_target *t) static void request_release_checks(struct ceph_osd_request *req) { WARN_ON(!RB_EMPTY_NODE(&req->r_node)); + WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node)); WARN_ON(!list_empty(&req->r_unsafe_item)); WARN_ON(req->r_osd); } @@ -456,6 +457,7 @@ static void request_init(struct ceph_osd_request *req) init_completion(&req->r_completion); init_completion(&req->r_safe_completion); RB_CLEAR_NODE(&req->r_node); + RB_CLEAR_NODE(&req->r_mc_node); INIT_LIST_HEAD(&req->r_unsafe_item); target_init(&req->r_t); @@ -969,6 +971,7 @@ EXPORT_SYMBOL(ceph_osdc_new_request); * We keep osd requests in an rbtree, sorted by ->r_tid. 
*/ DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) +DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node) static bool osd_homeless(struct ceph_osd *osd) { @@ -1601,10 +1604,13 @@ static void maybe_request_map(struct ceph_osd_client *osdc) ceph_monc_renew_subs(&osdc->client->monc); } +static void send_map_check(struct ceph_osd_request *req); + static void __submit_request(struct ceph_osd_request *req, bool wrlocked) { struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osd *osd; + enum calc_target_result ct_res; bool need_send = false; bool promoted = false; @@ -1612,7 +1618,10 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: - calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); + ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); + if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) + goto promote; + osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked); if (IS_ERR(osd)) { WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked); @@ -1656,6 +1665,9 @@ again: send_request(req); mutex_unlock(&osd->lock); + if (ct_res == CALC_TARGET_POOL_DNE) + send_map_check(req); + if (promoted) downgrade_write(&osdc->lock); return; @@ -1699,6 +1711,7 @@ static void __finish_request(struct ceph_osd_request *req) verify_osd_locked(osd); dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); unlink_request(osd, req); atomic_dec(&osdc->num_requests); @@ -1726,13 +1739,127 @@ static void __complete_request(struct ceph_osd_request *req) complete_all(&req->r_completion); } +/* + * Note that this is open-coded in handle_reply(), which has to deal + * with ack vs commit, dup acks, etc. + */ +static void complete_request(struct ceph_osd_request *req, int err) +{ + dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err); + + req->r_result = err; + __finish_request(req); + __complete_request(req); + complete_all(&req->r_safe_completion); + ceph_osdc_put_request(req); +} + +static void cancel_map_check(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd_request *lookup_req; + + verify_osdc_wrlocked(osdc); + + lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid); + if (!lookup_req) + return; + + WARN_ON(lookup_req != req); + erase_request_mc(&osdc->map_checks, req); + ceph_osdc_put_request(req); +} + static void cancel_request(struct ceph_osd_request *req) { dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + cancel_map_check(req); finish_request(req); } +static void check_pool_dne(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osdmap *map = osdc->osdmap; + + verify_osdc_wrlocked(osdc); + WARN_ON(!map->epoch); + + if (req->r_attempts) { + /* + * We sent a request earlier, which means that + * previously the pool existed, and now it does not + * (i.e., it was deleted). 
+ */ + req->r_map_dne_bound = map->epoch; + dout("%s req %p tid %llu pool disappeared\n", __func__, req, + req->r_tid); + } else { + dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__, + req, req->r_tid, req->r_map_dne_bound, map->epoch); + } + + if (req->r_map_dne_bound) { + if (map->epoch >= req->r_map_dne_bound) { + /* we had a new enough map */ + pr_info_ratelimited("tid %llu pool does not exist\n", + req->r_tid); + complete_request(req, -ENOENT); + } + } else { + send_map_check(req); + } +} + +static void map_check_cb(struct ceph_mon_generic_request *greq) +{ + struct ceph_osd_client *osdc = &greq->monc->client->osdc; + struct ceph_osd_request *req; + u64 tid = greq->private_data; + + WARN_ON(greq->result || !greq->u.newest); + + down_write(&osdc->lock); + req = lookup_request_mc(&osdc->map_checks, tid); + if (!req) { + dout("%s tid %llu dne\n", __func__, tid); + goto out_unlock; + } + + dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__, + req, req->r_tid, req->r_map_dne_bound, greq->u.newest); + if (!req->r_map_dne_bound) + req->r_map_dne_bound = greq->u.newest; + erase_request_mc(&osdc->map_checks, req); + check_pool_dne(req); + + ceph_osdc_put_request(req); +out_unlock: + up_write(&osdc->lock); +} + +static void send_map_check(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd_request *lookup_req; + int ret; + + verify_osdc_wrlocked(osdc); + + lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid); + if (lookup_req) { + WARN_ON(lookup_req != req); + return; + } + + ceph_osdc_get_request(req); + insert_request_mc(&osdc->map_checks, req); + ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", + map_check_cb, req->r_tid); + WARN_ON(ret); +} + /* * lingering requests, watch/notify v2 infrastructure */ @@ -1745,6 +1872,7 @@ static void linger_release(struct kref *kref) lreq->reg_req, lreq->ping_req); WARN_ON(!RB_EMPTY_NODE(&lreq->node)); WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node)); + WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node)); WARN_ON(!list_empty(&lreq->scan_item)); WARN_ON(!list_empty(&lreq->pending_lworks)); WARN_ON(lreq->osd); @@ -1783,6 +1911,7 @@ linger_alloc(struct ceph_osd_client *osdc) mutex_init(&lreq->lock); RB_CLEAR_NODE(&lreq->node); RB_CLEAR_NODE(&lreq->osdc_node); + RB_CLEAR_NODE(&lreq->mc_node); INIT_LIST_HEAD(&lreq->scan_item); INIT_LIST_HEAD(&lreq->pending_lworks); init_completion(&lreq->reg_commit_wait); @@ -1797,6 +1926,7 @@ linger_alloc(struct ceph_osd_client *osdc) DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node) DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node) +DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node) /* * Create linger request <-> OSD session relation. @@ -2193,6 +2323,23 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) send_linger(lreq); } +static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_linger_request *lookup_lreq; + + verify_osdc_wrlocked(osdc); + + lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, + lreq->linger_id); + if (!lookup_lreq) + return; + + WARN_ON(lookup_lreq != lreq); + erase_linger_mc(&osdc->linger_map_checks, lreq); + linger_put(lreq); +} + /* * @lreq has to be both registered and linked. 
*/ @@ -2202,6 +2349,7 @@ static void __linger_cancel(struct ceph_osd_linger_request *lreq) cancel_linger_request(lreq->ping_req); if (lreq->reg_req->r_osd) cancel_linger_request(lreq->reg_req); + cancel_linger_map_check(lreq); unlink_linger(lreq->osd, lreq); linger_unregister(lreq); } @@ -2216,6 +2364,89 @@ static void linger_cancel(struct ceph_osd_linger_request *lreq) up_write(&osdc->lock); } +static void send_linger_map_check(struct ceph_osd_linger_request *lreq); + +static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osdmap *map = osdc->osdmap; + + verify_osdc_wrlocked(osdc); + WARN_ON(!map->epoch); + + if (lreq->register_gen) { + lreq->map_dne_bound = map->epoch; + dout("%s lreq %p linger_id %llu pool disappeared\n", __func__, + lreq, lreq->linger_id); + } else { + dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n", + __func__, lreq, lreq->linger_id, lreq->map_dne_bound, + map->epoch); + } + + if (lreq->map_dne_bound) { + if (map->epoch >= lreq->map_dne_bound) { + /* we had a new enough map */ + pr_info("linger_id %llu pool does not exist\n", + lreq->linger_id); + linger_reg_commit_complete(lreq, -ENOENT); + __linger_cancel(lreq); + } + } else { + send_linger_map_check(lreq); + } +} + +static void linger_map_check_cb(struct ceph_mon_generic_request *greq) +{ + struct ceph_osd_client *osdc = &greq->monc->client->osdc; + struct ceph_osd_linger_request *lreq; + u64 linger_id = greq->private_data; + + WARN_ON(greq->result || !greq->u.newest); + + down_write(&osdc->lock); + lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id); + if (!lreq) { + dout("%s linger_id %llu dne\n", __func__, linger_id); + goto out_unlock; + } + + dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n", + __func__, lreq, lreq->linger_id, lreq->map_dne_bound, + greq->u.newest); + if (!lreq->map_dne_bound) + lreq->map_dne_bound = greq->u.newest; + erase_linger_mc(&osdc->linger_map_checks, lreq); + check_linger_pool_dne(lreq); + + linger_put(lreq); +out_unlock: + up_write(&osdc->lock); +} + +static void send_linger_map_check(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_linger_request *lookup_lreq; + int ret; + + verify_osdc_wrlocked(osdc); + + lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, + lreq->linger_id); + if (lookup_lreq) { + WARN_ON(lookup_lreq != lreq); + return; + } + + linger_get(lreq); + insert_linger_mc(&osdc->linger_map_checks, lreq); + ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", + linger_map_check_cb, lreq->linger_id); + WARN_ON(ret); +} + static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) { int ret; @@ -2677,10 +2908,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) return; fail_request: - req->r_result = -EIO; - __finish_request(req); - __complete_request(req); - complete_all(&req->r_safe_completion); + complete_request(req, -EIO); out_unlock_session: mutex_unlock(&osd->lock); out_unlock_osdc: @@ -2764,6 +2992,7 @@ static void scan_requests(struct ceph_osd *osd, /* fall through */ case CALC_TARGET_NEED_RESEND: + cancel_linger_map_check(lreq); /* * scan_requests() for the previous epoch(s) * may have already added it to the list, since @@ -2773,6 +3002,7 @@ static void scan_requests(struct ceph_osd *osd, list_add_tail(&lreq->scan_item, need_resend_linger); break; case CALC_TARGET_POOL_DNE: + check_linger_pool_dne(lreq); break; } } @@ -2782,7 +3012,7 @@ static void 
scan_requests(struct ceph_osd *osd, rb_entry(n, struct ceph_osd_request, r_node); enum calc_target_result ct_res; - n = rb_next(n); /* unlink_request() */ + n = rb_next(n); /* unlink_request(), check_pool_dne() */ dout("%s req %p tid %llu\n", __func__, req, req->r_tid); ct_res = calc_target(osdc, &req->r_t, @@ -2799,10 +3029,12 @@ static void scan_requests(struct ceph_osd *osd, /* fall through */ case CALC_TARGET_NEED_RESEND: + cancel_map_check(req); unlink_request(osd, req); insert_request(need_resend, req); break; case CALC_TARGET_POOL_DNE: + check_pool_dne(req); break; } } @@ -3655,6 +3887,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->homeless_osd.o_osdc = osdc; osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; osdc->linger_requests = RB_ROOT; + osdc->map_checks = RB_ROOT; + osdc->linger_map_checks = RB_ROOT; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); @@ -3720,6 +3954,8 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) WARN_ON(!list_empty(&osdc->osd_lru)); WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks)); + WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks)); WARN_ON(atomic_read(&osdc->num_requests)); WARN_ON(atomic_read(&osdc->num_homeless)); -- cgit v1.2.3 From 7cca78c9dcd1afa243e46edc31896730df85d2b5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:28 +0200 Subject: libceph: replace ceph_monc_request_next_osdmap() ... with a wrapper around maybe_request_map() - no need for two osdmap-specific functions. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- include/linux/ceph/mon_client.h | 1 - include/linux/ceph/osd_client.h | 1 + net/ceph/mon_client.c | 14 -------------- net/ceph/osd_client.c | 7 +++++++ 5 files changed, 9 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8eae6f56194d..81666a56415e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4902,7 +4902,7 @@ again: return ret; if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { - ceph_monc_request_next_osdmap(&rbdc->client->monc); + ceph_osdc_maybe_request_map(&rbdc->client->osdc); (void) ceph_monc_wait_osdmap(&rbdc->client->monc, newest_epoch, opts->mount_timeout); diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 19800d9b45f3..1d730993c3f8 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -128,7 +128,6 @@ bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); void ceph_monc_renew_subs(struct ceph_mon_client *monc); -extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, unsigned long timeout); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3e7bf72e4078..19b14862d3e0 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -381,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, extern void ceph_osdc_sync(struct ceph_osd_client *osdc); extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); +void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc); extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct ceph_vino vino, diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c 
index 4e49b2296920..72a910bf7819 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -384,20 +384,6 @@ void ceph_monc_renew_subs(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_renew_subs); -/* - * Register interest in the next osdmap - */ -void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) -{ - dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have); - mutex_lock(&monc->mutex); - if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, - monc->subs[CEPH_SUB_OSDMAP].have + 1, false)) - __send_subscribe(monc); - mutex_unlock(&monc->mutex); -} -EXPORT_SYMBOL(ceph_monc_request_next_osdmap); - /* * Wait for an osdmap with a given epoch. * diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ece2d10a1208..55cafd3a2ff0 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3869,6 +3869,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) } EXPORT_SYMBOL(ceph_osdc_flush_notifies); +void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc) +{ + down_read(&osdc->lock); + maybe_request_map(osdc); + up_read(&osdc->lock); +} +EXPORT_SYMBOL(ceph_osdc_maybe_request_map); /* * init, shutdown -- cgit v1.2.3 From 737cc81ead34bcef0b1f6ea8322228e4378cf21a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 May 2016 00:05:01 +0200 Subject: libceph: support for subscribing to "mdsmap.<id>" maps Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 2 ++ include/linux/ceph/mon_client.h | 1 + net/ceph/debugfs.c | 1 + net/ceph/mon_client.c | 18 +++++++++++++----- 4 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 3b911ff889dd..bae833d0d055 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -208,6 +208,8 @@ struct ceph_mon_subscribe_ack { struct ceph_fsid fsid; } __attribute__ ((packed)); +#define CEPH_FS_CLUSTER_ID_NONE -1 + /* * mdsmap flags */ diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 1d730993c3f8..e2a92df08b47 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -96,6 +96,7 @@ struct ceph_mon_client { bool want; u32 have; /* epoch */ } subs[3]; + int fs_cluster_id; /* "mdsmap.<id>" sub */ #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 6e434c75cd08..e77b04ca7802 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -128,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p) CEPH_SUBSCRIBE_ONETIME ?
"" : "+")); seq_putc(s, '\n'); } + seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id); for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { __u16 op; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 72a910bf7819..37c38a7fb5c5 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc) BUG_ON(num < 1); /* monmap sub is always there */ ceph_encode_32(&p, num); for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { - const char *s = ceph_sub_str[i]; + char buf[32]; + int len; if (!monc->subs[i].want) continue; - dout("%s %s start %llu flags 0x%x\n", __func__, s, + len = sprintf(buf, "%s", ceph_sub_str[i]); + if (i == CEPH_SUB_MDSMAP && + monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE) + len += sprintf(buf + len, ".%d", monc->fs_cluster_id); + + dout("%s %s start %llu flags 0x%x\n", __func__, buf, le64_to_cpu(monc->subs[i].item.start), monc->subs[i].item.flags); - ceph_encode_string(&p, end, s, strlen(s)); + ceph_encode_string(&p, end, buf, len); memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); p += sizeof(monc->subs[i].item); } - BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); + BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); ceph_msg_revoke(msg); @@ -948,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) if (!monc->m_subscribe_ack) goto out_auth; - monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, + monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS, true); if (!monc->m_subscribe) goto out_subscribe_ack; @@ -974,6 +980,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->generic_request_tree = RB_ROOT; monc->last_tid = 0; + monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE; + return 0; out_auth_reply: -- cgit v1.2.3 From 956d39d631dbcf7b57854873a24e309047f2a7f5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 27 Apr 2016 17:48:30 +0800 Subject: ceph: define 'end/complete' in readdir reply as bit flags Set a flag in readdir request, which indicates that client interprets 'end/complete' as bit flags. So that mds can reply additional flags in readdir reply. 
Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 2 ++ fs/ceph/mds_client.c | 7 +++++-- fs/ceph/mds_client.h | 2 +- include/linux/ceph/ceph_fs.h | 12 ++++++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 68530acea2c8..ebcbd1c946b4 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -365,6 +365,8 @@ more: req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); req->r_inode = inode; ihold(inode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6220d3caf7ab..1c2befcd24fb 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -181,8 +181,11 @@ static int parse_reply_info_dir(void **p, void *end, ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); - info->dir_end = ceph_decode_8(p); - info->dir_complete = ceph_decode_8(p); + { + u16 flags = ceph_decode_16(p); + info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); + info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + } if (num == 0) goto done; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 0b84f9c0afa3..2a865812a41b 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -80,7 +80,7 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - u8 dir_complete, dir_end; + bool dir_complete, dir_end; struct ceph_mds_reply_dir_entry *dir_entries; }; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index bae833d0d055..a811c5e98bfa 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -347,6 +347,17 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_XATTR_REPLACE (1 << 1) #define CEPH_XATTR_REMOVE (1 << 31) +/* + * readdir request flags; + */ +#define CEPH_READDIR_REPLY_BITFLAGS (1<<0) + +/* + * readdir reply flags. + */ +#define CEPH_READDIR_FRAG_END (1<<0) +#define CEPH_READDIR_FRAG_COMPLETE (1<<8) + union ceph_mds_request_args { struct { __le32 mask; /* CEPH_CAP_* */ @@ -364,6 +375,7 @@ union ceph_mds_request_args { __le32 frag; /* which dir fragment */ __le32 max_entries; /* how many dentries to grab */ __le32 max_bytes; + __le16 flags; } __attribute__ ((packed)) readdir; struct { __le32 mode; -- cgit v1.2.3 From f3c4ebe65ea149ec892f94474233cfebe9cbe299 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 29 Apr 2016 11:27:30 +0800 Subject: ceph: using hash value to compose dentry offset If MDS sorts dentries in dirfrag in hash order, we use hash value to compose dentry offset. dentry offset is: (0xff << 52) | ((24 bits hash) << 28) | (the nth entry with the same hash, for collisions) This offset is stable across directory fragmentation. This also means there is no need to reset the readdir offset if the directory gets fragmented in the middle of a readdir. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 140 ++++++++++++++++++++++++++++++++----------- fs/ceph/inode.c | 31 ++++++++-- fs/ceph/mds_client.c | 1 + fs/ceph/mds_client.h | 4 +- fs/ceph/super.h | 6 +- include/linux/ceph/ceph_fs.h | 1 + 6 files changed, 136 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e954ea2fb710..4850c3624a87 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -69,16 +69,42 @@ out_unlock: } /* - * for readdir, we encode the directory frag and offset within that - * frag into f_pos.
+ * for f_pos for readdir: + * - hash order: + * (0xff << 52) | ((24 bits hash) << 28) | + * (the nth entry has hash collision); + * - frag+name order; + * ((frag value) << 28) | (the nth entry in frag); */ +#define OFFSET_BITS 28 +#define OFFSET_MASK ((1 << OFFSET_BITS) - 1) +#define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) +loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) +{ + loff_t fpos = ((loff_t)high << 28) | (loff_t)off; + if (hash_order) + fpos |= HASH_ORDER; + return fpos; +} + +static bool is_hash_order(loff_t p) +{ + return (p & HASH_ORDER) == HASH_ORDER; +} + static unsigned fpos_frag(loff_t p) { - return p >> 32; + return p >> OFFSET_BITS; } + +static unsigned fpos_hash(loff_t p) +{ + return ceph_frag_value(fpos_frag(p)); +} + static unsigned fpos_off(loff_t p) { - return p & 0xffffffff; + return p & OFFSET_MASK; } static int fpos_cmp(loff_t l, loff_t r) @@ -177,7 +203,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, u64 idx = 0; int err = 0; - dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); + dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos); /* search start position */ if (ctx->pos > 2) { @@ -234,7 +260,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, spin_unlock(&dentry->d_lock); if (emit_dentry) { - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dout(" %llx dentry %p %pd %p\n", di->offset, dentry, dentry, d_inode(dentry)); ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, @@ -269,6 +295,16 @@ out: return err; } +static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) +{ + if (!fi->last_readdir) + return true; + if (is_hash_order(pos)) + return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); + else + return fi->frag != fpos_frag(pos); +} + static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -276,7 +312,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(ctx->pos); int i; int err; u32 ftype; @@ -317,7 +352,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; - frag = fpos_frag(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -325,8 +359,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ more: /* do we have the correct frag content buffered? */ - if (fi->frag != frag || fi->last_readdir == NULL) { + if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; + unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? 
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); fi->last_readdir = NULL; } + if (is_hash_order(ctx->pos)) { + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); + } else { + frag = fpos_frag(ctx->pos); + } + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -373,19 +415,23 @@ more: ceph_mdsc_put_request(req); return err; } - dout("readdir got and parsed readdir result=%d" - " on frag %x, end=%d, complete=%d\n", err, frag, + dout("readdir got and parsed readdir result=%d on " + "frag %x, end=%d, complete=%d, hash_order=%d\n", + err, frag, (int)req->r_reply_info.dir_end, - (int)req->r_reply_info.dir_complete); - + (int)req->r_reply_info.dir_complete, + (int)req->r_reply_info.hash_order); - /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - fi->next_offset = req->r_readdir_offset; - /* adjust ctx->pos to beginning of frag */ - ctx->pos = ceph_make_fpos(frag, fi->next_offset); + if (!rinfo->hash_order) { + fi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, + fi->next_offset, + false); + } } fi->frag = frag; @@ -411,23 +457,25 @@ more: fi->dir_release_count = 0; } - if (req->r_reply_info.dir_end) { - kfree(fi->last_name); - fi->last_name = NULL; - fi->next_offset = 2; - } else { + /* note next offset and last dentry name */ + if (rinfo->dir_nr > 0) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + (rinfo->dir_nr-1); + unsigned next_offset = req->r_reply_info.dir_end ? + 2 : (fpos_off(rde->offset) + 1); err = note_last_dentry(fi, rde->name, rde->name_len, - fpos_off(rde->offset) + 1); + next_offset); if (err) return err; + } else if (req->r_reply_info.dir_end) { + fi->next_offset = 2; + /* keep last name */ } } rinfo = &fi->last_readdir->r_reply_info; dout("readdir frag %x num %d pos %llx chunk first %llx\n", - frag, rinfo->dir_nr, ctx->pos, + fi->frag, rinfo->dir_nr, ctx->pos, rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); i = 0; @@ -470,16 +518,26 @@ more: ctx->pos++; } - if (fi->last_name) { + if (fi->next_offset > 2) { ceph_mdsc_put_request(fi->last_readdir); fi->last_readdir = NULL; goto more; } /* more frags? */ - if (!ceph_frag_is_rightmost(frag)) { - frag = ceph_frag_next(frag); - ctx->pos = ceph_make_fpos(frag, 2); + if (!ceph_frag_is_rightmost(fi->frag)) { + unsigned frag = ceph_frag_next(fi->frag); + if (is_hash_order(ctx->pos)) { + loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), + fi->next_offset, true); + if (new_pos > ctx->pos) + ctx->pos = new_pos; + /* keep last_name */ + } else { + ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); + kfree(fi->last_name); + fi->last_name = NULL; + } dout("readdir next frag is %x\n", frag); goto more; } @@ -532,14 +590,21 @@ static void reset_readdir(struct ceph_file_info *fi) static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) { struct ceph_mds_reply_info_parsed *rinfo; + loff_t chunk_offset; if (new_pos == 0) return true; - if (fpos_frag(new_pos) != fi->frag) + if (is_hash_order(new_pos)) { + /* no need to reset last_name for a forward seek when + * dentries are sorted in hash order */ + } else if (fi->frag != fpos_frag(new_pos)) { return true; + } rinfo = fi->last_readdir ?
&fi->last_readdir->r_reply_info : NULL; if (!rinfo || !rinfo->dir_nr) return true; - return new_pos < rinfo->dir_entries[0].offset;; + chunk_offset = rinfo->dir_entries[0].offset; + return new_pos < chunk_offset || + is_hash_order(new_pos) != is_hash_order(chunk_offset); } static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) @@ -562,17 +627,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } if (offset >= 0) { + if (need_reset_readdir(fi, offset)) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi); + } else if (is_hash_order(offset) && offset > file->f_pos) { + /* for hash offset, we don't know if a forward seek + * is within same frag */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; + } + if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; fi->flags &= ~CEPH_F_ATEND; } retval = offset; - - if (need_reset_readdir(fi, offset)) { - dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); - } } out: inode_unlock(inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b53c95903aeb..f51b6fd5f570 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1387,6 +1387,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct qstr dname; struct dentry *dn; @@ -1394,19 +1395,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, int err = 0, skipped = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - struct ceph_dentry_info *di; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + u32 last_hash = 0; + u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; if (req->r_aborted) return readdir_prepopulate_inodes_only(req, session); + if (rinfo->hash_order && req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } + if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { dout("readdir_prepopulate got new frag %x -> %x\n", frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); - req->r_readdir_offset = 2; + if (!rinfo->hash_order) + req->r_readdir_offset = 2; } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { @@ -1424,13 +1433,13 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); req->r_readdir_cache_idx = 0; } cache_ctl.index = req->r_readdir_cache_idx; + fpos_offset = req->r_readdir_offset; /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { @@ -1444,6 +1453,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); + if (rinfo->hash_order) { + u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + rde->name, rde->name_len); + hash = ceph_frag_value(hash); + if (hash != last_hash) + fpos_offset = 2; + last_hash = hash; + rde->offset = ceph_make_fpos(hash, fpos_offset++, 
true); + } else { + rde->offset = ceph_make_fpos(frag, fpos_offset++, false); + } + retry_lookup: dn = d_lookup(parent, &dname); dout("d_lookup on parent=%p name=%.*s got %p\n", @@ -1521,9 +1542,7 @@ retry_lookup: dn = realdn; } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); - rde->offset = di->offset; + ceph_dentry(dn)->offset = rde->offset; update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 48def22fc7b9..7ad31283d510 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -185,6 +185,7 @@ static int parse_reply_info_dir(void **p, void *end, u16 flags = ceph_decode_16(p); info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); } if (num == 0) goto done; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4ce19d852657..e7d38aac7109 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -81,7 +81,9 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - bool dir_complete, dir_end; + bool dir_complete; + bool dir_end; + bool hash_order; struct ceph_mds_reply_dir_entry *dir_entries; }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0628099ba1f2..c9b671dfff81 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -540,11 +540,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) return (struct ceph_dentry_info *)dentry->d_fsdata; } -static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) -{ - return ((loff_t)frag << 32) | (loff_t)off; -} - /* * caps helpers */ @@ -949,6 +944,7 @@ extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; +extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index a811c5e98bfa..dfce616002ad 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -357,6 +357,7 @@ extern const char *ceph_mds_op_name(int op); */ #define CEPH_READDIR_FRAG_END (1<<0) #define CEPH_READDIR_FRAG_COMPLETE (1<<8) +#define CEPH_READDIR_HASH_ORDER (1<<9) union ceph_mds_request_args { struct { -- cgit v1.2.3 From 3b33f692c84c28cc8178aaeeb9264d82b48787f1 Mon Sep 17 00:00:00 2001 From: Zhang Zhuoyu Date: Fri, 25 Mar 2016 05:18:39 -0400 Subject: ceph: make logical calculation functions return bool This patch makes several logical calculation functions return bool to improve readability, because these particular functions only use 0/1 as their return value. No functional change.
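The conversion is purely a change of return type; the function bodies are untouched. Consolidating one of the hunks below into before/after form shows the pattern:

	/* before: a predicate typed as int, even though it only ever yields 0 or 1 */
	static inline int ceph_frag_is_rightmost(__u32 f)
	{
		return ceph_frag_value(f) == ceph_frag_mask(f);
	}

	/* after: bool makes the 0/1 contract part of the signature */
	static inline bool ceph_frag_is_rightmost(__u32 f)
	{
		return ceph_frag_value(f) == ceph_frag_mask(f);
	}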
Signed-off-by: Zhang Zhuoyu --- fs/ceph/cache.c | 2 +- fs/ceph/dir.c | 2 +- include/linux/ceph/ceph_frag.h | 4 ++-- include/linux/ceph/decode.h | 2 +- include/linux/ceph/osdmap.h | 6 +++--- net/ceph/ceph_common.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a351480dbabc..c052b5bf219b 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int unlock_page(page); } -static inline int cache_valid(struct ceph_inode_info *ci) +static inline bool cache_valid(struct ceph_inode_info *ci) { return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && (ci->i_fscache_gen == ci->i_rdcache_gen)); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4850c3624a87..f6279a1bd6ec 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -710,7 +710,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, return dentry; } -static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) { return ceph_ino(inode) == CEPH_INO_ROOT && strncmp(dentry->d_name.name, ".ceph", 5) == 0; diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index b827e066e55a..146507df8650 100644 --- a/include/linux/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h @@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) return ceph_frag_make(newbits, ceph_frag_value(f) | (i << (24 - newbits))); } -static inline int ceph_frag_is_leftmost(__u32 f) +static inline bool ceph_frag_is_leftmost(__u32 f) { return ceph_frag_value(f) == 0; } -static inline int ceph_frag_is_rightmost(__u32 f) +static inline bool ceph_frag_is_rightmost(__u32 f) { return ceph_frag_value(f) == ceph_frag_mask(f); } diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index a6ef9cc267ec..19e9932f3e77 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n) /* * bounds check input. 
*/ -static inline int ceph_has_room(void **p, void *end, size_t n) +static inline bool ceph_has_room(void **p, void *end, size_t n) { return end >= *p && n <= end - *p; } diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 821e16fff39a..ddc426b22d81 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -172,19 +172,19 @@ struct ceph_osdmap { int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; }; -static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) { return osd >= 0 && osd < map->max_osd && (map->osd_state[osd] & CEPH_OSD_EXISTS); } -static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd) { return ceph_osd_exists(map, osd) && (map->osd_state[osd] & CEPH_OSD_UP); } -static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd) { return !ceph_osd_is_up(map, osd); } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index dcc18c6f7cf9..55d2bfee16d7 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client); /* * true if we have the mon map (and have thus joined the cluster) */ -static int have_mon_and_osd_map(struct ceph_client *client) +static bool have_mon_and_osd_map(struct ceph_client *client) { return client->monc.monmap && client->monc.monmap->epoch && client->osdc.osdmap && client->osdc.osdmap->epoch; -- cgit v1.2.3 From 887bddfa90c79957d61067cd54a10087be0c8b23 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 26 May 2016 00:04:58 -0400 Subject: add down_write_killable_nested() Signed-off-by: Al Viro --- include/linux/rwsem.h | 2 ++ kernel/locking/rwsem.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index d1c12d160ace..d37fbb34d06f 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -156,6 +156,7 @@ extern void downgrade_write(struct rw_semaphore *sem); */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass); extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); # define down_write_nest_lock(sem, nest_lock) \ @@ -176,6 +177,7 @@ extern void up_read_non_owner(struct rw_semaphore *sem); # define down_read_nested(sem, subclass) down_read(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) +# define down_write_killable_nested(sem, subclass) down_write_killable(sem) # define down_read_non_owner(sem) down_read(sem) # define up_read_non_owner(sem) up_read(sem) #endif diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c817216c1615..2e853ad93a3a 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -173,6 +173,22 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); +int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + 
rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable_nested); + void up_read_non_owner(struct rw_semaphore *sem) { __up_read(sem); -- cgit v1.2.3 From 868b2072f09c8a698df8066ca72d30411dcc57d6 Mon Sep 17 00:00:00 2001 From: Moritz Fischer Date: Mon, 23 May 2016 11:44:39 -0700 Subject: misc: at24: Fix typo in at24 header file This commit fixes a simple typo s/mvmem/nvmem in the example. Signed-off-by: Moritz Fischer Signed-off-by: Wolfram Sang --- include/linux/platform_data/at24.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/at24.h b/include/linux/platform_data/at24.h index dc9a13e5acda..be830b141d83 100644 --- a/include/linux/platform_data/at24.h +++ b/include/linux/platform_data/at24.h @@ -26,7 +26,7 @@ * * An example in pseudo code for a setup() callback: * - * void get_mac_addr(struct mvmem_device *nvmem, void *context) + * void get_mac_addr(struct nvmem_device *nvmem, void *context) * { * u8 *mac_addr = ethernet_pdata->mac_addr; * off_t offset = context; -- cgit v1.2.3 From 50755bc1c305340660bbfa65fdae3ed113d8fe0e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 26 May 2016 15:16:06 -0700 Subject: seqlock: fix raw_read_seqcount_latch() lockless_dereference() is supposed to take pointer not integer. Link: http://lkml.kernel.org/r/20160521201448.GA7429@p183.telecom.by Signed-off-by: Alexey Dobriyan Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/seqlock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index e0582106ef4f..7973a821ac58 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -277,7 +277,7 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s) static inline int raw_read_seqcount_latch(seqcount_t *s) { - return lockless_dereference(s->sequence); + return lockless_dereference(s)->sequence; } /** @@ -331,7 +331,7 @@ static inline int raw_read_seqcount_latch(seqcount_t *s) * unsigned seq, idx; * * do { - * seq = lockless_dereference(latch->seq); + * seq = lockless_dereference(latch)->seq; * * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); -- cgit v1.2.3 From d96c84f8d27ce57ff08f12b9654d9f505a8cce6e Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 26 May 2016 15:16:14 -0700 Subject: mm: slub: remove unused virt_to_obj() It's unused since commit 7ed2f9e66385 ("mm, kasan: SLAB support") Link: http://lkml.kernel.org/r/1464020961-2242-1-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Cc: Joonsoo Kim Cc: David Rientjes Cc: Pekka Enberg Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 665cd0cd18b8..d1faa019c02a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -111,22 +111,6 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) } #endif - -/** - * virt_to_obj - returns address of the beginning of object. 
- * @s: object's kmem_cache - * @slab_page: address of slab page - * @x: address within object memory range - * - * Returns address of the beginning of object - */ -static inline void *virt_to_obj(struct kmem_cache *s, - const void *slab_page, - const void *x) -{ - return (void *)x - ((x - slab_page) % s->size); -} - void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason); -- cgit v1.2.3 From 7ef949d77f95f0d129f0d404b336459a34a00101 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 26 May 2016 15:16:22 -0700 Subject: mm: oom_reaper: remove some bloat mmput_async is currently used only from the oom_reaper which is defined only for CONFIG_MMU. We can save work_struct in mm_struct for !CONFIG_MMU. [akpm@linux-foundation.org: fix typo, per Minchan] Link: http://lkml.kernel.org/r/20160520061658.GB19172@dhcp22.suse.cz Reported-by: Minchan Kim Signed-off-by: Michal Hocko Acked-by: Minchan Kim Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 4 +++- kernel/fork.c | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d553855503e6..ca3e517980a0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -514,7 +514,9 @@ struct mm_struct { #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif +#ifdef CONFIG_MMU struct work_struct async_put_work; +#endif }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 23e075dcdfe4..6e42ada26345 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2745,10 +2745,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); -/* same as above but performs the slow path from the async kontext. Can +#ifdef CONFIG_MMU +/* same as above but performs the slow path from the async context. Can * be called from the atomic context as well */ extern void mmput_async(struct mm_struct *); +#endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index 47887bba944f..5c2c355aa97f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -736,6 +736,7 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +#ifdef CONFIG_MMU static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); @@ -749,6 +750,7 @@ void mmput_async(struct mm_struct *mm) schedule_work(&mm->async_put_work); } } +#endif /** * set_mm_exe_file - change a reference to the mm's executable file -- cgit v1.2.3 From 5930122683dff58f0846b0f0405b4bd598a3ba6a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 27 May 2016 10:19:30 -0400 Subject: switch xattr_handler->set() to passing dentry and inode separately preparation for similar switch in ->setxattr() (see the next commit for rationale). 
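To see what the new calling convention buys, here is a minimal sketch of a set() handler under the changed signature; 'foofs' and foofs_do_setxattr() are hypothetical, invented purely for illustration:

	static int foofs_xattr_set(const struct xattr_handler *handler,
				   struct dentry *unused, struct inode *inode,
				   const char *name, const void *value,
				   size_t size, int flags)
	{
		/* the inode arrives explicitly; no d_inode(dentry) derivation */
		return foofs_do_setxattr(inode, xattr_full_name(handler, name),
					 value, size, flags);
	}

As the hunks below show, most filesystems now mark the dentry argument 'unused', which is what enables the follow-up switch of ->setxattr() itself to an inode-based interface.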
Signed-off-by: Al Viro --- fs/9p/acl.c | 6 +++--- fs/9p/xattr.c | 5 +++-- fs/btrfs/xattr.c | 12 +++++------- fs/ceph/xattr.c | 7 ++++--- fs/cifs/xattr.c | 9 +++++---- fs/ext2/xattr_security.c | 7 ++++--- fs/ext2/xattr_trusted.c | 7 ++++--- fs/ext2/xattr_user.c | 9 +++++---- fs/ext4/xattr_security.c | 7 ++++--- fs/ext4/xattr_trusted.c | 7 ++++--- fs/ext4/xattr_user.c | 9 +++++---- fs/f2fs/xattr.c | 12 ++++++------ fs/gfs2/xattr.c | 6 +++--- fs/hfsplus/xattr.c | 12 ++++++------ fs/hfsplus/xattr.h | 2 +- fs/hfsplus/xattr_security.c | 7 ++++--- fs/hfsplus/xattr_trusted.c | 7 ++++--- fs/hfsplus/xattr_user.c | 7 ++++--- fs/jffs2/security.c | 7 ++++--- fs/jffs2/xattr_trusted.c | 7 ++++--- fs/jffs2/xattr_user.c | 7 ++++--- fs/jfs/xattr.c | 14 ++++++-------- fs/nfs/nfs4proc.c | 19 +++++++++---------- fs/ocfs2/xattr.c | 23 +++++++++++++---------- fs/orangefs/xattr.c | 10 ++++++---- fs/posix_acl.c | 6 +++--- fs/reiserfs/xattr_security.c | 9 +++++---- fs/reiserfs/xattr_trusted.c | 9 +++++---- fs/reiserfs/xattr_user.c | 9 +++++---- fs/ubifs/xattr.c | 7 +++---- fs/xattr.c | 6 ++++-- fs/xfs/xfs_xattr.c | 9 +++++---- include/linux/xattr.h | 4 ++-- mm/shmem.c | 7 ++++--- 34 files changed, 156 insertions(+), 135 deletions(-) (limited to 'include/linux') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index eb3589edf485..0576eaeb60b9 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -239,13 +239,13 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler, } static int v9fs_xattr_set_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int retval; struct posix_acl *acl; struct v9fs_session_info *v9ses; - struct inode *inode = d_inode(dentry); v9ses = v9fs_dentry2v9ses(dentry); /* diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 18c62bae9591..a6bd349bab23 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -147,8 +147,9 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler, } static int v9fs_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { const char *full_name = xattr_full_name(handler, name); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3bfb252206c7..d1a177a3dbe8 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -380,23 +380,21 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, - int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { name = xattr_full_name(handler, name); - return btrfs_set_prop(d_inode(dentry), name, value, size, flags); + return btrfs_set_prop(inode, name, value, size, flags); } static const struct xattr_handler btrfs_security_xattr_handler = { diff --git a/fs/ceph/xattr.c 
b/fs/ceph/xattr.c index 0d66722c6a52..2baa6939dfe6 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1051,12 +1051,13 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler, } static int ceph_set_xattr_handler(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - return __ceph_setxattr(d_inode(dentry), name, value, size, flags); + return __ceph_setxattr(inode, name, value, size, flags); } const struct xattr_handler ceph_other_xattr_handler = { diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index c8b77aa24a1d..5e23f64c0804 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -39,8 +39,9 @@ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; static int cifs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int rc = -EOPNOTSUPP; unsigned int xid; @@ -99,12 +100,12 @@ static int cifs_xattr_set(const struct xattr_handler *handler, if (value && pTcon->ses->server->ops->set_acl) rc = pTcon->ses->server->ops->set_acl(pacl, - size, d_inode(dentry), + size, inode, full_path, CIFS_ACL_DACL); else rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ - CIFS_I(d_inode(dentry))->time = 0; + CIFS_I(inode)->time = 0; kfree(pacl); } #endif /* CONFIG_CIFS_ACL */ diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 7fd3b867ce65..7b9e9c1842d5 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -18,10 +18,11 @@ ext2_xattr_security_get(const struct xattr_handler *handler, static int ext2_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 0f85705ff519..65049b71af13 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -25,10 +25,11 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, static int ext2_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 1fafd27037cc..fb2f992ae763 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -29,13 +29,14 @@ ext2_xattr_user_get(const struct xattr_handler *handler, static int ext2_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (!test_opt(dentry->d_sb, XATTR_USER)) 
+ if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 123a7d010efe..a8921112030d 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -22,10 +22,11 @@ ext4_xattr_security_get(const struct xattr_handler *handler, static int ext4_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 60652fa24cbc..c7765c735714 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -29,10 +29,11 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, static int ext4_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 17a446ffecd3..ca20e423034b 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -30,12 +30,13 @@ ext4_xattr_user_get(const struct xattr_handler *handler, static int ext4_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (!test_opt(dentry->d_sb, XATTR_USER)) + if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 00ea56797258..e3decae3acfb 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -50,10 +50,11 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, } static int f2fs_xattr_generic_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, const void *value, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); switch (handler->flags) { case F2FS_XATTR_INDEX_USER: @@ -69,7 +70,7 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler, default: return -EINVAL; } - return f2fs_setxattr(d_inode(dentry), handler->flags, name, + return f2fs_setxattr(inode, handler->flags, name, value, size, NULL, flags); } @@ -95,11 +96,10 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, } static int f2fs_xattr_advise_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, const void *value, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { - struct inode *inode = 
d_inode(dentry); - if (!inode_owner_or_capable(inode)) return -EPERM; if (value == NULL) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index f42ab53bd30d..3a2853504084 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1251,10 +1251,10 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, } static int gfs2_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 4f118d282a7a..d37bb88dc746 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -424,7 +424,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) return len; } -int hfsplus_setxattr(struct dentry *dentry, const char *name, +int hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, const char *prefix, size_t prefixlen) { @@ -437,8 +437,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, return -ENOMEM; strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); - res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size, - flags); + res = __hfsplus_setxattr(inode, xattr_name, value, size, flags); kfree(xattr_name); return res; } @@ -864,8 +863,9 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { /* * Don't allow setting properly prefixed attributes @@ -880,7 +880,7 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler, * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). 
*/ - return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags); + return __hfsplus_setxattr(inode, name, buffer, size, flags); } const struct xattr_handler hfsplus_xattr_osx_handler = { diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index d04ba6f58df2..68f6b539371f 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -21,7 +21,7 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[]; int __hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags); -int hfsplus_setxattr(struct dentry *dentry, const char *name, +int hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, const char *prefix, size_t prefixlen); diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index ae2ca8c2e335..37b3efa733ef 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -23,10 +23,11 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler, } static int hfsplus_security_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index eae2947060aa..94519d6c627d 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -21,10 +21,11 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, } static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index 3c9eec3e4c7b..fae6c0ea0030 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -21,10 +21,11 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler, } static int hfsplus_user_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 3ed9a4b49778..c2332e30f218 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -57,10 +57,11 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, } static int jffs2_security_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); } diff --git 
a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 4ebecff1d922..5d6030826c52 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -25,10 +25,11 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, } static int jffs2_trusted_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index bce249e1b277..9d027b4abcf9 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -25,10 +25,11 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, } static int jffs2_user_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags); } diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index beb182b503b3..0bf3c33aedff 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -943,11 +943,10 @@ static int jfs_xattr_get(const struct xattr_handler *handler, } static int jfs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __jfs_xattr_set(inode, name, value, size, flags); } @@ -962,11 +961,10 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler, } static int jfs_xattr_set_os2(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - if (is_known_namespace(name)) return -EOPNOTSUPP; return __jfs_xattr_set(inode, name, value, size, flags); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 084e8570da18..2e802ec47b8a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4993,12 +4993,11 @@ static int nfs4_do_set_security_label(struct inode *inode, } static int -nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) +nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) { struct nfs4_label ilabel, *olabel = NULL; struct nfs_fattr fattr; struct rpc_cred *cred; - struct inode *inode = d_inode(dentry); int status; if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) @@ -6255,11 +6254,11 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { - return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); + return 
nfs4_proc_set_acl(inode, buf, buflen); } static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, @@ -6277,12 +6276,12 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { if (security_ismaclabel(key)) - return nfs4_set_security_label(dentry, buf, buflen); + return nfs4_set_security_label(inode, buf, buflen); return -EOPNOTSUPP; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ad16995c9e7a..d2053853951e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7254,10 +7254,11 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, } static int ocfs2_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -7325,10 +7326,11 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, } static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } @@ -7354,15 +7356,16 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, } static int ocfs2_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 99c19545752c..5893ddde0e4b 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -448,13 +448,14 @@ out_unlock: } static int orangefs_xattr_set_default(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - return orangefs_inode_setxattr(dentry->d_inode, + return orangefs_inode_setxattr(inode, ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, name, buffer, @@ -478,13 +479,14 @@ static int orangefs_xattr_get_default(const struct xattr_handler *handler, } static int orangefs_xattr_set_trusted(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - return orangefs_inode_setxattr(dentry->d_inode, + return orangefs_inode_setxattr(inode, ORANGEFS_XATTR_NAME_TRUSTED_PREFIX, name, buffer, diff --git 
a/fs/posix_acl.c b/fs/posix_acl.c index 2c60f17e7d92..8a4a266beff3 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -822,10 +822,10 @@ posix_acl_xattr_get(const struct xattr_handler *handler, static int posix_acl_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_backing_inode(dentry); struct posix_acl *acl = NULL; int ret; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 86aeb9dd805a..e4cbb7719906 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -20,13 +20,14 @@ security_get(const struct xattr_handler *handler, struct dentry *unused, } static int -security_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +security_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (IS_PRIVATE(d_inode(dentry))) + if (IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 31837f031f59..f15a5f9e84ce 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -19,13 +19,14 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused, } static int -trusted_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +trusted_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index f7c39731684b..dc59df43b2db 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -17,12 +17,13 @@ user_get(const struct xattr_handler *handler, struct dentry *unused, } static int -user_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +user_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (!reiserfs_xattrs_user(dentry->d_sb)) + if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 6c277eb6aef9..b5fc27969e9d 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -579,11 +579,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler, } static int ubifs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - dbg_gen("xattr 
'%s', host ino %lu ('%pd'), size %zd", name, inode->i_ino, dentry, size); diff --git a/fs/xattr.c b/fs/xattr.c index fc81e771488a..b16d07889700 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -754,7 +754,8 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->set(handler, dentry, name, value, size, flags); + return handler->set(handler, dentry, d_inode(dentry), name, value, + size, flags); } /* @@ -769,7 +770,8 @@ generic_removexattr(struct dentry *dentry, const char *name) handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE); + return handler->set(handler, dentry, d_inode(dentry), name, NULL, + 0, XATTR_REPLACE); } EXPORT_SYMBOL(generic_getxattr); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index d111f691f313..2773b155cb56 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -74,11 +74,12 @@ xfs_forget_acl( } static int -xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) +xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *value, + size_t size, int flags) { int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(d_inode(dentry)); + struct xfs_inode *ip = XFS_I(inode); int error; /* Convert Linux syscall to XFS internal ATTR flags */ @@ -92,7 +93,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, error = xfs_attr_set(ip, (unsigned char *)name, (void *)value, size, xflags); if (!error) - xfs_forget_acl(d_inode(dentry), name, xflags); + xfs_forget_acl(inode, name, xflags); return error; } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 1cc4c578deb9..76beb206741a 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -33,8 +33,8 @@ struct xattr_handler { struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, struct dentry *dentry, - const char *name, const void *buffer, size_t size, - int flags); + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags); }; const char *xattr_full_name(const struct xattr_handler *, const char *); diff --git a/mm/shmem.c b/mm/shmem.c index e418a995427d..a36144909b28 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2645,10 +2645,11 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, } static int shmem_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + struct shmem_inode_info *info = SHMEM_I(inode); name = xattr_full_name(handler, name); return simple_xattr_set(&info->xattrs, name, value, size, flags); -- cgit v1.2.3 From 7ded384a12688c2a86b618da16bc87713404dfcc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 27 May 2016 15:23:32 -0700 Subject: mm: fix section mismatch warning The register_page_bootmem_info_node() function needs to be marked __init in order to avoid a new warning introduced by commit f65e91df25aa ("mm: use early_pfn_to_nid in register_page_bootmem_info_node"). 
Otherwise you'll get a warning about how a non-init function calls early_pfn_to_nid (which is __meminit) Cc: Yang Shi Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 20d8a5d4d133..5145620ba48a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -182,7 +182,7 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE -extern void register_page_bootmem_info_node(struct pglist_data *pgdat); +extern void __init register_page_bootmem_info_node(struct pglist_data *pgdat); #else static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b8ee0806415f..e3cbdcaff2a5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -263,7 +263,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -void register_page_bootmem_info_node(struct pglist_data *pgdat) +void __init register_page_bootmem_info_node(struct pglist_data *pgdat) { unsigned long i, pfn, end_pfn, nr_pages; int node = pgdat->node_id; -- cgit v1.2.3 From 5d22fc25d4fc8096d2d7df27ea1893d4e055e764 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 27 May 2016 15:57:31 -0700 Subject: mm: remove more IS_ERR_VALUE abuses The do_brk() and vm_brk() return value was "unsigned long" and returned the starting address on success, and an error value on failure. The reasons are entirely historical, and go back to it basically behaving like the mmap() interface does. However, nobody actually wanted that interface, and it causes totally pointless IS_ERR_VALUE() confusion. What every single caller actually wants is just the simpler integer return of zero for success and negative error number on failure. So just convert to that much clearer and more common calling convention, and get rid of all the IS_ERR_VALUE() uses wrt vm_brk(). 
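To see the calling-convention change in isolation, here is a minimal before/after sketch of a caller (an illustrative fragment only, not taken from the patch; "start" and "len" are hypothetical variables):

    /* Old convention: vm_brk() returned the start address on success,
     * so failure had to be detected with IS_ERR_VALUE() or by comparing
     * the result against the requested address. */
    unsigned long addr = vm_brk(start, len);
    if (IS_ERR_VALUE(addr))
            return addr;            /* errno smuggled inside an address */

    /* New convention: plain int, 0 on success, -errno on failure. */
    int error = vm_brk(start, len);
    if (error)
            return error;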
Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_aout.c | 17 ++++++++--------- fs/binfmt_aout.c | 18 +++++++----------- fs/binfmt_elf.c | 11 +++++------ include/linux/mm.h | 2 +- mm/mmap.c | 16 ++++++++-------- mm/nommu.c | 2 +- 6 files changed, 30 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index f5e737ff0022..cb26f18d43af 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -116,12 +116,12 @@ static struct linux_binfmt aout_format = { .min_coredump = PAGE_SIZE }; -static unsigned long set_brk(unsigned long start, unsigned long end) +static int set_brk(unsigned long start, unsigned long end) { start = PAGE_ALIGN(start); end = PAGE_ALIGN(end); if (end <= start) - return start; + return 0; return vm_brk(start, end - start); } @@ -321,7 +321,7 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error != (text_addr & PAGE_MASK)) + if (error) return error; error = read_code(bprm->file, text_addr, 32, @@ -350,7 +350,7 @@ static int load_aout_binary(struct linux_binprm *bprm) if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) { error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (IS_ERR_VALUE(error)) + if (error) return error; read_code(bprm->file, N_TXTADDR(ex), fd_offset, @@ -378,7 +378,7 @@ static int load_aout_binary(struct linux_binprm *bprm) beyond_if: error = set_brk(current->mm->start_brk, current->mm->brk); - if (IS_ERR_VALUE(error)) + if (error) return error; set_binfmt(&aout_format); @@ -441,7 +441,7 @@ static int load_aout_library(struct file *file) } #endif retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (IS_ERR_VALUE(retval)) + if (retval) goto out; read_code(file, start_addr, N_TXTOFF(ex), @@ -461,9 +461,8 @@ static int load_aout_library(struct file *file) len = PAGE_ALIGN(ex.a_text + ex.a_data); bss = ex.a_text + ex.a_data + ex.a_bss; if (bss > len) { - error = vm_brk(start_addr + len, bss - len); - retval = error; - if (error != start_addr + len) + retval = vm_brk(start_addr + len, bss - len); + if (retval) goto out; } retval = 0; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 2fab9f130e51..64b331ae3428 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -127,11 +127,7 @@ static int set_brk(unsigned long start, unsigned long end) { start = PAGE_ALIGN(start); end = PAGE_ALIGN(end); - if (end > start) { - unsigned long addr; - addr = vm_brk(start, end - start); - if (BAD_ADDR(addr)) - return addr; - } + if (end > start) + return vm_brk(start, end - start); return 0; } @@ -275,7 +272,7 @@ static int load_aout_binary(struct linux_binprm * bprm) map_size = ex.a_text+ex.a_data; #endif error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error != (text_addr & PAGE_MASK)) + if (error) return error; error = read_code(bprm->file, text_addr, pos, @@ -298,7 +295,7 @@ static int load_aout_binary(struct linux_binprm * bprm) if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (IS_ERR_VALUE(error)) + if (error) return error; read_code(bprm->file, N_TXTADDR(ex), fd_offset, @@ -382,7 +379,7 @@ static int load_aout_library(struct file *file) file); } retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (IS_ERR_VALUE(retval)) + if (retval) goto out; read_code(file, start_addr, N_TXTOFF(ex), @@ -402,9 +399,8 @@ static int load_aout_library(struct file *file) len = PAGE_ALIGN(ex.a_text + ex.a_data); bss =
ex.a_text + ex.a_data + ex.a_bss; if (bss > len) { - error = vm_brk(start_addr + len, bss - len); - retval = error; - if (error != start_addr + len) + retval = vm_brk(start_addr + len, bss - len); + if (retval) goto out; } retval = 0; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 938fc4ede764..e158b22ef32f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -96,10 +96,9 @@ static int set_brk(unsigned long start, unsigned long end) start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - unsigned long addr; - addr = vm_brk(start, end - start); - if (BAD_ADDR(addr)) - return addr; + int error = vm_brk(start, end - start); + if (error) + return error; } current->mm->start_brk = current->mm->brk = end; return 0; @@ -629,7 +628,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, /* Map the last of the bss segment */ error = vm_brk(elf_bss, last_bss - elf_bss); - if (BAD_ADDR(error)) + if (error) goto out; } @@ -1178,7 +1177,7 @@ static int load_elf_library(struct file *file) bss = eppnt->p_memsz + eppnt->p_vaddr; if (bss > len) { error = vm_brk(len, bss - len); - if (BAD_ADDR(error)) + if (error) goto out_free_ph; } error = 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index a00ec816233a..5df5feb49575 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2018,7 +2018,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* These take the mm semaphore themselves */ -extern unsigned long __must_check vm_brk(unsigned long, unsigned long); +extern int __must_check vm_brk(unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, diff --git a/mm/mmap.c b/mm/mmap.c index d3d9a94ca031..de2c1769cc68 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -168,7 +168,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static unsigned long do_brk(unsigned long addr, unsigned long len); +static int do_brk(unsigned long addr, unsigned long len); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -224,7 +224,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + if (do_brk(oldbrk, newbrk-oldbrk) < 0) goto out; set_brk: @@ -2625,7 +2625,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. 
*/ -static unsigned long do_brk(unsigned long addr, unsigned long len) +static int do_brk(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -2636,7 +2636,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) len = PAGE_ALIGN(len); if (!len) - return addr; + return 0; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; @@ -2703,13 +2703,13 @@ out: if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; - return addr; + return 0; } -unsigned long vm_brk(unsigned long addr, unsigned long len) +int vm_brk(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - unsigned long ret; + int ret; bool populate; if (down_write_killable(&mm->mmap_sem)) @@ -2718,7 +2718,7 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) ret = do_brk(addr, len); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); - if (populate) + if (populate && !ret) mm_populate(addr, len); return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index c8bd59a03c71..c2e58880207f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1682,7 +1682,7 @@ void exit_mmap(struct mm_struct *mm) } } -unsigned long vm_brk(unsigned long addr, unsigned long len) +int vm_brk(unsigned long addr, unsigned long len) { return -ENOMEM; } -- cgit v1.2.3 From aa00edc1287a693eadc7bc67a3d73555d969b35d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 27 May 2016 16:03:22 -0700 Subject: make IS_ERR_VALUE() complain about non-pointer-sized arguments Now that the allmodconfig x86-64 build is clean wrt IS_ERR_VALUE() uses on integers, add a cast to a pointer and back to the argument, so that any new mis-uses of IS_ERR_VALUE() will cause warnings like warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] so that we don't re-introduce any bogus uses. Signed-off-by: Linus Torvalds --- include/linux/err.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/err.h b/include/linux/err.h index 56762ab41713..1e3558845e4c 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -18,7 +18,7 @@ #ifndef __ASSEMBLY__ -#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) +#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO) static inline void * __must_check ERR_PTR(long error) { -- cgit v1.2.3 From 3767e255b390d72f9a33c08d9e86c5f21f25860f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 27 May 2016 11:06:05 -0400 Subject: switch ->setxattr() to passing dentry and inode separately smack ->d_instantiate() uses ->setxattr(), so to be able to call it before we'd hashed the new dentry and attached it to inode, we need ->setxattr() instances getting the inode as an explicit argument rather than obtaining it from dentry. Similar change for ->getxattr() had been done in commit ce23e64. Unlike ->getxattr() (which is used by both selinux and smack instances of ->d_instantiate()) ->setxattr() is used only by smack one and unfortunately it got missed back then. 
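Schematically, a converted xattr_handler ->set() instance now looks like this (a sketch only; "foo" is a made-up filesystem, but the signature is the one this patch introduces):

    static int foo_xattr_set(const struct xattr_handler *handler,
                             struct dentry *unused, struct inode *inode,
                             const char *name, const void *value,
                             size_t size, int flags)
    {
            /* Use the explicit inode argument; do not reach for
             * dentry->d_inode, since the dentry may not be attached
             * to the inode yet when ->d_instantiate() is the caller. */
            return foo_setxattr(inode, name, value, size, flags);
    }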
Reported-by: Seung-Woo Kim Tested-by: Casey Schaufler Signed-off-by: Al Viro --- Documentation/filesystems/porting | 7 +++++++ drivers/staging/lustre/lustre/llite/llite_internal.h | 4 ++-- drivers/staging/lustre/lustre/llite/xattr.c | 6 ++---- fs/bad_inode.c | 4 ++-- fs/ecryptfs/crypto.c | 9 +++++---- fs/ecryptfs/ecryptfs_kernel.h | 4 ++-- fs/ecryptfs/inode.c | 7 ++++--- fs/ecryptfs/mmap.c | 3 ++- fs/fuse/dir.c | 6 +++--- fs/hfs/attr.c | 6 +++--- fs/hfs/hfs_fs.h | 2 +- fs/kernfs/inode.c | 11 ++++++----- fs/kernfs/kernfs-internal.h | 3 ++- fs/libfs.c | 5 +++-- fs/overlayfs/inode.c | 5 +++-- fs/overlayfs/overlayfs.h | 5 +++-- fs/xattr.c | 8 ++++---- include/linux/fs.h | 3 ++- include/linux/xattr.h | 3 ++- security/smack/smack_lsm.c | 2 +- 20 files changed, 59 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 46f3bb7a02f5..a5fb89cac615 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -578,3 +578,10 @@ in your dentry operations instead. -- [mandatory] ->atomic_open() calls without O_CREAT may happen in parallel. +-- +[mandatory] + ->setxattr() and xattr_handler.set() get dentry and inode passed separately. + dentry might be yet to be attached to inode, so do _not_ use its ->d_inode + in the instances. Rationale: !@#!@# security_d_instantiate() needs to be + called before we attach dentry to inode and !@#!@##!@$!$#!@#$!@$!@$ smack + ->d_instantiate() uses not just ->getxattr() but ->setxattr() as well. diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h index ce1f949430f1..3f2f30b6542c 100644 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -976,8 +976,8 @@ static inline __u64 ll_file_maxbytes(struct inode *inode) } /* llite/xattr.c */ -int ll_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); +int ll_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags); ssize_t ll_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size); ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size); diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c index ed4de04381c3..608014b0dbcd 100644 --- a/drivers/staging/lustre/lustre/llite/xattr.c +++ b/drivers/staging/lustre/lustre/llite/xattr.c @@ -211,11 +211,9 @@ int ll_setxattr_common(struct inode *inode, const char *name, return 0; } -int ll_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int ll_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { - struct inode *inode = d_inode(dentry); - LASSERT(inode); LASSERT(name); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 72e35b721608..3ba385eaa26e 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -100,8 +100,8 @@ static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs) return -EIO; } -static int bad_inode_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int bad_inode_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { return -EIO; } diff --git a/fs/ecryptfs/crypto.c 
b/fs/ecryptfs/crypto.c index ebd40f46ed4c..0d8eb3455b34 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1141,12 +1141,13 @@ ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, static int ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode, char *page_virt, size_t size) { int rc; - rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt, - size, 0); + rc = ecryptfs_setxattr(ecryptfs_dentry, ecryptfs_inode, + ECRYPTFS_XATTR_NAME, page_virt, size, 0); return rc; } @@ -1215,8 +1216,8 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, goto out_free; } if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, - size); + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, ecryptfs_inode, + virt, size); else rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, virt_len); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 3ec495db7e82..4ba1547bb9ad 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -609,8 +609,8 @@ ssize_t ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode, const char *name, void *value, size_t size); int -ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); +ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); #ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_process_response(struct ecryptfs_daemon *daemon, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 318b04689d76..9d153b6a1d72 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1001,7 +1001,8 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, } int -ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, +ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { int rc = 0; @@ -1014,8 +1015,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, } rc = vfs_setxattr(lower_dentry, name, value, size, flags); - if (!rc && d_really_is_positive(dentry)) - fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); + if (!rc && inode) + fsstack_copy_attr_all(inode, d_inode(lower_dentry)); out: return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 148d11b514fb..9c3437c8a5b1 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -442,7 +442,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) if (size < 0) size = 8; put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); - rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, + rc = lower_inode->i_op->setxattr(lower_dentry, lower_inode, + ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); inode_unlock(lower_inode); if (rc) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b9419058108f..ccd4971cc6c1 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1719,10 +1719,10 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, return fuse_update_attributes(inode, stat, NULL, NULL); } -static int fuse_setxattr(struct dentry *entry, const char *name, - const void *value, size_t size, int flags) +static int fuse_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void 
*value, + size_t size, int flags) { - struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 064f92f17efc..d9a86919fdf6 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -13,10 +13,10 @@ #include "hfs_fs.h" #include "btree.h" -int hfs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int hfs_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); struct hfs_find_data fd; hfs_cat_rec rec; struct hfs_cat_file *file; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index fa3eed86837c..ee2f385811c8 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -212,7 +212,7 @@ extern void hfs_evict_inode(struct inode *); extern void hfs_delete_inode(struct inode *); /* attr.c */ -extern int hfs_setxattr(struct dentry *dentry, const char *name, +extern int hfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags); extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 1719649d7ad7..63b925d5ba1e 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -160,10 +160,11 @@ static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, return 0; } -int kernfs_iop_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int kernfs_iop_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_iattrs *attrs; void *secdata; int error; @@ -175,11 +176,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(d_inode(dentry), suffix, + error = security_inode_setsecurity(inode, suffix, value, size, flags); if (error) return error; - error = security_inode_getsecctx(d_inode(dentry), + error = security_inode_getsecctx(inode, &secdata, &secdata_len); if (error) return error; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 45c9192c276e..37159235ac10 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -81,7 +81,8 @@ int kernfs_iop_permission(struct inode *inode, int mask); int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value, +int kernfs_iop_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags); int kernfs_iop_removexattr(struct dentry *dentry, const char *name); ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode, diff --git a/fs/libfs.c b/fs/libfs.c index 8765ff1adc07..3db2721144c2 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1118,8 +1118,9 @@ static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr) return -EPERM; } -static int empty_dir_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int 
empty_dir_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { return -EOPNOTSUPP; } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index c7b31a03dc9c..0ed7c4012437 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -210,8 +210,9 @@ static bool ovl_is_private_xattr(const char *name) return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0; } -int ovl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int ovl_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int err; struct dentry *upperdentry; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 99ec4b035237..d79577eb3937 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -171,8 +171,9 @@ int ovl_check_d_type_supported(struct path *realpath); /* inode.c */ int ovl_setattr(struct dentry *dentry, struct iattr *attr); int ovl_permission(struct inode *inode, int mask); -int ovl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); +int ovl_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags); ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); diff --git a/fs/xattr.c b/fs/xattr.c index b16d07889700..4beafc43daa5 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -100,7 +100,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_op->setxattr) { - error = inode->i_op->setxattr(dentry, name, value, size, flags); + error = inode->i_op->setxattr(dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, @@ -745,7 +745,8 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) * Find the handler for the prefix and dispatch its set() operation. 
*/ int -generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) +generic_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) { const struct xattr_handler *handler; @@ -754,8 +755,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->set(handler, dentry, d_inode(dentry), name, value, - size, flags); + return handler->set(handler, dentry, inode, name, value, size, flags); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 5f61431d8673..62bdb0a6cf2d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1730,7 +1730,8 @@ struct inode_operations { struct inode *, struct dentry *, unsigned int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); - int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); + int (*setxattr) (struct dentry *, struct inode *, + const char *, const void *, size_t, int); ssize_t (*getxattr) (struct dentry *, struct inode *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 76beb206741a..94079bab9243 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -54,7 +54,8 @@ int vfs_removexattr(struct dentry *, const char *); ssize_t generic_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); -int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); +int generic_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags); int generic_removexattr(struct dentry *dentry, const char *name); ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index ff2b8c3cf7a9..6777295f4b2b 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3514,7 +3514,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) */ if (isp->smk_flags & SMK_INODE_CHANGED) { isp->smk_flags &= ~SMK_INODE_CHANGED; - rc = inode->i_op->setxattr(dp, + rc = inode->i_op->setxattr(dp, inode, XATTR_NAME_SMACKTRANSMUTE, TRANS_TRUE, TRANS_TRUE_SIZE, 0); -- cgit v1.2.3 From f4bcbe792b8f434e32487cff9d9e30ab45a3ce02 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Fri, 20 May 2016 07:26:00 -0400 Subject: Pull out string hash to <linux/stringhash.h> so they can be used without the rest of <linux/dcache.h>. The hashlen_* macros will make sense next patch.
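As a standalone illustration of the hashlen_* packing being moved (userspace sketch; the hash value here is made up):

    #include <assert.h>
    #include <stdint.h>

    /* Same packing as the kernel macros: length in the high 32 bits,
     * hash in the low 32 bits of a single u64. */
    #define hashlen_hash(hashlen)     ((uint32_t)(hashlen))
    #define hashlen_len(hashlen)      ((uint32_t)((hashlen) >> 32))
    #define hashlen_create(hash, len) ((uint64_t)(len) << 32 | (uint32_t)(hash))

    int main(void)
    {
            uint64_t hl = hashlen_create(0xdeadbeef, 5);
            assert(hashlen_hash(hl) == 0xdeadbeef);
            assert(hashlen_len(hl) == 5);
            return 0;
    }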
Signed-off-by: George Spelvin --- include/linux/dcache.h | 27 +---------------- include/linux/stringhash.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 26 deletions(-) create mode 100644 include/linux/stringhash.h (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 7e9422cb5989..0f9a977c334f 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -10,6 +10,7 @@ #include #include #include +#include <linux/stringhash.h> struct path; struct vfsmount; @@ -52,9 +53,6 @@ struct qstr { }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } -#define hashlen_hash(hashlen) ((u32) (hashlen)) -#define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) -#define hashlen_create(hash,len) (((u64)(len)<<32)|(u32)(hash)) struct dentry_stat_t { long nr_dentry; @@ -65,29 +63,6 @@ struct dentry_stat_t { }; extern struct dentry_stat_t dentry_stat; -/* Name hashing routines. Initial hash value */ -/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ -#define init_name_hash() 0 - -/* partial hash update function. Assume roughly 4 bits per character */ -static inline unsigned long -partial_name_hash(unsigned long c, unsigned long prevhash) -{ - return (prevhash + (c << 4) + (c >> 4)) * 11; -} - -/* - * Finally: cut down the number of bits to a int value (and try to avoid - * losing bits) - */ -static inline unsigned long end_name_hash(unsigned long hash) -{ - return (unsigned int) hash; -} - -/* Compute the hash for a name string. */ -extern unsigned int full_name_hash(const unsigned char *, unsigned int); - /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the diff --git a/include/linux/stringhash.h b/include/linux/stringhash.h new file mode 100644 index 000000000000..2eaaaf6d2776 --- /dev/null +++ b/include/linux/stringhash.h @@ -0,0 +1,72 @@ +#ifndef __LINUX_STRINGHASH_H +#define __LINUX_STRINGHASH_H + +#include <linux/types.h> + +/* + * Routines for hashing strings of bytes to a 32-bit hash value. + * + * These hash functions are NOT GUARANTEED STABLE between kernel + * versions, architectures, or even repeated boots of the same kernel. + * (E.g. they may depend on boot-time hardware detection or be + * deliberately randomized.) + * + * They are also not intended to be secure against collisions caused by + * malicious inputs; much slower hash functions are required for that. + * + * They are optimized for pathname components, meaning short strings. + * Even if a majority of files have longer names, the dynamic profile of + * pathname components skews short due to short directory names. + * (E.g. /usr/lib/libsesquipedalianism.so.3.141.) + */ + +/* + * Version 1: one byte at a time. Example of use: + * + * unsigned long hash = init_name_hash; + * while (*p) + * hash = partial_name_hash(tolower(*p++), hash); + * hash = end_name_hash(hash); + * + * Although this is designed for bytes, fs/hfsplus/unicode.c + * abuses it to hash 16-bit values. + */ + +/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ +#define init_name_hash() 0 + +/* partial hash update function.
Assume roughly 4 bits per character */ +static inline unsigned long +partial_name_hash(unsigned long c, unsigned long prevhash) +{ + return (prevhash + (c << 4) + (c >> 4)) * 11; +} + +/* + * Finally: cut down the number of bits to a int value (and try to avoid + * losing bits) + */ +static inline unsigned long end_name_hash(unsigned long hash) +{ + return (unsigned int)hash; +} + +/* + * Version 2: One word (32 or 64 bits) at a time. + * If CONFIG_DCACHE_WORD_ACCESS is defined (meaning <asm/word-at-a-time.h> + * exists, which describes major Linux platforms like x86 and ARM), then + * this computes a different hash function much faster. + * + * If not set, this falls back to a wrapper around the preceding. + */ +extern unsigned int full_name_hash(const unsigned char *, unsigned int); + +/* + * A hash_len is a u64 with the hash of a string in the low + * half and the length in the high half. + */ +#define hashlen_hash(hashlen) ((u32)(hashlen)) +#define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) +#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) + +#endif /* __LINUX_STRINGHASH_H */ -- cgit v1.2.3 From fcfd2fbf22d2587196890103d41e3d554c47da0e Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Fri, 20 May 2016 08:41:37 -0400 Subject: fs/namei.c: Add hashlen_string() function We'd like to make more use of the highly-optimized dcache hash functions throughout the kernel, rather than have every subsystem create its own, and a function that hashes basic null-terminated strings is required for that. (The name is to emphasize that it returns both hash and length.) It's actually useful in the dcache itself, specifically d_alloc_name(). Other uses in the next patch. full_name_hash() is also tweaked to make it more generally useful: 1) Take a "char *" rather than "unsigned char *" argument, to be consistent with hash_name(). 2) Handle zero-length inputs. If we want more callers, we don't want to make them worry about corner cases.
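The contract of the new helper, sketched as a two-pass userspace equivalent (the kernel version is single-pass; partial_name_hash() is copied from <linux/stringhash.h>, everything else here is illustrative):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    static unsigned long partial_name_hash(unsigned long c, unsigned long prev)
    {
            return (prev + (c << 4) + (c >> 4)) * 11;
    }

    /* Byte-at-a-time fallback; a zero length yields hash 0. */
    static uint32_t full_name_hash(const char *name, unsigned int len)
    {
            unsigned long hash = 0;                 /* init_name_hash() */
            while (len--)
                    hash = partial_name_hash((unsigned char)*name++, hash);
            return (uint32_t)hash;                  /* end_name_hash() */
    }

    /* Hash and length of a null-terminated string, packed into one u64. */
    static uint64_t hashlen_string(const char *name)
    {
            uint32_t len = strlen(name);
            return (uint64_t)len << 32 | full_name_hash(name, len);
    }

    int main(void)
    {
            uint64_t hl = hashlen_string("libc.so");
            assert((uint32_t)hl == full_name_hash("libc.so", 7));   /* hash half */
            assert((uint32_t)(hl >> 32) == 7);                      /* length half */
            return 0;
    }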
Signed-off-by: George Spelvin --- fs/dcache.c | 3 +-- fs/namei.c | 51 +++++++++++++++++++++++++++++++++++++++++----- include/linux/stringhash.h | 8 ++++++-- 3 files changed, 53 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index d5ecc6e477da..19b751806789 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1653,8 +1653,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) struct qstr q; q.name = name; - q.len = strlen(name); - q.hash = full_name_hash(q.name, q.len); + q.hash_len = hashlen_string(name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); diff --git a/fs/namei.c b/fs/namei.c index 42f8ca038254..dd98d43a54f8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1822,19 +1822,20 @@ static inline unsigned long mix_hash(unsigned long hash) #endif -unsigned int full_name_hash(const unsigned char *name, unsigned int len) +/* Return the hash of a string of known length */ +unsigned int full_name_hash(const char *name, unsigned int len) { unsigned long a, hash = 0; for (;;) { + if (!len) + goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; hash = mix_hash(hash + a); name += sizeof(unsigned long); len -= sizeof(unsigned long); - if (!len) - goto done; } hash += a & bytemask_from_count(len); done: @@ -1842,6 +1843,29 @@ done: } EXPORT_SYMBOL(full_name_hash); +/* Return the "hash_len" (hash and length) of a null-terminated string */ +u64 hashlen_string(const char *name) +{ + unsigned long a, adata, mask, hash, len; + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; + + hash = a = 0; + len = -sizeof(unsigned long); + do { + hash = mix_hash(hash + a); + len += sizeof(unsigned long); + a = load_unaligned_zeropad(name+len); + } while (!has_zero(a, &adata, &constants)); + + adata = prep_zero_mask(a, adata, &constants); + mask = create_zero_mask(adata); + hash += a & zero_bytemask(mask); + len += find_zero(mask); + + return hashlen_create(fold_hash(hash), len); +} +EXPORT_SYMBOL(hashlen_string); + /* * Calculate the length and hash of the path component, and * return the "hash_len" as the result. @@ -1872,15 +1896,32 @@ static inline u64 hash_name(const char *name) #else -unsigned int full_name_hash(const unsigned char *name, unsigned int len) +/* Return the hash of a string of known length */ +unsigned int full_name_hash(const char *name, unsigned int len) { unsigned long hash = init_name_hash(); while (len--) - hash = partial_name_hash(*name++, hash); + hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); +/* Return the "hash_len" (hash and length) of a null-terminated string */ +u64 hashlen_string(const char *name) +{ + unsigned long hash = init_name_hash(); + unsigned long len = 0, c; + + c = (unsigned char)*name; + do { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } while (c); + return hashlen_create(end_name_hash(hash), len); +} +EXPORT_SYMBOL(hashlen_string); + /* * We know there's a real path component here of at least * one character. diff --git a/include/linux/stringhash.h b/include/linux/stringhash.h index 2eaaaf6d2776..451771d9b9c0 100644 --- a/include/linux/stringhash.h +++ b/include/linux/stringhash.h @@ -1,7 +1,8 @@ #ifndef __LINUX_STRINGHASH_H #define __LINUX_STRINGHASH_H -#include <linux/types.h> +#include <linux/compiler.h> /* For __pure */ +#include <linux/types.h> /* For u32, u64 */ /* * Routines for hashing strings of bytes to a 32-bit hash value.
@@ -59,7 +60,7 @@ static inline unsigned long end_name_hash(unsigned long hash) * * If not set, this falls back to a wrapper around the preceding. */ -extern unsigned int full_name_hash(const unsigned char *, unsigned int); +extern unsigned int __pure full_name_hash(const char *, unsigned int); /* * A hash_len is a u64 with the hash of a string in the low @@ -69,4 +70,7 @@ extern unsigned int full_name_hash(const unsigned char *, unsigned int); #define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) #define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) +/* Return the "hash_len" (hash and length) of a null-terminated string */ +extern u64 __pure hashlen_string(const char *name); + #endif /* __LINUX_STRINGHASH_H */ -- cgit v1.2.3 From 917ea166f4672ec085f2cccc135c7c0eec72282c Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Fri, 20 May 2016 13:31:33 -0400 Subject: <linux/sunrpc/svcauth.h>: Define hash_str() in terms of hashlen_string() Finally, the first use of the previous two patches: eliminate the separate ad-hoc string hash functions in the sunrpc code. Now hash_str() is a wrapper around hashlen_string(), and hash_mem() is likewise a wrapper around full_name_hash(). Note that the sunrpc code *does* call hash_mem() with a zero length, which is why the previous patch needed to handle that in full_name_hash(). (Thanks, Bruce, for finding that!) This also eliminates the only caller of hash_long which asks for more than 32 bits of output. The comment about the quality of hashlen_string() and full_name_hash() is jumping the gun by a few patches; they aren't very impressive now, but will be improved greatly later in the series. Signed-off-by: George Spelvin Tested-by: J. Bruce Fields Acked-by: J. Bruce Fields Cc: Jeff Layton Cc: linux-nfs@vger.kernel.org --- include/linux/sunrpc/svcauth.h | 40 +++++++++------------------------------- 1 file changed, 9 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index c00f53a4ccdd..91d5a5d6f52b 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -16,6 +16,7 @@ #include #include #include +#include <linux/stringhash.h> #include struct svc_cred { @@ -165,41 +166,18 @@ extern int svcauth_unix_set_client(struct svc_rqst *rqstp); extern int unix_gid_cache_create(struct net *net); extern void unix_gid_cache_destroy(struct net *net); -static inline unsigned long hash_str(char *name, int bits) +/* + * The <linux/stringhash.h> functions are good enough that we don't need to + * use hash_32() on them; just extracting the high bits is enough.
+ */ +static inline unsigned long hash_str(char const *name, int bits) { - unsigned long hash = 0; - unsigned long l = 0; - int len = 0; - unsigned char c; - do { - if (unlikely(!(c = *name++))) { - c = (char)len; len = -1; - } - l = (l << 8) | c; - len++; - if ((len & (BITS_PER_LONG/8-1))==0) - hash = hash_long(hash^l, BITS_PER_LONG); - } while (len); - return hash >> (BITS_PER_LONG - bits); + return hashlen_hash(hashlen_string(name)) >> (32 - bits); } -static inline unsigned long hash_mem(char *buf, int length, int bits) +static inline unsigned long hash_mem(char const *buf, int length, int bits) { - unsigned long hash = 0; - unsigned long l = 0; - int len = 0; - unsigned char c; - do { - if (len == length) { - c = (char)len; len = -1; - } else - c = *buf++; - l = (l << 8) | c; - len++; - if ((len & (BITS_PER_LONG/8-1))==0) - hash = hash_long(hash^l, BITS_PER_LONG); - } while (len); - return hash >> (BITS_PER_LONG - bits); + return full_name_hash(buf, length) >> (32 - bits); } #endif /* __KERNEL__ */ -- cgit v1.2.3 From 92d567740f2ab5937b2c23bee94ea4b284bb1f98 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 26 May 2016 22:22:01 -0400 Subject: Change hash_64() return value to 32 bits That's all that's ever asked for, and it makes the return type of hash_long() consistent. It also allows (upcoming patch) an optimized implementation of hash_64 on 32-bit machines. I tried adding a BUILD_BUG_ON to ensure the number of bits requested was never more than 32 (most callers use a compile-time constant), but adding <linux/bug.h> to <linux/hash.h> breaks the tools/perf compiler unless tools/perf/MANIFEST is updated, and understanding that code base well enough to update it is too much trouble. I did the rest of an allyesconfig build with such a check, and nothing tripped. Signed-off-by: George Spelvin --- include/linux/hash.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hash.h b/include/linux/hash.h index 79c52fa81cac..f967dedb10e2 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -48,7 +48,7 @@ #define GOLDEN_RATIO_32 0x61C88647 #define GOLDEN_RATIO_64 0x61C8864680B583EBull -static __always_inline u64 hash_64(u64 val, unsigned int bits) +static __always_inline u32 hash_64(u64 val, unsigned int bits) { u64 hash = val; @@ -72,7 +72,7 @@ static __always_inline u64 hash_64(u64 val, unsigned int bits) #endif /* High bits are more random, so use them. */ - return hash >> (64 - bits); + return (u32)(hash >> (64 - bits)); } static inline u32 hash_32(u32 val, unsigned int bits) @@ -84,7 +84,7 @@ static inline u32 hash_32(u32 val, unsigned int bits) return hash >> (32 - bits); } -static inline unsigned long hash_ptr(const void *ptr, unsigned int bits) +static inline u32 hash_ptr(const void *ptr, unsigned int bits) { return hash_long((unsigned long)ptr, bits); } -- cgit v1.2.3 From ef703f49a6c5b909a85149bb6625c4ed0d697186 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 26 May 2016 23:00:23 -0400 Subject: Eliminate bad hash multipliers from hash_32() and hash_64() The "simplified" prime multipliers made very bad hash functions, so get rid of them. This completes the work of 689de1d6ca. To avoid the inefficiency which was the motivation for the "simplified" multipliers, hash_64() on 32-bit systems is changed to use a different algorithm. It makes two calls to hash_32() instead. drivers/media/usb/dvb-usb-v2/af9015.c uses the old GOLDEN_RATIO_PRIME_32 for some horrible reason, so it inherits a copy of the old definition.
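The scheme these patches converge on is easy to demonstrate outside the kernel. Here is a minimal userspace C sketch: the two constants are exactly the ones defined in the patch below, but the demo_* helpers and the main() harness are illustrative only, not kernel code.

#include <stdint.h>
#include <stdio.h>

#define GOLDEN_RATIO_32 0x61C88647u
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

/* Multiply by a large odd constant and keep the high bits of the
 * product; multiplication only propagates changes upward, so the
 * high bits are the well-mixed ones. */
static uint32_t demo_hash_32(uint32_t val, unsigned int bits)
{
	return (uint32_t)((val * GOLDEN_RATIO_32) >> (32 - bits));
}

/* 64-bit input on a 64-bit machine: one 64x64-bit multiply. */
static uint32_t demo_hash_64(uint64_t val, unsigned int bits)
{
	return (uint32_t)((val * GOLDEN_RATIO_64) >> (64 - bits));
}

/* 64-bit input using only 32x32-bit multiplies, mirroring the 32-bit
 * branch of hash_64() in the patch: fold the high half into the low
 * half with one multiply, then hash the folded value. */
static uint32_t demo_hash_64_on_32bit(uint64_t val, unsigned int bits)
{
	uint32_t folded = (uint32_t)val ^ ((uint32_t)(val >> 32) * GOLDEN_RATIO_32);

	return demo_hash_32(folded, bits);
}

int main(void)
{
	uint64_t v = 0x123456789abcdef0ull;

	printf("hash_32(low half, 16 bits)    = %#x\n", demo_hash_32((uint32_t)v, 16));
	printf("hash_64(64-bit path, 16 bits) = %#x\n", demo_hash_64(v, 16));
	printf("hash_64(32-bit path, 16 bits) = %#x\n", demo_hash_64_on_32bit(v, 16));
	return 0;
}

Note that the two hash_64() paths deliberately produce different values; nothing requires them to agree, and the self-test added later in this series only compares arch and generic variants when a HAVE_* symbol says they should match.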
Signed-off-by: George Spelvin Cc: Antti Palosaari Cc: Mauro Carvalho Chehab --- drivers/media/usb/dvb-usb-v2/af9015.c | 2 + include/linux/hash.h | 87 ++++++++++++++--------------------- 2 files changed, 36 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/usb/dvb-usb-v2/af9015.c b/drivers/media/usb/dvb-usb-v2/af9015.c index 95a7388e89d4..09e0f58f6bb7 100644 --- a/drivers/media/usb/dvb-usb-v2/af9015.c +++ b/drivers/media/usb/dvb-usb-v2/af9015.c @@ -398,6 +398,8 @@ error: } #define AF9015_EEPROM_SIZE 256 +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME_32 0x9e370001UL /* hash (and dump) eeprom */ static int af9015_eeprom_hash(struct dvb_usb_device *d) diff --git a/include/linux/hash.h b/include/linux/hash.h index f967dedb10e2..613cfde3a1e0 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -3,85 +3,65 @@ /* Fast hashing routine for ints, longs and pointers. (C) 2002 Nadia Yvette Chambers, IBM */ -/* - * Knuth recommends primes in approximately golden ratio to the maximum - * integer representable by a machine word for multiplicative hashing. - * Chuck Lever verified the effectiveness of this technique: - * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf - * - * These primes are chosen to be bit-sparse, that is operations on - * them can use shifts and additions instead of multiplications for - * machines where multiplications are slow. - */ - #include <asm/types.h> #include <linux/compiler.h> -/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ -#define GOLDEN_RATIO_PRIME_32 0x9e370001UL -/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ -#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL - +/* + * The "GOLDEN_RATIO_PRIME" is used in fs/btrfs/btrfs_inode.h and + * fs/inode.c. It's not actually prime any more (the previous primes + * were actively bad for hashing), but the name remains. + */ #if BITS_PER_LONG == 32 -#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32 +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32 #define hash_long(val, bits) hash_32(val, bits) #elif BITS_PER_LONG == 64 #define hash_long(val, bits) hash_64(val, bits) -#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64 +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64 #else #error Wordsize not 32 or 64 #endif /* - * The above primes are actively bad for hashing, since they are - * too sparse. The 32-bit one is mostly ok, the 64-bit one causes - * real problems. Besides, the "prime" part is pointless for the - * multiplicative hash. + * This hash multiplies the input by a large odd number and takes the + * high bits. Since multiplication propagates changes to the most + * significant end only, it is essential that the high bits of the + * product be used for the hash value. + * + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf * * Although a random odd number will do, it turns out that the golden * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice - * properties. + * properties. (See Knuth vol 3, section 6.4, exercise 9.) * - * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2. - * (See Knuth vol 3, section 6.4, exercise 9.) + * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2, + * which is very slightly easier to multiply by and makes no + * difference to the hash distribution.
*/ #define GOLDEN_RATIO_32 0x61C88647 #define GOLDEN_RATIO_64 0x61C8864680B583EBull -static __always_inline u32 hash_64(u64 val, unsigned int bits) -{ - u64 hash = val; - -#if BITS_PER_LONG == 64 - hash = hash * GOLDEN_RATIO_64; -#else - /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ - u64 n = hash; - n <<= 18; - hash -= n; - n <<= 33; - hash -= n; - n <<= 3; - hash += n; - n <<= 3; - hash -= n; - n <<= 4; - hash += n; - n <<= 2; - hash += n; -#endif - /* High bits are more random, so use them. */ - return (u32)(hash >> (64 - bits)); +static inline u32 __hash_32(u32 val) +{ + return val * GOLDEN_RATIO_32; } static inline u32 hash_32(u32 val, unsigned int bits) { - /* On some cpus multiply is faster, on others gcc will do shifts */ - u32 hash = val * GOLDEN_RATIO_PRIME_32; - /* High bits are more random, so use them. */ - return hash >> (32 - bits); + return __hash_32(val) >> (32 - bits); +} + +static __always_inline u32 hash_64(u64 val, unsigned int bits) +{ +#if BITS_PER_LONG == 64 + /* 64x64-bit multiply is efficient on all 64-bit processors */ + return val * GOLDEN_RATIO_64 >> (64 - bits); +#else + /* Hash 64 bits using only 32x32-bit multiply. */ + return hash_32((u32)val ^ __hash_32(val >> 32), bits); +#endif } static inline u32 hash_ptr(const void *ptr, unsigned int bits) @@ -89,6 +69,7 @@ static inline u32 hash_ptr(const void *ptr, unsigned int bits) return hash_long((unsigned long)ptr, bits); } +/* This really should be called fold32_ptr; it does no hashing to speak of. */ static inline u32 hash32_ptr(const void *ptr) { unsigned long val = (unsigned long)ptr; -- cgit v1.2.3 From 468a9428521e7d00fb21250af363eb94dc1d6861 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 26 May 2016 22:11:51 -0400 Subject: <linux/hash.h>: Add support for architecture-specific functions This is just the infrastructure; there are no users yet. This is modelled on CONFIG_ARCH_RANDOM; a CONFIG_ symbol declares the existence of <asm/hash.h>. That file may define its own versions of various functions, and define HAVE_* symbols (no CONFIG_ prefix!) to suppress the generic ones. Included is a self-test (in lib/test_hash.c) that verifies the basics. It is NOT in general required that the arch-specific functions compute the same thing as the generic, but if a HAVE_* symbol is defined with the value 1, then equality is tested. Signed-off-by: George Spelvin Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Andreas Schwab Cc: Philippe De Muyter Cc: linux-m68k@lists.linux-m68k.org Cc: Alistair Francis Cc: Michal Simek Cc: Yoshinori Sato Cc: uclinux-h8-devel@lists.sourceforge.jp --- arch/Kconfig | 8 ++ fs/namei.c | 6 +- include/linux/hash.h | 27 +++++- lib/Kconfig.debug | 11 +++ lib/Makefile | 1 + lib/test_hash.c | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 299 insertions(+), 4 deletions(-) create mode 100644 lib/test_hash.c (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 81869a5e7e17..96406e4db995 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -589,6 +589,14 @@ config HAVE_STACK_VALIDATION Architecture supports the 'objtool check' host tool command, which performs compile-time stack metadata validation. +config HAVE_ARCH_HASH + bool + default n + help + If this is set, the architecture provides an <asm/hash.h> + file which provides platform-specific implementations of some + functions in <linux/hash.h> or fs/namei.c.
+ # # ABI hall of shame # diff --git a/fs/namei.c b/fs/namei.c index a49cbd7efcaa..968dae025230 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1788,7 +1788,11 @@ static int walk_component(struct nameidata *nd, int flags) #include <asm/word-at-a-time.h> -#ifdef CONFIG_64BIT +#ifdef HASH_MIX + +/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ + +#elif defined(CONFIG_64BIT) /* * Register pressure in the mixing function is an issue, particularly * on 32-bit x86, but almost any function requires one state value and diff --git a/include/linux/hash.h b/include/linux/hash.h index 613cfde3a1e0..ad6fa21d977b 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -41,19 +41,40 @@ #define GOLDEN_RATIO_32 0x61C88647 #define GOLDEN_RATIO_64 0x61C8864680B583EBull +#ifdef CONFIG_HAVE_ARCH_HASH +/* This header may use the GOLDEN_RATIO_xx constants */ +#include <asm/hash.h> +#endif -static inline u32 __hash_32(u32 val) +/* + * The _generic versions exist only so lib/test_hash.c can compare + * the arch-optimized versions with the generic. + * + * Note that if you change these, any <asm/hash.h> that aren't updated + * to match need to have their HAVE_ARCH_* define values updated so the + * self-test will not false-positive. + */ +#ifndef HAVE_ARCH__HASH_32 +#define __hash_32 __hash_32_generic +#endif +static inline u32 __hash_32_generic(u32 val) { return val * GOLDEN_RATIO_32; } -static inline u32 hash_32(u32 val, unsigned int bits) +#ifndef HAVE_ARCH_HASH_32 +#define hash_32 hash_32_generic +#endif +static inline u32 hash_32_generic(u32 val, unsigned int bits) { /* High bits are more random, so use them. */ return __hash_32(val) >> (32 - bits); } -static __always_inline u32 hash_64(u64 val, unsigned int bits) +#ifndef HAVE_ARCH_HASH_64 +#define hash_64 hash_64_generic +#endif +static __always_inline u32 hash_64_generic(u64 val, unsigned int bits) { #if BITS_PER_LONG == 64 /* 64x64-bit multiply is efficient on all 64-bit processors */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1e9a607534ca..18ec69ba8eb6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1815,6 +1815,17 @@ config TEST_RHASHTABLE If unsure, say N. +config TEST_HASH + tristate "Perform selftest on hash functions" + default n + help + Enable this option to test the kernel's integer (<linux/hash.h>) + and string (<linux/stringhash.h>) hash functions on boot + (or module load). + + This is intended to help people writing architecture-specific + optimized versions. If unsure, say N. + endmenu # runtime tests config PROVIDE_OHCI1394_DMA_INIT diff --git a/lib/Makefile b/lib/Makefile index 7bd6fd436c97..f80b1a1b3afd 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_BPF) += test_bpf.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o +obj-$(CONFIG_TEST_HASH) += test_hash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LKM) += test_module.o diff --git a/lib/test_hash.c b/lib/test_hash.c new file mode 100644 index 000000000000..c9549c8b4909 --- /dev/null +++ b/lib/test_hash.c @@ -0,0 +1,250 @@ +/* + * Test cases for <linux/hash.h> and <linux/stringhash.h> + * This just verifies that various ways of computing a hash + * produce the same thing and, for cases where a k-bit hash + * value is requested, is of the requested size. + * + * We fill a buffer with a 255-byte null-terminated string, + * and use both full_name_hash() and hashlen_string() to hash the + * substrings from i to j, where 0 <= i < j < 256.
+ * + * The returned values are used to check that __hash_32() and + * __hash_32_generic() compute the same thing. Likewise hash_32() + * and hash_64(). + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt "\n" + +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/hash.h> +#include <linux/stringhash.h> +#include <linux/printk.h> + +/* 32-bit XORSHIFT generator. Seed must not be zero. */ +static u32 __init __attribute_const__ +xorshift(u32 seed) +{ + seed ^= seed << 13; + seed ^= seed >> 17; + seed ^= seed << 5; + return seed; +} + +/* Given a non-zero x, returns a non-zero byte. */ +static u8 __init __attribute_const__ +mod255(u32 x) +{ + x = (x & 0xffff) + (x >> 16); /* 1 <= x <= 0x1fffe */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0x2fd */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0x100 */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0xff */ + return x; +} + +/* Fill the buffer with non-zero bytes. */ +static void __init +fill_buf(char *buf, size_t len, u32 seed) +{ + size_t i; + + for (i = 0; i < len; i++) { + seed = xorshift(seed); + buf[i] = mod255(seed); + } +} + +/* + * Test the various integer hash functions. h64 (or its low-order bits) + * is the integer to hash. hash_or accumulates the OR of the hash values, + * which are later checked to see that they cover all the requested bits. + * + * Because these functions (as opposed to the string hashes) are all + * inline, the code being tested is actually in the module, and you can + * recompile and re-test the module without rebooting. + */ +static bool __init +test_int_hash(unsigned long long h64, u32 hash_or[2][33]) +{ + int k; + u32 h0 = (u32)h64, h1, h2; + + /* Test __hash_32 */ + hash_or[0][0] |= h1 = __hash_32(h0); +#ifdef HAVE_ARCH__HASH_32 + hash_or[1][0] |= h2 = __hash_32_generic(h0); +#if HAVE_ARCH__HASH_32 == 1 + if (h1 != h2) { + pr_err("__hash_32(%#x) = %#x != __hash_32_generic() = %#x", + h0, h1, h2); + return false; + } +#endif +#endif + + /* Test k = 1..32 bits */ + for (k = 1; k <= 32; k++) { + u32 const m = ((u32)2 << (k-1)) - 1; /* Low k bits set */ + + /* Test hash_32 */ + hash_or[0][k] |= h1 = hash_32(h0, k); + if (h1 > m) { + pr_err("hash_32(%#x, %d) = %#x > %#x", h0, k, h1, m); + return false; + } +#ifdef HAVE_ARCH_HASH_32 + h2 = hash_32_generic(h0, k); +#if HAVE_ARCH_HASH_32 == 1 + if (h1 != h2) { + pr_err("hash_32(%#x, %d) = %#x != hash_32_generic() " + " = %#x", h0, k, h1, h2); + return false; + } +#else + if (h2 > m) { + pr_err("hash_32_generic(%#x, %d) = %#x > %#x", + h0, k, h1, m); + return false; + } +#endif +#endif + /* Test hash_64 */ + hash_or[1][k] |= h1 = hash_64(h64, k); + if (h1 > m) { + pr_err("hash_64(%#llx, %d) = %#x > %#x", h64, k, h1, m); + return false; + } +#ifdef HAVE_ARCH_HASH_64 + h2 = hash_64_generic(h64, k); +#if HAVE_ARCH_HASH_64 == 1 + if (h1 != h2) { + pr_err("hash_64(%#llx, %d) = %#x != hash_64_generic() " + "= %#x", h64, k, h1, h2); + return false; + } +#else + if (h2 > m) { + pr_err("hash_64_generic(%#llx, %d) = %#x > %#x", + h64, k, h1, m); + return false; + } +#endif +#endif + } + + (void)h2; /* Suppress unused variable warning */ + return true; +} + +#define SIZE 256 /* Run time is cubic in SIZE */ + +static int __init +test_hash_init(void) +{ + char buf[SIZE+1]; + u32 string_or = 0, hash_or[2][33] = { 0 }; + unsigned tests = 0; + unsigned long long h64 = 0; + int i, j; + + fill_buf(buf, SIZE, 1); + + /* Test every possible non-empty substring in the buffer.
*/ + for (j = SIZE; j > 0; --j) { + buf[j] = '\0'; + + for (i = 0; i <= j; i++) { + u64 hashlen = hashlen_string(buf+i); + u32 h0 = full_name_hash(buf+i, j-i); + + /* Check that hashlen_string gets the length right */ + if (hashlen_len(hashlen) != j-i) { + pr_err("hashlen_string(%d..%d) returned length" + " %u, expected %d", + i, j, hashlen_len(hashlen), j-i); + return -EINVAL; + } + /* Check that the hashes match */ + if (hashlen_hash(hashlen) != h0) { + pr_err("hashlen_string(%d..%d) = %08x != " + "full_name_hash() = %08x", + i, j, hashlen_hash(hashlen), h0); + return -EINVAL; + } + + string_or |= h0; + h64 = h64 << 32 | h0; /* For use with hash_64 */ + if (!test_int_hash(h64, hash_or)) + return -EINVAL; + tests++; + } /* i */ + } /* j */ + + /* The OR of all the hash values should cover all the bits */ + if (~string_or) { + pr_err("OR of all string hash results = %#x != %#x", + string_or, -1u); + return -EINVAL; + } + if (~hash_or[0][0]) { + pr_err("OR of all __hash_32 results = %#x != %#x", + hash_or[0][0], -1u); + return -EINVAL; + } +#ifdef HAVE_ARCH__HASH_32 +#if HAVE_ARCH__HASH_32 != 1 /* Test is pointless if results match */ + if (~hash_or[1][0]) { + pr_err("OR of all __hash_32_generic results = %#x != %#x", + hash_or[1][0], -1u); + return -EINVAL; + } +#endif +#endif + + /* Likewise for all the i-bit hash values */ + for (i = 1; i <= 32; i++) { + u32 const m = ((u32)2 << (i-1)) - 1; /* Low i bits set */ + + if (hash_or[0][i] != m) { + pr_err("OR of all hash_32(%d) results = %#x " + "(%#x expected)", i, hash_or[0][i], m); + return -EINVAL; + } + if (hash_or[1][i] != m) { + pr_err("OR of all hash_64(%d) results = %#x " + "(%#x expected)", i, hash_or[1][i], m); + return -EINVAL; + } + } + + /* Issue notices about skipped tests. */ +#ifndef HAVE_ARCH__HASH_32 + pr_info("__hash_32() has no arch implementation to test."); +#elif HAVE_ARCH__HASH_32 != 1 + pr_info("__hash_32() is arch-specific; not compared to generic."); +#endif +#ifndef HAVE_ARCH_HASH_32 + pr_info("hash_32() has no arch implementation to test."); +#elif HAVE_ARCH_HASH_32 != 1 + pr_info("hash_32() is arch-specific; not compared to generic."); +#endif +#ifndef HAVE_ARCH_HASH_64 + pr_info("hash_64() has no arch implementation to test."); +#elif HAVE_ARCH_HASH_64 != 1 + pr_info("hash_64() is arch-specific; not compared to generic."); +#endif + + pr_notice("%u tests passed.", tests); + + return 0; +} + +static void __exit test_hash_exit(void) +{ +} + +module_init(test_hash_init); /* Does everything */ +module_exit(test_hash_exit); /* Does nothing */ + +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 65925b65ed98ffdb277cf5ea1af45731dac0b30b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 21 May 2016 20:43:16 +0200 Subject: iio: st_sensors: switch to a threaded interrupt commit 98ad8b41f58dff6b30713d7f09ae3834b8df7ded ("iio: st_sensors: verify interrupt event to status") caused a regression when reading ST sensors from a HRTimer trigger rather than the intrinsic interrupts: the HRTimer may trigger faster than the sensor provides new values, and as the check against new values available as a cause of the interrupt trigger was done in the poll function, this would bail out of the HRTimer interrupt with IRQ_NONE. So clearly we need to only check the new values available from the proper interrupt handler and not from the poll function, which should rather just read the raw values from the registers, put them into the buffer and be happy. To achieve this: switch the ST Sensors over to using a true threaded interrupt handler. 
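Before the patch itself, a stripped-down sketch of the threaded-IRQ shape being adopted may help; the my_* names below are placeholders for this illustration, not the driver's actual symbols.

#include <linux/interrupt.h>
#include <linux/iio/iio.h>
#include <linux/iio/trigger.h>

struct my_state {
	struct iio_trigger *trig;
	s64 hw_timestamp;
};

/* Device-specific check, e.g. reading a DRDY status register over
 * I2C/SPI; left undefined here, it is a placeholder for this sketch. */
static bool my_irq_is_ours(struct my_state *st);

/* Top half: hard-IRQ context, so only do the time-critical work
 * (capture a timestamp) and defer everything else to the thread. */
static irqreturn_t my_irq_top_half(int irq, void *p)
{
	struct my_state *st = p;

	st->hw_timestamp = iio_get_time_ns();
	return IRQ_WAKE_THREAD;
}

/* Thread: allowed to sleep, so the slow bus access needed to verify
 * the interrupt source is legal here; returning IRQ_NONE lets the
 * next device sharing the line have a look. */
static irqreturn_t my_irq_thread(int irq, void *p)
{
	struct my_state *st = p;

	if (!my_irq_is_ours(st))
		return IRQ_NONE;

	iio_trigger_poll_chained(st->trig);
	return IRQ_HANDLED;
}

/* IRQF_ONESHOT keeps the line masked until the thread completes,
 * just as the patch below requests. */
static int my_setup_irq(struct my_state *st, int irq)
{
	return request_threaded_irq(irq, my_irq_top_half, my_irq_thread,
				    IRQF_ONESHOT, "my-trigger", st);
}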
In the interrupt thread, check if new values are available, else yield to the (potential) next device on the same interrupt line to check the registers. If the interrupt was ours, proceed to poll the values. Instead of relying on iio_trigger_generic_data_rdy_poll() as a top half to wake up the thread that polls the sensor for new data, have the thread call iio_trigger_poll_chained() after determining that it is the proper source of the interrupt. This is modelled on drivers/iio/accel/mma8452.c which is already using a properly threaded interrupt handler. In order to get the same precision in timestamps as previously, where samples would be timestamped in the poll function pf->timestamp when calling iio_trigger_generic_data_rdy_poll(), we introduce a local timestamp in the sensor data, set it in the top half (fastpath) of the interrupt handler and provide that to the core when calling iio_push_to_buffers_with_timestamp(). Additionally: if the active scanmask is not set for the sensor, no IRQs should be enabled and we need to bail out with IRQ_NONE. This can happen if spurious IRQs fire when installing the threaded interrupt handler. Tested with hard interrupt triggers on LIS331DL, then also tested with hrtimers on the same sensor by creating a 75Hz HRTimer and using it to poll the sensor. Signed-off-by: Linus Walleij Cc: Giuseppe Barba Cc: Denis Ciocca Reported-by: Crestez Dan Leonard Tested-by: Crestez Dan Leonard Tested-by: Jonathan Cameron Fixes: 97865fe41322 ("iio: st_sensors: verify interrupt event to status") Signed-off-by: Jonathan Cameron --- drivers/iio/accel/st_accel_buffer.c | 2 +- drivers/iio/accel/st_accel_core.c | 1 + drivers/iio/common/st_sensors/st_sensors_buffer.c | 25 ++---- drivers/iio/common/st_sensors/st_sensors_core.c | 3 + drivers/iio/common/st_sensors/st_sensors_trigger.c | 88 +++++++++++++++++++++- drivers/iio/gyro/st_gyro_buffer.c | 2 +- drivers/iio/gyro/st_gyro_core.c | 1 + drivers/iio/magnetometer/st_magn_buffer.c | 2 +- drivers/iio/magnetometer/st_magn_core.c | 1 + drivers/iio/pressure/st_pressure_buffer.c | 2 +- drivers/iio/pressure/st_pressure_core.c | 1 + include/linux/iio/common/st_sensors.h | 9 ++- 12 files changed, 111 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/accel/st_accel_buffer.c b/drivers/iio/accel/st_accel_buffer.c index a1e642ee13d6..7fddc137e91e 100644 --- a/drivers/iio/accel/st_accel_buffer.c +++ b/drivers/iio/accel/st_accel_buffer.c @@ -91,7 +91,7 @@ static const struct iio_buffer_setup_ops st_accel_buffer_setup_ops = { int st_accel_allocate_ring(struct iio_dev *indio_dev) { - return iio_triggered_buffer_setup(indio_dev, &iio_pollfunc_store_time, + return iio_triggered_buffer_setup(indio_dev, NULL, &st_sensors_trigger_handler, &st_accel_buffer_setup_ops); } diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c index dc73f2d85e6d..4d95bfc4786c 100644 --- a/drivers/iio/accel/st_accel_core.c +++ b/drivers/iio/accel/st_accel_core.c @@ -741,6 +741,7 @@ static const struct iio_info accel_info = { static const struct iio_trigger_ops st_accel_trigger_ops = { .owner = THIS_MODULE, .set_trigger_state = ST_ACCEL_TRIGGER_SET_STATE, + .validate_device = st_sensors_validate_device, }; #define ST_ACCEL_TRIGGER_OPS (&st_accel_trigger_ops) #else diff --git a/drivers/iio/common/st_sensors/st_sensors_buffer.c b/drivers/iio/common/st_sensors/st_sensors_buffer.c index c55898543a47..f1693dbebb8a 100644 --- a/drivers/iio/common/st_sensors/st_sensors_buffer.c +++ 
b/drivers/iio/common/st_sensors/st_sensors_buffer.c @@ -57,31 +57,20 @@ irqreturn_t st_sensors_trigger_handler(int irq, void *p) struct iio_poll_func *pf = p; struct iio_dev *indio_dev = pf->indio_dev; struct st_sensor_data *sdata = iio_priv(indio_dev); + s64 timestamp; - /* If we have a status register, check if this IRQ came from us */ - if (sdata->sensor_settings->drdy_irq.addr_stat_drdy) { - u8 status; - - len = sdata->tf->read_byte(&sdata->tb, sdata->dev, - sdata->sensor_settings->drdy_irq.addr_stat_drdy, - &status); - if (len < 0) - dev_err(sdata->dev, "could not read channel status\n"); - - /* - * If this was not caused by any channels on this sensor, - * return IRQ_NONE - */ - if (!(status & (u8)indio_dev->active_scan_mask[0])) - return IRQ_NONE; - } + /* If we do timestamping here, do it before reading the values */ + if (sdata->hw_irq_trigger) + timestamp = sdata->hw_timestamp; + else + timestamp = iio_get_time_ns(); len = st_sensors_get_buffer_element(indio_dev, sdata->buffer_data); if (len < 0) goto st_sensors_get_buffer_element_error; iio_push_to_buffers_with_timestamp(indio_dev, sdata->buffer_data, - pf->timestamp); + timestamp); st_sensors_get_buffer_element_error: iio_trigger_notify_done(indio_dev->trig); diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index dffe00692169..928ee68fcc5f 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -424,6 +424,9 @@ int st_sensors_set_dataready_irq(struct iio_dev *indio_dev, bool enable) else drdy_mask = sdata->sensor_settings->drdy_irq.mask_int2; + /* Flag to the poll function that the hardware trigger is in use */ + sdata->hw_irq_trigger = enable; + /* Enable/Disable the interrupt generator for data ready. 
*/ err = st_sensors_write_data_with_mask(indio_dev, sdata->sensor_settings->drdy_irq.addr, diff --git a/drivers/iio/common/st_sensors/st_sensors_trigger.c b/drivers/iio/common/st_sensors/st_sensors_trigger.c index da72279fcf99..1f59bcc0f143 100644 --- a/drivers/iio/common/st_sensors/st_sensors_trigger.c +++ b/drivers/iio/common/st_sensors/st_sensors_trigger.c @@ -17,6 +17,73 @@ #include <linux/iio/common/st_sensors.h> #include "st_sensors_core.h" +/** + * st_sensors_irq_handler() - top half of the IRQ-based triggers + * @irq: irq number + * @p: private handler data + */ +irqreturn_t st_sensors_irq_handler(int irq, void *p) +{ + struct iio_trigger *trig = p; + struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); + struct st_sensor_data *sdata = iio_priv(indio_dev); + + /* Get the time stamp as close in time as possible */ + sdata->hw_timestamp = iio_get_time_ns(); + return IRQ_WAKE_THREAD; +} + +/** + * st_sensors_irq_thread() - bottom half of the IRQ-based triggers + * @irq: irq number + * @p: private handler data + */ +irqreturn_t st_sensors_irq_thread(int irq, void *p) +{ + struct iio_trigger *trig = p; + struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); + struct st_sensor_data *sdata = iio_priv(indio_dev); + int ret; + + /* + * If this trigger is backed by a hardware interrupt and we have a + * status register, check if this IRQ came from us + */ + if (sdata->sensor_settings->drdy_irq.addr_stat_drdy) { + u8 status; + + ret = sdata->tf->read_byte(&sdata->tb, sdata->dev, + sdata->sensor_settings->drdy_irq.addr_stat_drdy, + &status); + if (ret < 0) { + dev_err(sdata->dev, "could not read channel status\n"); + goto out_poll; + } + /* + * the lower bits of .active_scan_mask[0] are directly mapped + * to the channels on the sensor: either bit 0 for + * one-dimensional sensors, or e.g. x,y,z for accelerometers, + * gyroscopes or magnetometers. No sensor uses more than 3 + * channels, so cut the other status bits here. 
+ */ + status &= 0x07; + + /* + * If this was not caused by any channels on this sensor, + * return IRQ_NONE + */ + if (!indio_dev->active_scan_mask) + return IRQ_NONE; + if (!(status & (u8)indio_dev->active_scan_mask[0])) + return IRQ_NONE; + } + +out_poll: + /* It's our IRQ: proceed to handle the register polling */ + iio_trigger_poll_chained(p); + return IRQ_HANDLED; +} + int st_sensors_allocate_trigger(struct iio_dev *indio_dev, const struct iio_trigger_ops *trigger_ops) { @@ -77,9 +144,12 @@ int st_sensors_allocate_trigger(struct iio_dev *indio_dev, sdata->sensor_settings->drdy_irq.addr_stat_drdy) irq_trig |= IRQF_SHARED; - err = request_threaded_irq(irq, - iio_trigger_generic_data_rdy_poll, - NULL, + /* Let's create an interrupt thread masking the hard IRQ here */ + irq_trig |= IRQF_ONESHOT; + + err = request_threaded_irq(sdata->get_irq_data_ready(indio_dev), + st_sensors_irq_handler, + st_sensors_irq_thread, irq_trig, sdata->trig->name, sdata->trig); @@ -119,6 +189,18 @@ void st_sensors_deallocate_trigger(struct iio_dev *indio_dev) } EXPORT_SYMBOL(st_sensors_deallocate_trigger); +int st_sensors_validate_device(struct iio_trigger *trig, + struct iio_dev *indio_dev) +{ + struct iio_dev *indio = iio_trigger_get_drvdata(trig); + + if (indio != indio_dev) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(st_sensors_validate_device); + MODULE_AUTHOR("Denis Ciocca "); MODULE_DESCRIPTION("STMicroelectronics ST-sensors trigger"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/iio/gyro/st_gyro_buffer.c b/drivers/iio/gyro/st_gyro_buffer.c index d67b17b6a7aa..a5377044e42f 100644 --- a/drivers/iio/gyro/st_gyro_buffer.c +++ b/drivers/iio/gyro/st_gyro_buffer.c @@ -91,7 +91,7 @@ static const struct iio_buffer_setup_ops st_gyro_buffer_setup_ops = { int st_gyro_allocate_ring(struct iio_dev *indio_dev) { - return iio_triggered_buffer_setup(indio_dev, &iio_pollfunc_store_time, + return iio_triggered_buffer_setup(indio_dev, NULL, &st_sensors_trigger_handler, &st_gyro_buffer_setup_ops); } diff --git a/drivers/iio/gyro/st_gyro_core.c b/drivers/iio/gyro/st_gyro_core.c index 52a3c87c375c..a8012955a1f6 100644 --- a/drivers/iio/gyro/st_gyro_core.c +++ b/drivers/iio/gyro/st_gyro_core.c @@ -409,6 +409,7 @@ static const struct iio_info gyro_info = { static const struct iio_trigger_ops st_gyro_trigger_ops = { .owner = THIS_MODULE, .set_trigger_state = ST_GYRO_TRIGGER_SET_STATE, + .validate_device = st_sensors_validate_device, }; #define ST_GYRO_TRIGGER_OPS (&st_gyro_trigger_ops) #else diff --git a/drivers/iio/magnetometer/st_magn_buffer.c b/drivers/iio/magnetometer/st_magn_buffer.c index ecd3bd0a9769..0a9e8fadfa9d 100644 --- a/drivers/iio/magnetometer/st_magn_buffer.c +++ b/drivers/iio/magnetometer/st_magn_buffer.c @@ -82,7 +82,7 @@ static const struct iio_buffer_setup_ops st_magn_buffer_setup_ops = { int st_magn_allocate_ring(struct iio_dev *indio_dev) { - return iio_triggered_buffer_setup(indio_dev, &iio_pollfunc_store_time, + return iio_triggered_buffer_setup(indio_dev, NULL, &st_sensors_trigger_handler, &st_magn_buffer_setup_ops); } diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c index 62036d2a9956..8250fc322c56 100644 --- a/drivers/iio/magnetometer/st_magn_core.c +++ b/drivers/iio/magnetometer/st_magn_core.c @@ -572,6 +572,7 @@ static const struct iio_info magn_info = { static const struct iio_trigger_ops st_magn_trigger_ops = { .owner = THIS_MODULE, .set_trigger_state = ST_MAGN_TRIGGER_SET_STATE, + .validate_device = st_sensors_validate_device, }; #define 
ST_MAGN_TRIGGER_OPS (&st_magn_trigger_ops) #else diff --git a/drivers/iio/pressure/st_pressure_buffer.c b/drivers/iio/pressure/st_pressure_buffer.c index 2ff53f222352..99468d0a64e7 100644 --- a/drivers/iio/pressure/st_pressure_buffer.c +++ b/drivers/iio/pressure/st_pressure_buffer.c @@ -82,7 +82,7 @@ static const struct iio_buffer_setup_ops st_press_buffer_setup_ops = { int st_press_allocate_ring(struct iio_dev *indio_dev) { - return iio_triggered_buffer_setup(indio_dev, &iio_pollfunc_store_time, + return iio_triggered_buffer_setup(indio_dev, NULL, &st_sensors_trigger_handler, &st_press_buffer_setup_ops); } diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c index 257b58ac6779..92a118c3c4ac 100644 --- a/drivers/iio/pressure/st_pressure_core.c +++ b/drivers/iio/pressure/st_pressure_core.c @@ -445,6 +445,7 @@ static const struct iio_info press_info = { static const struct iio_trigger_ops st_press_trigger_ops = { .owner = THIS_MODULE, .set_trigger_state = ST_PRESS_TRIGGER_SET_STATE, + .validate_device = st_sensors_validate_device, }; #define ST_PRESS_TRIGGER_OPS (&st_press_trigger_ops) #else diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index d029ffac0d69..99403b19092f 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -223,6 +223,8 @@ struct st_sensor_settings { * @get_irq_data_ready: Function to get the IRQ used for data ready signal. * @tf: Transfer function structure used by I/O operations. * @tb: Transfer buffers and mutex used by I/O operations. + * @hw_irq_trigger: if we're using the hardware interrupt on the sensor. + * @hw_timestamp: Latest timestamp from the interrupt handler, when in use. */ struct st_sensor_data { struct device *dev; @@ -247,6 +249,9 @@ struct st_sensor_data { const struct st_sensor_transfer_function *tf; struct st_sensor_transfer_buffer tb; + + bool hw_irq_trigger; + s64 hw_timestamp; }; #ifdef CONFIG_IIO_BUFFER @@ -260,7 +265,8 @@ int st_sensors_allocate_trigger(struct iio_dev *indio_dev, const struct iio_trigger_ops *trigger_ops); void st_sensors_deallocate_trigger(struct iio_dev *indio_dev); - +int st_sensors_validate_device(struct iio_trigger *trig, + struct iio_dev *indio_dev); #else static inline int st_sensors_allocate_trigger(struct iio_dev *indio_dev, const struct iio_trigger_ops *trigger_ops) @@ -271,6 +277,7 @@ static inline void st_sensors_deallocate_trigger(struct iio_dev *indio_dev) { return; } +#define st_sensors_validate_device NULL #endif int st_sensors_init_sensor(struct iio_dev *indio_dev, -- cgit v1.2.3 From b7ec35b304b64af2830027350cc99d31e6e537c2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: libceph: change ceph_osdmap_flag() to take osdc For the benefit of every single caller, take osdc instead of map. Also, now that osdc->osdmap can't ever be NULL, drop the check. 
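In caller terms the conversion is purely mechanical; a schematic before/after pair (the -ENOSPC return is just for illustration):

/* before: every caller dereferenced the map, and the helper had to
 * tolerate a NULL osdc->osdmap */
if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
	return -ENOSPC;

/* after: callers hand over the osd_client itself, and the helper is
 * a plain flag test because osdc->osdmap is never NULL any more */
if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL))
	return -ENOSPC;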
Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 8 ++++---- include/linux/ceph/osd_client.h | 5 +++++ include/linux/ceph/osdmap.h | 5 ----- net/ceph/osd_client.c | 39 +++++++++++++++++++-------------------- 4 files changed, 28 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a888df6f2d71..8eeb9f579db5 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1349,7 +1349,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } retry_snap: - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; goto out; } @@ -1440,7 +1440,7 @@ retry_snap: ceph_put_cap_refs(ci, got); if (written >= 0) { - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL)) + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) iocb->ki_flags |= IOCB_DSYNC; written = generic_write_sync(iocb, written); @@ -1672,8 +1672,8 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && - !(mode & FALLOC_FL_PUNCH_HOLE)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { ret = -ENOSPC; goto unlock; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 19b14862d3e0..1b3b6e155392 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -279,6 +279,11 @@ struct ceph_osd_client { struct workqueue_struct *notify_wq; }; +static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag) +{ + return osdc->osdmap->flags & flag; +} + extern int ceph_osdc_setup(void); extern void ceph_osdc_cleanup(void); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index ddc426b22d81..9ccf4dbe55f8 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -189,11 +189,6 @@ static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd) return !ceph_osd_is_up(map, osd); } -static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) -{ - return map && (map->flags & flag); -} - extern char *ceph_osdmap_state_str(char *str, int len, int state); extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0160d7d09a1e..79c3bad87e62 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1276,9 +1276,9 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, const struct ceph_osd_request_target *t, struct ceph_pg_pool_info *pi) { - bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || __pool_full(pi); WARN_ON(pi->id != t->base_oloc.pool); @@ -1303,8 +1303,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, bool force_resend = false; bool need_check_tiering = false; bool need_resend = false; - bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap, - CEPH_OSDMAP_SORTBITWISE); + bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); enum calc_target_result ct_res; int ret; @@ -1590,9 +1589,9 @@ static void maybe_request_map(struct ceph_osd_client *osdc) verify_osdc_locked(osdc); WARN_ON(!osdc->osdmap->epoch); - if (ceph_osdmap_flag(osdc->osdmap, 
CEPH_OSDMAP_FULL) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("%s osdc %p continuous\n", __func__, osdc); continuous = true; } else { @@ -1629,19 +1628,19 @@ again: } if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("req %p pausewr\n", req); req->r_t.paused = true; maybe_request_map(osdc); } else if ((req->r_flags & CEPH_OSD_FLAG_READ) && - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { dout("req %p pauserd\n", req); req->r_t.paused = true; maybe_request_map(osdc); } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY | CEPH_OSD_FLAG_FULL_FORCE)) && - (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { dout("req %p full/pool_full\n", req); pr_warn_ratelimited("FULL or reached pool quota\n"); @@ -2280,7 +2279,7 @@ static void send_linger_ping(struct ceph_osd_linger_request *lreq) struct ceph_osd_request *req = lreq->ping_req; struct ceph_osd_req_op *op = &req->r_ops[0]; - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { dout("%s PAUSERD\n", __func__); return; } @@ -3050,7 +3049,7 @@ static int handle_one_map(struct ceph_osd_client *osdc, bool skipped_map = false; bool was_full; - was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL); set_pool_was_full(osdc); if (incremental) @@ -3088,7 +3087,7 @@ static int handle_one_map(struct ceph_osd_client *osdc, osdc->osdmap = newmap; } - was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL); scan_requests(&osdc->homeless_osd, skipped_map, was_full, true, need_resend, need_resend_linger); @@ -3174,9 +3173,9 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) if (ceph_check_fsid(osdc->client, &fsid) < 0) goto bad; - was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); /* incremental maps */ @@ -3238,9 +3237,9 @@ done: * we find out when we are no longer full and stop returning * ENOSPC. 
*/ - pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); if (was_pauserd || was_pausewr || pauserd || pausewr) maybe_request_map(osdc); -- cgit v1.2.3 From f1bddbb3de60872acc2446eee97dbeb0a6d57acb Mon Sep 17 00:00:00 2001 From: Krzysztof Opasiak Date: Thu, 5 May 2016 10:46:05 +0200 Subject: usb: gadget: Fix binding to UDC via configfs interface By default, the user could store only a valid UDC name in the configfs UDC attr by doing: echo $UDC_NAME > UDC Commit (855ed04 "usb: gadget: udc-core: independent registration of gadgets and gadget drivers") broke this behavior and allowed any arbitrary string to be stored in the UDC file, with udc core then waiting for such a controller to appear. echo "any arbitrary string here" > UDC This commit fixes this by adding a flag which prevents the configfs gadget from being added to the list of pending drivers if a UDC with the given name has not been found. Signed-off-by: Krzysztof Opasiak Signed-off-by: Felipe Balbi --- drivers/usb/gadget/configfs.c | 1 + drivers/usb/gadget/udc/udc-core.c | 12 ++++++++---- include/linux/usb/gadget.h | 3 +++ 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index b6f60ca8a035..70cf3477f951 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1401,6 +1401,7 @@ static const struct usb_gadget_driver configfs_driver_template = { .owner = THIS_MODULE, .name = "configfs-gadget", }, + .match_existing_only = 1, }; static struct config_group *gadgets_make( diff --git a/drivers/usb/gadget/udc/udc-core.c b/drivers/usb/gadget/udc/udc-core.c index 6e8300d6a737..e1b2dcebdc2e 100644 --- a/drivers/usb/gadget/udc/udc-core.c +++ b/drivers/usb/gadget/udc/udc-core.c @@ -603,11 +603,15 @@ int usb_gadget_probe_driver(struct usb_gadget_driver *driver) } } - list_add_tail(&driver->pending, &gadget_driver_pending_list); - pr_info("udc-core: couldn't find an available UDC - added [%s] to list of pending drivers\n", - driver->function); + if (!driver->match_existing_only) { + list_add_tail(&driver->pending, &gadget_driver_pending_list); + pr_info("udc-core: couldn't find an available UDC - added [%s] to list of pending drivers\n", + driver->function); + ret = 0; + } + mutex_unlock(&udc_lock); - return 0; + return ret; found: ret = udc_bind_to_driver(udc, driver); mutex_unlock(&udc_lock); diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 457651bf45b0..fefe8b06a63d 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -1034,6 +1034,8 @@ static inline int usb_gadget_activate(struct usb_gadget *gadget) * @udc_name: A name of UDC this driver should be bound to. If udc_name is NULL, * this driver will be bound to any available UDC. * @pending: UDC core private data used for deferred probe of this driver. 
+ * @match_existing_only: If udc is not found, return an error and don't add this + * gadget driver to the list of pending drivers * * Devices are disabled till a gadget driver successfully bind()s, which * means the driver will handle setup() requests needed to enumerate (and @@ -1097,6 +1099,7 @@ struct usb_gadget_driver { char *udc_name; struct list_head pending; + unsigned match_existing_only:1; }; -- cgit v1.2.3 From e2082e3ab801b989d8d5337b2ecbfc61d09781cb Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 31 Mar 2016 16:26:50 -0400 Subject: dma-buf: headerdoc fixes Apparently nobody noticed that dma-buf.h wasn't actually pulled into the docbook build. And as a result the headerdoc comments bitrotted a bit. Add missing params/fields. Signed-off-by: Rob Clark Signed-off-by: Sumit Semwal --- include/linux/dma-buf.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 3fe90d494edb..4551c6f2a6c4 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -112,19 +112,24 @@ struct dma_buf_ops { * @file: file pointer used for sharing buffers across, and for refcounting. * @attachments: list of dma_buf_attachment that denotes all devices attached. * @ops: dma_buf_ops associated with this buffer object. + * @lock: used internally to serialize list manipulation, attach/detach and vmap/unmap + * @vmapping_counter: used internally to refcnt the vmaps + * @vmap_ptr: the current vmap ptr if vmapping_counter > 0 * @exp_name: name of the exporter; useful for debugging. * @owner: pointer to exporter module; used for refcounting when exporter is a * kernel module. * @list_node: node for dma_buf accounting and debugging. * @priv: exporter specific private data for this buffer object. * @resv: reservation object linked to this dma-buf + * @poll: for userspace poll support + * @cb_excl: for userspace poll support + * @cb_shared: for userspace poll support */ struct dma_buf { size_t size; struct file *file; struct list_head attachments; const struct dma_buf_ops *ops; - /* mutex to serialize list manipulation, attach/detach and vmap/unmap */ struct mutex lock; unsigned vmapping_counter; void *vmap_ptr; @@ -188,9 +193,11 @@ struct dma_buf_export_info { /** * helper macro for exporters; zeros and fills in most common values + * + * @name: export-info name */ -#define DEFINE_DMA_BUF_EXPORT_INFO(a) \ - struct dma_buf_export_info a = { .exp_name = KBUILD_MODNAME, \ +#define DEFINE_DMA_BUF_EXPORT_INFO(name) \ + struct dma_buf_export_info name = { .exp_name = KBUILD_MODNAME, \ .owner = THIS_MODULE } /** -- cgit v1.2.3 From dad6c3945fd25384c2b92306a90ba033e1130428 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 31 Mar 2016 16:26:51 -0400 Subject: reservation: add headerdoc comments Signed-off-by: Rob Clark Signed-off-by: Sumit Semwal --- drivers/dma-buf/reservation.c | 72 ++++++++++++++++++++++++++++++++++++++++--- include/linux/reservation.h | 53 +++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c index c0bd5722c997..9566a62ad8e3 100644 --- a/drivers/dma-buf/reservation.c +++ b/drivers/dma-buf/reservation.c @@ -35,6 +35,17 @@ #include <linux/reservation.h> #include <linux/export.h> +/** + * DOC: Reservation Object Overview + * + * The reservation object provides a mechanism to manage shared and + * exclusive fences associated with a buffer. 
A reservation object + * can have attached one exclusive fence (normally associated with + * write operations) or N shared fences (read operations). The RCU + * mechanism is used to protect read access to fences from locked + * write-side updates. + */ + DEFINE_WW_CLASS(reservation_ww_class); EXPORT_SYMBOL(reservation_ww_class); @@ -43,9 +54,17 @@ EXPORT_SYMBOL(reservation_seqcount_class); const char reservation_seqcount_string[] = "reservation_seqcount"; EXPORT_SYMBOL(reservation_seqcount_string); -/* - * Reserve space to add a shared fence to a reservation_object, - * must be called with obj->lock held. + +/** + * reservation_object_reserve_shared - Reserve space to add a shared + * fence to a reservation_object. + * @obj: reservation object + * + * Should be called before reservation_object_add_shared_fence(). Must + * be called with obj->lock held. + * + * RETURNS + * Zero for success, or -errno */ int reservation_object_reserve_shared(struct reservation_object *obj) { @@ -180,7 +199,11 @@ done: fence_put(old_fence); } -/* +/** + * reservation_object_add_shared_fence - Add a fence to a shared slot + * @obj: the reservation object + * @fence: the shared fence to add + * * Add a fence to a shared slot, obj->lock must be held, and * reservation_object_reserve_shared() has been called. */ @@ -200,6 +223,13 @@ void reservation_object_add_shared_fence(struct reservation_object *obj, } EXPORT_SYMBOL(reservation_object_add_shared_fence); +/** + * reservation_object_add_excl_fence - Add an exclusive fence. + * @obj: the reservation object + * @fence: the exclusive fence to add + * + * Add a fence to the exclusive slot. The obj->lock must be held. + */ void reservation_object_add_excl_fence(struct reservation_object *obj, struct fence *fence) { @@ -233,6 +263,18 @@ void reservation_object_add_excl_fence(struct reservation_object *obj, } EXPORT_SYMBOL(reservation_object_add_excl_fence); +/** + * reservation_object_get_fences_rcu - Get an object's shared and exclusive + * fences without update side lock held + * @obj: the reservation object + * @pfence_excl: the returned exclusive fence (or NULL) + * @pshared_count: the number of shared fences returned + * @pshared: the array of shared fence ptrs returned (array is krealloc'd to + * the required size, and must be freed by caller) + * + * RETURNS + * Zero or -errno + */ int reservation_object_get_fences_rcu(struct reservation_object *obj, struct fence **pfence_excl, unsigned *pshared_count, @@ -319,6 +361,18 @@ unlock: } EXPORT_SYMBOL_GPL(reservation_object_get_fences_rcu); +/** + * reservation_object_wait_timeout_rcu - Wait on a reservation object's + * shared and/or exclusive fences. + * @obj: the reservation object + * @wait_all: if true, wait on all fences, else wait on just exclusive fence + * @intr: if true, do interruptible wait + * @timeout: timeout value in jiffies or zero to return immediately + * + * RETURNS + * Returns -ERESTARTSYS if interrupted, 0 if the wait timed out, or + * greater than zero on success. + */ long reservation_object_wait_timeout_rcu(struct reservation_object *obj, bool wait_all, bool intr, unsigned long timeout) @@ -416,6 +470,16 @@ reservation_object_test_signaled_single(struct fence *passed_fence) return ret; } +/** + * reservation_object_test_signaled_rcu - Test if a reservation object's + * fences have been signaled. 
+ * @obj: the reservation object + * @test_all: if true, test all fences, otherwise only test the exclusive + * fence + * + * RETURNS + * true if all fences signaled, else false + */ bool reservation_object_test_signaled_rcu(struct reservation_object *obj, bool test_all) { diff --git a/include/linux/reservation.h b/include/linux/reservation.h index 49d057655d62..b0f305e77b7f 100644 --- a/include/linux/reservation.h +++ b/include/linux/reservation.h @@ -49,12 +49,27 @@ extern struct ww_class reservation_ww_class; extern struct lock_class_key reservation_seqcount_class; extern const char reservation_seqcount_string[]; +/** + * struct reservation_object_list - a list of shared fences + * @rcu: for internal use + * @shared_count: number of fences in the shared table + * @shared_max: for growing shared fence table + * @shared: shared fence table + */ struct reservation_object_list { struct rcu_head rcu; u32 shared_count, shared_max; struct fence __rcu *shared[]; }; +/** + * struct reservation_object - a reservation object manages fences for a buffer + * @lock: update side lock + * @seq: sequence count for managing RCU read-side synchronization + * @fence_excl: the exclusive fence, if there is one currently + * @fence: list of current shared fences + * @staged: staged copy of shared fences for RCU updates + */ struct reservation_object { struct ww_mutex lock; seqcount_t seq; @@ -68,6 +83,10 @@ struct reservation_object { #define reservation_object_assert_held(obj) \ lockdep_assert_held(&(obj)->lock.base) +/** + * reservation_object_init - initialize a reservation object + * @obj: the reservation object + */ static inline void reservation_object_init(struct reservation_object *obj) { @@ -79,6 +98,10 @@ reservation_object_init(struct reservation_object *obj) obj->staged = NULL; } +/** + * reservation_object_fini - destroys a reservation object + * @obj: the reservation object + */ static inline void reservation_object_fini(struct reservation_object *obj) { @@ -106,6 +129,14 @@ reservation_object_fini(struct reservation_object *obj) ww_mutex_destroy(&obj->lock); } +/** + * reservation_object_get_list - get the reservation object's + * shared fence list, with update-side lock held + * @obj: the reservation object + * + * Returns the shared fence list. Does NOT take references to + * the fence. The obj->lock must be held. + */ static inline struct reservation_object_list * reservation_object_get_list(struct reservation_object *obj) { @@ -113,6 +144,17 @@ reservation_object_get_list(struct reservation_object *obj) reservation_object_held(obj)); } +/** + * reservation_object_get_excl - get the reservation object's + * exclusive fence, with update-side lock held + * @obj: the reservation object + * + * Returns the exclusive fence (if any). Does NOT take a + * reference. The obj->lock must be held. + * + * RETURNS + * The exclusive fence or NULL + */ static inline struct fence * reservation_object_get_excl(struct reservation_object *obj) { @@ -120,6 +162,17 @@ reservation_object_get_excl(struct reservation_object *obj) reservation_object_held(obj)); } +/** + * reservation_object_get_excl_rcu - get the reservation object's + * exclusive fence, without lock held. + * @obj: the reservation object + * + * If there is an exclusive fence, this atomically increments its + * reference count and returns it. 
+ * + * RETURNS + * The exclusive fence or NULL if none + */ static inline struct fence * reservation_object_get_excl_rcu(struct reservation_object *obj) { -- cgit v1.2.3 From 4320c2a22df12f954edd4997f71ca3a4216312b2 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Mon, 11 Apr 2016 12:48:55 +0100 Subject: fence: add missing descriptions for fence The members child_list and active_list were added to the fence struct without descriptions for the Documentation. Adding these. Fixes: b55b54b5db33 ("staging/android: remove struct sync_pt") Signed-off-by: Luis de Bethencourt Reviewed-by: Javier Martinez Canillas Reviewed-by: Gustavo Padovan Signed-off-by: Sumit Semwal --- include/linux/fence.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fence.h b/include/linux/fence.h index 2b17698b60b8..2056e9fd0138 100644 --- a/include/linux/fence.h +++ b/include/linux/fence.h @@ -49,6 +49,8 @@ struct fence_cb; * @timestamp: Timestamp when the fence was signaled. * @status: Optional, only valid if < 0, must be set before calling * fence_signal, indicates that the fence has completed with an error. + * @child_list: list of child fences + * @active_list: list of active fences * * the flags member must be manipulated and read using the appropriate * atomic ops (bit_*), so taking the spinlock will not be needed most -- cgit v1.2.3 From 40eb90e9ccc3f96f937ea1db79d0f9cb61553ed5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 29 May 2016 17:42:13 +0800 Subject: sctp: sctp_diag should dump sctp socket type Right now we cannot distinguish whether an sk is a UDP- or TCP-style socket when we use ss to dump sctp_info; it's necessary to dump that as well. For sctp_diag, ss support is not officially available, thus there are no official users of this yet, so we can add this field in the middle of sctp_info without breaking user API. v1->v2: - move 'sctpi_s_type' field to the end of struct sctp_info, so that it won't cause incompatibility with applications already built. - add __reserved3 in sctp_info to make sure sctp_info is 8-byte aligned. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/linux/sctp.h | 2 ++ net/sctp/socket.c | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index dacb5e711994..de1f64318fc4 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -765,6 +765,8 @@ struct sctp_info { __u8 sctpi_s_disable_fragments; __u8 sctpi_s_v4mapped; __u8 sctpi_s_frag_interleave; + __u32 sctpi_s_type; + __u32 __reserved3; }; struct sctp_infox { diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 777d0324594a..67154b848aa9 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4220,6 +4220,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, info->sctpi_s_disable_fragments = sp->disable_fragments; info->sctpi_s_v4mapped = sp->v4mapped; info->sctpi_s_frag_interleave = sp->frag_interleave; + info->sctpi_s_type = sp->type; return 0; } -- cgit v1.2.3 From 480ce08a70e4179f34808a3bdbfe6627f624cf54 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 20 May 2016 18:32:31 +0800 Subject: FS-Cache: make check_consistency callback return int __fscache_check_consistency() calls the check_consistency() callback and returns the callback's return value. But the return type of check_consistency() is bool. So __fscache_check_consistency() returns 1 if the cache is inconsistent. 
This is inconsistent with the documentation. Signed-off-by: Yan, Zheng Acked-by: David Howells --- fs/cachefiles/interface.c | 2 +- include/linux/fscache-cache.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 861d611b8c05..ce5f345d70f5 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -380,7 +380,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) * check if the backing cache is updated to FS-Cache * - called by FS-Cache when evaluates if need to invalidate the cache */ -static bool cachefiles_check_consistency(struct fscache_operation *op) +static int cachefiles_check_consistency(struct fscache_operation *op) { struct cachefiles_object *object; struct cachefiles_cache *cache; diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 604e1526cd00..13ba552e6c09 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -241,7 +241,7 @@ struct fscache_cache_ops { /* check the consistency between the backing cache and the FS-Cache * cookie */ - bool (*check_consistency)(struct fscache_operation *op); + int (*check_consistency)(struct fscache_operation *op); /* store the updated auxiliary data on an object */ void (*update_object)(struct fscache_object *object); -- cgit v1.2.3 From 5a5e78cd706ce0577ed6356634a34552c866cc10 Mon Sep 17 00:00:00 2001 From: Caesar Wang Date: Wed, 18 May 2016 23:01:39 +0800 Subject: thermal: add the note for set_trip_temp Fixes commit 60f9ce3ada53 ("thermal: of-thermal: allow setting trip_temp on hardware") Signed-off-by: Caesar Wang Cc: Zhang Rui Cc: Eduardo Valentin Cc: linux-pm@vger.kernel.org Signed-off-by: Zhang Rui --- include/linux/thermal.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index e45abe7db9a6..ee517bef0db0 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -335,6 +335,8 @@ struct thermal_genl_event { * @get_trend: a pointer to a function that reads the sensor temperature trend. * @set_emul_temp: a pointer to a function that sets sensor emulated * temperature. + * @set_trip_temp: a pointer to a function that sets the trip temperature on + * hardware. */ struct thermal_zone_of_device_ops { int (*get_temp)(void *, int *); -- cgit v1.2.3 From dfc2507b26af22b0bbc85251b8545b36d8bc5d72 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 1 Jun 2016 11:53:26 -0700 Subject: time: Make settimeofday error checking work again In commit 86d3473224b0 some of the checking for a valid timeval was subtly changed which caused -EINVAL to be returned whenever the timeval was null. However, it is possible to set the timezone data while specifying a NULL timeval, which is usually done to handle systems where the RTC keeps local time instead of UTC. Thus the patch causes such systems to have the time incorrectly set. This patch addresses the issue by handling the error conditionals in the same way as was done previously.
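For illustration, a minimal userspace sketch of the NULL-timeval case this fix restores (the timezone value here is an arbitrary example, not taken from the patch):

    #include <stdio.h>
    #include <sys/time.h>

    int main(void)
    {
            struct timezone tz = {
                    .tz_minuteswest = -60,  /* example: RTC keeps UTC+1 local time */
                    .tz_dsttime = 0,
            };

            /* tv == NULL with a valid tz should update only the kernel's
             * timezone; with the regression this returned -EINVAL. */
            if (settimeofday(NULL, &tz) != 0)
                    perror("settimeofday");
            return 0;
    }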
Fixes: 86d3473224b0 "time: Introduce do_sys_settimeofday64()" Reported-by: Mika Westerberg Signed-off-by: John Stultz Tested-by: Mika Westerberg Cc: Prarit Bhargava Cc: Arnd Bergmann Cc: Baolin Wang Cc: Richard Cochran Cc: Shuah Khan Link: http://lkml.kernel.org/r/1464807207-16530-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/timekeeping.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 37dbacf84849..816b7543f81b 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -21,6 +21,9 @@ static inline int do_sys_settimeofday(const struct timespec *tv, struct timespec64 ts64; if (!tv) + return do_sys_settimeofday64(NULL, tz); + + if (!timespec_valid(tv)) return -EINVAL; ts64 = timespec_to_timespec64(*tv); -- cgit v1.2.3 From b9610e74586fd183b2d1c7fe5316bce8b6cc534f Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 1 Jun 2016 14:56:57 -0700 Subject: clk: Remove CLK_IS_ROOT flag Now that we've gotten rid of all the users of this flag we can retire the number, leaving a slot open for a future flag user. Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 0c72204c75fc..fb39d5add173 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -25,7 +25,7 @@ #define CLK_SET_PARENT_GATE BIT(1) /* must be gated across re-parent */ #define CLK_SET_RATE_PARENT BIT(2) /* propagate rate change up one level */ #define CLK_IGNORE_UNUSED BIT(3) /* do not gate even if unused */ -#define CLK_IS_ROOT BIT(4) /* Deprecated: Don't use */ + /* unused */ #define CLK_IS_BASIC BIT(5) /* Basic clk, can't do a to_clk_foo() */ #define CLK_GET_RATE_NOCACHE BIT(6) /* do not use the cached clk rate */ #define CLK_SET_RATE_NO_REPARENT BIT(7) /* don't re-parent on rate change */ -- cgit v1.2.3 From 12b7db2bf8b88938798c60416172b53225207b1f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 31 May 2016 10:05:19 -0500 Subject: usb: musb: Return error value from musb_mailbox At least on n900 we have phy-twl4030-usb only generating cable interrupts, and then have a separate USB PHY. In order for musb to know the real cable status, we need to clear any cached state until musb is ready. Otherwise the cable status interrupts will get just ignored if the status does not change from the initial state. To do this, let's add a return value to musb_mailbox(), and reset cached linkstat to MUSB_UNKNOWN on error. Sorry to cause a bit of churn here, I should have added that already last time patching musb_mailbox(). 
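As a rough sketch of the resulting calling convention (the "phy" pointer and its cached "linkstat" field are placeholder names, not the exact driver code):

    int err;

    err = musb_mailbox(status);
    if (err)                                /* musb is not ready yet */
            phy->linkstat = MUSB_UNKNOWN;   /* drop cached state, report again later */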
Signed-off-by: Tony Lindgren Signed-off-by: Bin Liu Signed-off-by: Greg Kroah-Hartman --- drivers/phy/phy-twl4030-usb.c | 14 ++++++++++---- drivers/usb/musb/musb_core.c | 7 ++++--- drivers/usb/musb/musb_core.h | 2 +- drivers/usb/musb/omap2430.c | 8 +++++--- drivers/usb/phy/phy-twl6030-usb.c | 12 +++++++++--- include/linux/usb/musb.h | 5 +++-- 6 files changed, 32 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/phy/phy-twl4030-usb.c b/drivers/phy/phy-twl4030-usb.c index 6b6af6cba454..d9b10a39a2cf 100644 --- a/drivers/phy/phy-twl4030-usb.c +++ b/drivers/phy/phy-twl4030-usb.c @@ -463,7 +463,8 @@ static int twl4030_phy_power_on(struct phy *phy) twl4030_usb_set_mode(twl, twl->usb_mode); if (twl->usb_mode == T2_USB_MODE_ULPI) twl4030_i2c_access(twl, 0); - schedule_delayed_work(&twl->id_workaround_work, 0); + twl->linkstat = MUSB_UNKNOWN; + schedule_delayed_work(&twl->id_workaround_work, HZ); return 0; } @@ -537,6 +538,7 @@ static irqreturn_t twl4030_usb_irq(int irq, void *_twl) struct twl4030_usb *twl = _twl; enum musb_vbus_id_status status; bool status_changed = false; + int err; status = twl4030_usb_linkstat(twl); @@ -567,7 +569,9 @@ static irqreturn_t twl4030_usb_irq(int irq, void *_twl) pm_runtime_mark_last_busy(twl->dev); pm_runtime_put_autosuspend(twl->dev); } - musb_mailbox(status); + err = musb_mailbox(status); + if (err) + twl->linkstat = MUSB_UNKNOWN; } /* don't schedule during sleep - irq works right then */ @@ -595,7 +599,8 @@ static int twl4030_phy_init(struct phy *phy) struct twl4030_usb *twl = phy_get_drvdata(phy); pm_runtime_get_sync(twl->dev); - schedule_delayed_work(&twl->id_workaround_work, 0); + twl->linkstat = MUSB_UNKNOWN; + schedule_delayed_work(&twl->id_workaround_work, HZ); pm_runtime_mark_last_busy(twl->dev); pm_runtime_put_autosuspend(twl->dev); @@ -763,7 +768,8 @@ static int twl4030_usb_remove(struct platform_device *pdev) if (cable_present(twl->linkstat)) pm_runtime_put_noidle(twl->dev); pm_runtime_mark_last_busy(twl->dev); - pm_runtime_put_sync_suspend(twl->dev); + pm_runtime_dont_use_autosuspend(&pdev->dev); + pm_runtime_put_sync(twl->dev); pm_runtime_disable(twl->dev); /* autogate 60MHz ULPI clock, diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c index 23888d579e8b..6469eff4fc30 100644 --- a/drivers/usb/musb/musb_core.c +++ b/drivers/usb/musb/musb_core.c @@ -1679,7 +1679,7 @@ EXPORT_SYMBOL_GPL(musb_dma_completion); #define use_dma 0 #endif -static void (*musb_phy_callback)(enum musb_vbus_id_status status); +static int (*musb_phy_callback)(enum musb_vbus_id_status status); /* * musb_mailbox - optional phy notifier function @@ -1688,11 +1688,12 @@ static void (*musb_phy_callback)(enum musb_vbus_id_status status); * Optionally gets called from the USB PHY. Note that the USB PHY must be * disabled at the point the phy_callback is registered or unregistered. 
*/ -void musb_mailbox(enum musb_vbus_id_status status) +int musb_mailbox(enum musb_vbus_id_status status) { if (musb_phy_callback) - musb_phy_callback(status); + return musb_phy_callback(status); + return -ENODEV; }; EXPORT_SYMBOL_GPL(musb_mailbox); diff --git a/drivers/usb/musb/musb_core.h b/drivers/usb/musb/musb_core.h index 29473846b098..b55a776b03eb 100644 --- a/drivers/usb/musb/musb_core.h +++ b/drivers/usb/musb/musb_core.h @@ -215,7 +215,7 @@ struct musb_platform_ops { dma_addr_t *dma_addr, u32 *len); void (*pre_root_reset_end)(struct musb *musb); void (*post_root_reset_end)(struct musb *musb); - void (*phy_callback)(enum musb_vbus_id_status status); + int (*phy_callback)(enum musb_vbus_id_status status); }; /* diff --git a/drivers/usb/musb/omap2430.c b/drivers/usb/musb/omap2430.c index d312d42592d6..2c54f52ae386 100644 --- a/drivers/usb/musb/omap2430.c +++ b/drivers/usb/musb/omap2430.c @@ -180,22 +180,24 @@ static void omap2430_set_power(struct musb *musb, bool enabled, bool cable) } } -static void omap2430_musb_mailbox(enum musb_vbus_id_status status) +static int omap2430_musb_mailbox(enum musb_vbus_id_status status) { struct omap2430_glue *glue = _glue; if (!glue) { pr_err("%s: musb core is not yet initialized\n", __func__); - return; + return -EPROBE_DEFER; } glue->status = status; if (!glue_to_musb(glue)) { pr_err("%s: musb core is not yet ready\n", __func__); - return; + return -EPROBE_DEFER; } schedule_work(&glue->omap_musb_mailbox_work); + + return 0; } static void omap_musb_set_mailbox(struct omap2430_glue *glue) diff --git a/drivers/usb/phy/phy-twl6030-usb.c b/drivers/usb/phy/phy-twl6030-usb.c index 24e2b3cf1867..c66a447c3dfe 100644 --- a/drivers/usb/phy/phy-twl6030-usb.c +++ b/drivers/usb/phy/phy-twl6030-usb.c @@ -227,12 +227,16 @@ static irqreturn_t twl6030_usb_irq(int irq, void *_twl) twl->asleep = 1; status = MUSB_VBUS_VALID; twl->linkstat = status; - musb_mailbox(status); + ret = musb_mailbox(status); + if (ret) + twl->linkstat = MUSB_UNKNOWN; } else { if (twl->linkstat != MUSB_UNKNOWN) { status = MUSB_VBUS_OFF; twl->linkstat = status; - musb_mailbox(status); + ret = musb_mailbox(status); + if (ret) + twl->linkstat = MUSB_UNKNOWN; if (twl->asleep) { regulator_disable(twl->usb3v3); twl->asleep = 0; @@ -264,7 +268,9 @@ static irqreturn_t twl6030_usbotg_irq(int irq, void *_twl) twl6030_writeb(twl, TWL_MODULE_USB, 0x10, USB_ID_INT_EN_HI_SET); status = MUSB_ID_GROUND; twl->linkstat = status; - musb_mailbox(status); + ret = musb_mailbox(status); + if (ret) + twl->linkstat = MUSB_UNKNOWN; } else { twl6030_writeb(twl, TWL_MODULE_USB, 0x10, USB_ID_INT_EN_HI_CLR); twl6030_writeb(twl, TWL_MODULE_USB, 0x1, USB_ID_INT_EN_HI_SET); diff --git a/include/linux/usb/musb.h b/include/linux/usb/musb.h index 0b3da40a525e..d315c8907869 100644 --- a/include/linux/usb/musb.h +++ b/include/linux/usb/musb.h @@ -142,10 +142,11 @@ enum musb_vbus_id_status { }; #if IS_ENABLED(CONFIG_USB_MUSB_HDRC) -void musb_mailbox(enum musb_vbus_id_status status); +int musb_mailbox(enum musb_vbus_id_status status); #else -static inline void musb_mailbox(enum musb_vbus_id_status status) +static inline int musb_mailbox(enum musb_vbus_id_status status) { + return 0; } #endif -- cgit v1.2.3 From dd5f1b049dc139876801db3cdd0f20d21fd428cc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 2 Jun 2016 09:00:28 +0100 Subject: irqchip/gic-v3: Fix ICC_SGI1R_EL1.INTID decoding mask The INTID mask is wrong, and is made a signed value, which has interesting effects in the KVM emulation. Let's sanitize it.
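A short standalone illustration of the failure mode (a sketch, not kernel code): (0xff << 24) is a signed 32-bit value with bit 31 set, so on common ABIs widening it to 64 bits sign-extends into the upper half, and 0xff is too wide for the 4-bit INTID field anyway:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t bad  = (0xff << 24);   /* 0xffffffffff000000 - sign-extended */
            uint64_t good = (0xfULL << 24); /* 0x000000000f000000 - 4-bit field   */

            printf("%016llx %016llx\n",
                   (unsigned long long)bad, (unsigned long long)good);
            return 0;
    }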
Cc: stable@vger.kernel.org Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index bfbd707de390..85a8c2acdef5 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -305,7 +305,7 @@ #define ICC_SGI1R_AFFINITY_1_SHIFT 16 #define ICC_SGI1R_AFFINITY_1_MASK (0xff << ICC_SGI1R_AFFINITY_1_SHIFT) #define ICC_SGI1R_SGI_ID_SHIFT 24 -#define ICC_SGI1R_SGI_ID_MASK (0xff << ICC_SGI1R_SGI_ID_SHIFT) +#define ICC_SGI1R_SGI_ID_MASK (0xfULL << ICC_SGI1R_SGI_ID_SHIFT) #define ICC_SGI1R_AFFINITY_2_SHIFT 32 #define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT) #define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40 -- cgit v1.2.3 From fab0cdc30d81694d2d5524b24e42c43414971719 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 12 May 2016 10:46:34 +0200 Subject: irqchip/gic-v3: Fix copy+paste mistakes in defines ICC_SGI1R_AFFINITY_{2,3}_MASK are unused, which is good because they were defined with the wrong shifts. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 85a8c2acdef5..dc493e0f0ff7 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -307,10 +307,10 @@ #define ICC_SGI1R_SGI_ID_SHIFT 24 #define ICC_SGI1R_SGI_ID_MASK (0xfULL << ICC_SGI1R_SGI_ID_SHIFT) #define ICC_SGI1R_AFFINITY_2_SHIFT 32 -#define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT) +#define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT) #define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40 #define ICC_SGI1R_AFFINITY_3_SHIFT 48 -#define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT) +#define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT) #include -- cgit v1.2.3 From 9bd616e3dbedfc103f158197c8ad93678849b1ed Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 1 Jun 2016 18:52:16 +0100 Subject: cpuidle: Do not access cpuidle_devices when !CONFIG_CPU_IDLE The cpuidle_devices per-CPU variable is only defined when CPU_IDLE is enabled. Commit c8cc7d4de7a4 ("sched/idle: Reorganize the idle loop") removed the #ifdef CONFIG_CPU_IDLE around cpuidle_idle_call() with the compiler optimising away __this_cpu_read(cpuidle_devices). However, with CONFIG_UBSAN && !CONFIG_CPU_IDLE, this optimisation no longer happens and the kernel fails to link since cpuidle_devices is not defined. This patch introduces an accessor function for the current CPU cpuidle device (returning NULL when !CONFIG_CPU_IDLE) and uses it in cpuidle_idle_call(). Signed-off-by: Catalin Marinas Cc: 4.5+ # 4.5+ Signed-off-by: Rafael J. 
Wysocki --- include/linux/cpuidle.h | 3 +++ kernel/sched/idle.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 786ad32631a6..07b83d32f66c 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -152,6 +152,8 @@ extern void cpuidle_disable_device(struct cpuidle_device *dev); extern int cpuidle_play_dead(void); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); +static inline struct cpuidle_device *cpuidle_get_device(void) +{return __this_cpu_read(cpuidle_devices); } #else static inline void disable_cpuidle(void) { } static inline bool cpuidle_not_available(struct cpuidle_driver *drv, @@ -187,6 +189,7 @@ static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } +static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #endif #if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index bd12c6c714ec..c5aeedf4e93a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -127,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ static void cpuidle_idle_call(void) { - struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_device *dev = cpuidle_get_device(); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; -- cgit v1.2.3 From 55eed755c6e30a89be3a791a6b0ad208aadd9bdc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 May 2016 13:11:17 +0200 Subject: locking/seqcount: Re-fix raw_read_seqcount_latch() Commit 50755bc1c305 ("seqlock: fix raw_read_seqcount_latch()") broke raw_read_seqcount_latch(). If you look at the comment that was modified, the thing that changes is the seq count, not the latch pointer. * void latch_modify(struct latch_struct *latch, ...) * { * smp_wmb(); <- Ensure that the last data[1] update is visible * latch->seq++; * smp_wmb(); <- Ensure that the seqcount update is visible * * modify(latch->data[0], ...); * * smp_wmb(); <- Ensure that the data[0] update is visible * latch->seq++; * smp_wmb(); <- Ensure that the seqcount update is visible * * modify(latch->data[1], ...); * } * * The query will have a form like: * * struct entry *latch_query(struct latch_struct *latch, ...) * { * struct entry *entry; * unsigned seq, idx; * * do { * seq = lockless_dereference(latch->seq); So here we have: seq = READ_ONCE(latch->seq); smp_read_barrier_depends(); Which is exactly what we want; the new code: seq = ({ p = READ_ONCE(latch); smp_read_barrier_depends(); p })->seq; is just wrong, because it loses the volatile read on seq, which can now be torn or worse 'optimized'. And the read_depend barrier is also placed wrong; we want it after the load of seq, to match the above data[] up-to-date wmb()s. Such that when we dereference latch->data[] below, we're guaranteed to observe the right data. * * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); * * smp_rmb(); * } while (seq != latch->seq); * * return entry; * } So yes, not passing a pointer is not pretty, but the code was correct, and no longer is. Change to explicit READ_ONCE()+smp_read_barrier_depends() to avoid confusion and allow strict lockless_dereference() checking.
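Putting it together, the corrected read side then looks roughly like this (a sketch reusing the hypothetical latch_struct/data_query names from the comment above):

    struct entry *latch_query(struct latch_struct *latch, ...)
    {
            struct entry *entry;
            unsigned seq, idx;

            do {
                    /* volatile load of seq, then read-depend barrier */
                    seq = raw_read_seqcount_latch(&latch->seq);
                    idx = seq & 0x01;
                    entry = data_query(latch->data[idx], ...);
                    smp_rmb();
            } while (seq != latch->seq);

            return entry;
    }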
Signed-off-by: Peter Zijlstra (Intel) Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 50755bc1c305 ("seqlock: fix raw_read_seqcount_latch()") Link: http://lkml.kernel.org/r/20160527111117.GL3192@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/seqlock.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 7973a821ac58..ead97654c4e9 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -277,7 +277,10 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s) static inline int raw_read_seqcount_latch(seqcount_t *s) { - return lockless_dereference(s)->sequence; + int seq = READ_ONCE(s->sequence); + /* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */ + smp_read_barrier_depends(); + return seq; } /** @@ -331,7 +334,7 @@ static inline int raw_read_seqcount_latch(seqcount_t *s) * unsigned seq, idx; * * do { - * seq = lockless_dereference(latch)->seq; + * seq = raw_read_seqcount_latch(&latch->seq); * * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); -- cgit v1.2.3 From 55f1ea15216a5a14c96738bd5284100a00ffa9dc Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 31 May 2016 11:23:43 +0100 Subject: efi: Fix for_each_efi_memory_desc_in_map() for empty memmaps Commit: 78ce248faa3c ("efi: Iterate over efi.memmap in for_each_efi_memory_desc()") introduced a regression for systems booted with the 'noefi' kernel option. In particular, I observed an early kernel hang in efi_find_mirror()'s for_each_efi_memory_desc() call. As we don't have an EFI memmap on this system we enter this iterator with the following parameters: efi.memmap.map = 0, efi.memmap.map_end = 0, efi.memmap.desc_size = 28 ... then for_each_efi_memory_desc_in_map() does the following comparison: (md) <= (efi_memory_desc_t *)((m)->map_end - (m)->desc_size); ... where md = 0, (m)->map_end = 0 and (m)->desc_size = 28, but when we subtract something from a NULL pointer, a wrap-around happens and we end up returning an invalid pointer and crashing. Fix it by using correct pointer arithmetic. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Matt Fleming Cc: Ard Biesheuvel Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Mark Salter Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Fixes: 78ce248faa3c ("efi: Iterate over efi.memmap in for_each_efi_memory_desc()") Link: http://lkml.kernel.org/r/1464690224-4503-2-git-send-email-matt@codeblueprint.co.uk [ Made the changelog more readable.
] Signed-off-by: Ingo Molnar --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index c2db3ca22217..f196dd0b0f2f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1005,7 +1005,7 @@ extern int efi_memattr_apply_permissions(struct mm_struct *mm, /* Iterate through an efi_memory_map */ #define for_each_efi_memory_desc_in_map(m, md) \ for ((md) = (m)->map; \ - (md) <= (efi_memory_desc_t *)((m)->map_end - (m)->desc_size); \ + ((void *)(md) + (m)->desc_size) <= (m)->map_end; \ (md) = (void *)(md) + (m)->desc_size) /** -- cgit v1.2.3 From e93aeeae0bf8cba43e05484ae4ad17213ac6c3a7 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 25 May 2016 01:15:04 +0000 Subject: of: add missing const for of_parse_phandle_with_args() in !CONFIG_OF commit 93c667ca2598bd84f1bd3f2fa176af69707699fe ("of: *node argument to of_parse_phandle_with_args should be const") changed struct device_node *np to const, but it covers only the CONFIG_OF case; the !CONFIG_OF case needs it too. Signed-off-by: Kuninori Morimoto Signed-off-by: Rob Herring --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index c7292e8ea080..74eb28cadbef 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -614,7 +614,7 @@ static inline struct device_node *of_parse_phandle(const struct device_node *np, return NULL; } -static inline int of_parse_phandle_with_args(struct device_node *np, +static inline int of_parse_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name, int index, -- cgit v1.2.3 From f86e4271978bd93db466d6a95dad4b0fdcdb04f6 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 3 Jun 2016 14:55:38 -0700 Subject: mm: check the return value of lookup_page_ext for all call sites Per the discussion with Joonsoo Kim [1], we need to check the return value of lookup_page_ext() for all call sites since it might return NULL in some cases, although it is unlikely, e.g. memory hotplug. Tested with ltp with "page_owner=0".
[1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE [akpm@linux-foundation.org: fix build-breaking typos] [arnd@arndb.de: fix build problems from lookup_page_ext] Link: http://lkml.kernel.org/r/6285269.2CksypHdYp@wuerfel [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1464023768-31025-1-git-send-email-yang.shi@linaro.org Signed-off-by: Yang Shi Signed-off-by: Arnd Bergmann Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_idle.h | 43 ++++++++++++++++++++++++++++++++++++------- mm/page_alloc.c | 6 ++++++ mm/page_owner.c | 26 ++++++++++++++++++++++++++ mm/page_poison.c | 8 +++++++- mm/vmstat.c | 2 ++ 5 files changed, 77 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index bf268fa92c5b..fec40271339f 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -46,33 +46,62 @@ extern struct page_ext_operations page_idle_ops; static inline bool page_is_young(struct page *page) { - return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return false; + + return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); } static inline void set_page_young(struct page *page) { - set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return; + + set_bit(PAGE_EXT_YOUNG, &page_ext->flags); } static inline bool test_and_clear_page_young(struct page *page) { - return test_and_clear_bit(PAGE_EXT_YOUNG, - &lookup_page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return false; + + return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); } static inline bool page_is_idle(struct page *page) { - return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return false; + + return test_bit(PAGE_EXT_IDLE, &page_ext->flags); } static inline void set_page_idle(struct page *page) { - set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return; + + set_bit(PAGE_EXT_IDLE, &page_ext->flags); } static inline void clear_page_idle(struct page *page) { - clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return; + + clear_bit(PAGE_EXT_IDLE, &page_ext->flags); } #endif /* CONFIG_64BIT */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f8f3bfc435ee..d27e8b968ac3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -656,6 +656,9 @@ static inline void set_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); INIT_LIST_HEAD(&page->lru); @@ -673,6 +676,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); set_page_private(page, 0); diff --git a/mm/page_owner.c b/mm/page_owner.c index 792b56da13d8..c6cda3e36212 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -55,6 +55,8 @@ void __reset_page_owner(struct page *page, unsigned int order) for (i = 0; i < (1 << order); i++) { 
page_ext = lookup_page_ext(page + i); + if (unlikely(!page_ext)) + continue; __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); } } @@ -62,6 +64,7 @@ void __reset_page_owner(struct page *page, unsigned int order) void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { struct page_ext *page_ext = lookup_page_ext(page); + struct stack_trace trace = { .nr_entries = 0, .max_entries = ARRAY_SIZE(page_ext->trace_entries), @@ -69,6 +72,9 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) .skip = 3, }; + if (unlikely(!page_ext)) + return; + save_stack_trace(&trace); page_ext->order = order; @@ -82,6 +88,8 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) void __set_page_owner_migrate_reason(struct page *page, int reason) { struct page_ext *page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; page_ext->last_migrate_reason = reason; } @@ -89,6 +97,12 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) gfp_t __get_page_owner_gfp(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + /* + * The caller just returns 0 if no valid gfp. + * So return 0 here too. + */ + return 0; return page_ext->gfp_mask; } @@ -99,6 +113,9 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) struct page_ext *new_ext = lookup_page_ext(newpage); int i; + if (unlikely(!old_ext || !new_ext)) + return; + new_ext->order = old_ext->order; new_ext->gfp_mask = old_ext->gfp_mask; new_ext->nr_entries = old_ext->nr_entries; @@ -193,6 +210,11 @@ void __dump_page_owner(struct page *page) gfp_t gfp_mask = page_ext->gfp_mask; int mt = gfpflags_to_migratetype(gfp_mask); + if (unlikely(!page_ext)) { + pr_alert("There is no page extension available.\n"); + return; + } + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not active (free page?)\n"); return; @@ -251,6 +273,8 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) } page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + continue; /* * Some pages could be missed by concurrent allocation or free, @@ -317,6 +341,8 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + continue; /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) diff --git a/mm/page_poison.c b/mm/page_poison.c index 1eae5fad2446..2e647c65916b 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -54,6 +54,9 @@ static inline void set_page_poison(struct page *page) struct page_ext *page_ext; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -62,6 +65,9 @@ static inline void clear_page_poison(struct page *page) struct page_ext *page_ext; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -70,7 +76,7 @@ bool page_is_poisoned(struct page *page) struct page_ext *page_ext; page_ext = lookup_page_ext(page); - if (!page_ext) + if (unlikely(!page_ext)) return false; return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); diff --git a/mm/vmstat.c b/mm/vmstat.c index 77e42ef388c2..cb2a67bb4158 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1061,6 +1061,8 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + continue; if
(!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; -- cgit v1.2.3 From eedf265aa003b4781de24cfed40a655a664457e6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 2 Jun 2016 10:29:47 -0500 Subject: devpts: Make each mount of devpts an independent filesystem. The /dev/ptmx device node is changed to look up the directory entry "pts" in the same directory as the /dev/ptmx device node was opened in. If there is a "pts" entry and that entry is a devpts filesystem, /dev/ptmx uses that filesystem. Otherwise the open of /dev/ptmx fails. The DEVPTS_MULTIPLE_INSTANCES configuration option is removed, so that userspace can now safely depend on each mount of devpts creating a new instance of the filesystem. Each mount of devpts is now a separate and equal filesystem. Reserved ttys are now available to all instances of devpts where the mounter is in the initial mount namespace. A new vfs helper path_pts is introduced that finds a directory entry named "pts" in the directory of the passed-in path, and changes the passed-in path to point to it. The helper path_pts uses a function path_parent_directory that was factored out of follow_dotdot. In the implementation of devpts: - devpts_mnt is killed as it is no longer meaningful if all mounts of devpts are equal. - pts_sb_from_inode is replaced by just inode->i_sb as all cached inodes in the tty layer are now from the devpts filesystem. - devpts_add_ref is rolled into the new function devpts_acquire. And the unnecessary inode hold is removed. - devpts_del_ref is renamed devpts_release and reduced to just a deactivate_super. - The newinstance mount option continues to be accepted but is now ignored. In devpts_fs.h definitions for when !CONFIG_UNIX98_PTYS are removed as they are never used. Documentation/filesystems/devpts.txt is updated to describe the current situation. This has been verified to work properly on openwrt-15.05, centos5, centos6, centos7, debian-6.0.2, debian-7.9, debian-8.2, ubuntu-14.04.3, ubuntu-15.10, fedora23, mageia-5, mint-17.3, opensuse-42.1, slackware-14.1, gentoo-20151225 (13.0?), archlinux-2015-12-01. With the caveat that on centos6 and on slackware-14.1 there wind up being two instances of the devpts filesystem mounted on /dev/pts; the lower copy does not end up getting used. Signed-off-by: "Eric W. Biederman" Cc: Greg KH Cc: Peter Hurley Cc: Peter Anvin Cc: Andy Lutomirski Cc: Al Viro Cc: Serge Hallyn Cc: Willy Tarreau Cc: Aurelien Jarno Cc: One Thousand Gnomes Cc: Jann Horn Cc: Jiri Slaby Cc: Florian Weimer Cc: Konstantin Khlebnikov Signed-off-by: Linus Torvalds --- Documentation/filesystems/devpts.txt | 145 +++----------------------- drivers/tty/Kconfig | 11 -- drivers/tty/pty.c | 15 +-- fs/devpts/inode.c | 191 ++++++++++------------------------- fs/namei.c | 49 +++++++-- include/linux/devpts_fs.h | 9 +- include/linux/namei.h | 2 + 7 files changed, 126 insertions(+), 296 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt index 30d2fcb32f72..9f94fe276dea 100644 --- a/Documentation/filesystems/devpts.txt +++ b/Documentation/filesystems/devpts.txt @@ -1,141 +1,26 @@ +Each mount of the devpts filesystem is now distinct such that ptys +and their indices allocated in one mount are independent of ptys +and their indices in all other mounts. -To support containers, we now allow multiple instances of devpts filesystem, -such that indices of ptys allocated in one instance are independent of indices -allocated in other instances of devpts.
+All mounts of the devpts filesystem now create a /dev/pts/ptmx node +with permissions 0000. -To preserve backward compatibility, this support for multiple instances is -enabled only if: +To retain backwards compatibility, a ptmx device node (aka any node +created with "mknod name c 5 2") when opened will look for an instance +of devpts under the name "pts" in the same directory as the ptmx device +node. - - CONFIG_DEVPTS_MULTIPLE_INSTANCES=y, and - - '-o newinstance' mount option is specified while mounting devpts - -IOW, devpts now supports both single-instance and multi-instance semantics. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, there is no change in behavior and -this referred to as the "legacy" mode. In this mode, the new mount options -(-o newinstance and -o ptmxmode) will be ignored with a 'bogus option' message -on console. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and devpts is mounted without the -'newinstance' option (as in current start-up scripts) the new mount binds -to the initial kernel mount of devpts. This mode is referred to as the -'single-instance' mode and the current, single-instance semantics are -preserved, i.e PTYs are common across the system. - -The only difference between this single-instance mode and the legacy mode -is the presence of new, '/dev/pts/ptmx' node with permissions 0000, which -can safely be ignored. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and 'newinstance' option is specified, -the mount is considered to be in the multi-instance mode and a new instance -of the devpts fs is created. Any ptys created in this instance are independent -of ptys in other instances of devpts. Like in the single-instance mode, the -/dev/pts/ptmx node is present. To effectively use the multi-instance mode, -open of /dev/ptmx must be a redirected to '/dev/pts/ptmx' using a symlink or -bind-mount. - -Eg: A container startup script could do the following: - - $ chmod 0666 /dev/pts/ptmx - $ rm /dev/ptmx - $ ln -s pts/ptmx /dev/ptmx - $ ns_exec -cm /bin/bash - - # We are now in new container - - $ umount /dev/pts - $ mount -t devpts -o newinstance lxcpts /dev/pts - $ sshd -p 1234 - -where 'ns_exec -cm /bin/bash' calls clone() with CLONE_NEWNS flag and execs -/bin/bash in the child process. A pty created by the sshd is not visible in -the original mount of /dev/pts. +As an option, instead of placing a /dev/ptmx device node at /dev/ptmx +it is possible to place a symlink to /dev/pts/ptmx at /dev/ptmx or +to bind mount /dev/pts/ptmx to /dev/ptmx. If you opt for using +the devpts filesystem in this manner, devpts should be mounted with +ptmxmode=0666, or chmod 0666 /dev/pts/ptmx should be called. Total count of pty pairs in all instances is limited by sysctls: kernel.pty.max = 4096 - global limit -kernel.pty.reserve = 1024 - reserve for initial instance +kernel.pty.reserve = 1024 - reserved for filesystems mounted from the initial mount namespace kernel.pty.nr - current count of ptys Per-instance limit could be set by adding mount option "max=". This feature was added in kernel 3.4 together with sysctl kernel.pty.reserve. In kernels older than 3.4 sysctl kernel.pty.max works as per-instance limit. - -User-space changes ------------------- - -In multi-instance mode (i.e '-o newinstance' mount option is specified at least -once), following user-space issues should be noted. - -1. If -o newinstance mount option is never used, /dev/pts/ptmx can be ignored - and no change is needed to system-startup scripts. - -2.
To effectively use multi-instance mode (i.e -o newinstance is specified) - administrators or startup scripts should "redirect" open of /dev/ptmx to - /dev/pts/ptmx using either a bind mount or symlink. - - $ mount -t devpts -o newinstance devpts /dev/pts - - followed by either - - $ rm /dev/ptmx - $ ln -s pts/ptmx /dev/ptmx - $ chmod 666 /dev/pts/ptmx - or - $ mount -o bind /dev/pts/ptmx /dev/ptmx - -3. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it - enables better error-reporting and treats both single-instance and - multi-instance mounts similarly. - - But this method requires that system-startup scripts set the mode of - /dev/pts/ptmx correctly (default mode is 0000). The scripts can set the - mode by, either - - - adding ptmxmode mount option to devpts entry in /etc/fstab, or - - using 'chmod 0666 /dev/pts/ptmx' - -4. If multi-instance mode mount is needed for containers, but the system - startup scripts have not yet been updated, container-startup scripts - should bind mount /dev/ptmx to /dev/pts/ptmx to avoid breaking single- - instance mounts. - - Or, in general, container-startup scripts should use: - - mount -t devpts -o newinstance -o ptmxmode=0666 devpts /dev/pts - if [ ! -L /dev/ptmx ]; then - mount -o bind /dev/pts/ptmx /dev/ptmx - fi - - When all devpts mounts are multi-instance, /dev/ptmx can permanently be - a symlink to pts/ptmx and the bind mount can be ignored. - -5. A multi-instance mount that is not accompanied by the /dev/ptmx to - /dev/pts/ptmx redirection would result in an unusable/unreachable pty. - - mount -t devpts -o newinstance lxcpts /dev/pts - - immediately followed by: - - open("/dev/ptmx") - - would create a pty, say /dev/pts/7, in the initial kernel mount. - But /dev/pts/7 would be invisible in the new mount. - -6. The permissions for /dev/pts/ptmx node should be specified when mounting - /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0000). - - mount -t devpts -o newinstance -o ptmxmode=0644 devpts /dev/pts - - The permissions can be later be changed as usual with 'chmod'. - - chmod 666 /dev/pts/ptmx - -7. A mount of devpts without the 'newinstance' option results in binding to - initial kernel mount. This behavior while preserving legacy semantics, - does not provide strict isolation in a container environment. i.e by - mounting devpts without the 'newinstance' option, a container could - get visibility into the 'host' or root container's devpts. - - To workaround this and have strict isolation, all mounts of devpts, - including the mount in the root container, should use the newinstance - option. diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig index 82c4d2e45319..95103054c0e4 100644 --- a/drivers/tty/Kconfig +++ b/drivers/tty/Kconfig @@ -120,17 +120,6 @@ config UNIX98_PTYS All modern Linux systems use the Unix98 ptys. Say Y unless you're on an embedded system and want to conserve memory. -config DEVPTS_MULTIPLE_INSTANCES - bool "Support multiple instances of devpts" - depends on UNIX98_PTYS - default n - ---help--- - Enable support for multiple instances of devpts filesystem. - If you want to have isolated PTY namespaces (eg: in containers), - say Y here. Otherwise, say N. If enabled, each mount of devpts - filesystem with the '-o newinstance' option will create an - independent PTY namespace. 
- config LEGACY_PTYS bool "Legacy (BSD) PTY support" default y diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index dd4b8417e7f4..f856c4544eea 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -668,7 +668,7 @@ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) else fsi = tty->link->driver_data; devpts_kill_index(fsi, tty->index); - devpts_put_ref(fsi); + devpts_release(fsi); } static const struct tty_operations ptm_unix98_ops = { @@ -733,10 +733,11 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; - fsi = devpts_get_ref(inode, filp); - retval = -ENODEV; - if (!fsi) + fsi = devpts_acquire(filp); + if (IS_ERR(fsi)) { + retval = PTR_ERR(fsi); goto out_free_file; + } /* find a device that is not in use. */ mutex_lock(&devpts_mutex); @@ -745,7 +746,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) retval = index; if (index < 0) - goto out_put_ref; + goto out_put_fsi; mutex_lock(&tty_mutex); @@ -789,8 +790,8 @@ err_release: return retval; out: devpts_kill_index(fsi, index); -out_put_ref: - devpts_put_ref(fsi); +out_put_fsi: + devpts_release(fsi); out_free_file: tty_free_file(filp); return retval; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 0b2954d7172d..37c134a132c7 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -95,8 +95,6 @@ static struct ctl_table pty_root_table[] = { static DEFINE_MUTEX(allocated_ptys_lock); -static struct vfsmount *devpts_mnt; - struct pts_mount_opts { int setuid; int setgid; @@ -104,7 +102,7 @@ struct pts_mount_opts { kgid_t gid; umode_t mode; umode_t ptmxmode; - int newinstance; + int reserve; int max; }; @@ -117,11 +115,9 @@ static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES {Opt_ptmxmode, "ptmxmode=%o"}, {Opt_newinstance, "newinstance"}, {Opt_max, "max=%d"}, -#endif {Opt_err, NULL} }; @@ -137,15 +133,48 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -static inline struct super_block *pts_sb_from_inode(struct inode *inode) +struct pts_fs_info *devpts_acquire(struct file *filp) { -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES - if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) - return inode->i_sb; -#endif - if (!devpts_mnt) - return NULL; - return devpts_mnt->mnt_sb; + struct pts_fs_info *result; + struct path path; + struct super_block *sb; + int err; + + path = filp->f_path; + path_get(&path); + + /* Has the devpts filesystem already been found? */ + sb = path.mnt->mnt_sb; + if (sb->s_magic != DEVPTS_SUPER_MAGIC) { + /* Is a devpts filesystem at "pts" in the same directory? */ + err = path_pts(&path); + if (err) { + result = ERR_PTR(err); + goto out; + } + + /* Is the path the root of a devpts filesystem? */ + result = ERR_PTR(-ENODEV); + sb = path.mnt->mnt_sb; + if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || + (path.mnt->mnt_root != sb->s_root)) + goto out; + } + + /* + * pty code needs to hold extra references in case of last /dev/tty close + */ + atomic_inc(&sb->s_active); + result = DEVPTS_SB(sb); + +out: + path_put(&path); + return result; +} + +void devpts_release(struct pts_fs_info *fsi) +{ + deactivate_super(fsi->sb); } #define PARSE_MOUNT 0 @@ -154,9 +183,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) /* * parse_mount_options(): * Set @opts to mount options specified in @data. If an option is not - * specified in @data, set it to its default value. 
The exception is - * 'newinstance' option which can only be set/cleared on a mount (i.e. - * cannot be changed during remount). + * specified in @data, set it to its default value. * * Note: @data may be NULL (in which case all options are set to default). */ @@ -174,9 +201,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; opts->max = NR_UNIX98_PTY_MAX; - /* newinstance makes sense only on initial mount */ + /* Only allow instances mounted from the initial mount + * namespace to tap the reserve pool of ptys. + */ if (op == PARSE_MOUNT) - opts->newinstance = 0; + opts->reserve = + (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns); while ((p = strsep(&data, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; @@ -211,16 +241,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return -EINVAL; opts->mode = option & S_IALLUGO; break; -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES case Opt_ptmxmode: if (match_octal(&args[0], &option)) return -EINVAL; opts->ptmxmode = option & S_IALLUGO; break; case Opt_newinstance: - /* newinstance makes sense only on initial mount */ - if (op == PARSE_MOUNT) - opts->newinstance = 1; break; case Opt_max: if (match_int(&args[0], &option) || @@ -228,7 +254,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return -EINVAL; opts->max = option; break; -#endif default: pr_err("called with bogus options\n"); return -EINVAL; @@ -238,7 +263,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return 0; } -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES static int mknod_ptmx(struct super_block *sb) { int mode; @@ -305,12 +329,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; } } -#else -static inline void update_ptmx_mode(struct pts_fs_info *fsi) -{ - return; -} -#endif static int devpts_remount(struct super_block *sb, int *flags, char *data) { @@ -344,11 +362,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); seq_printf(seq, ",mode=%03o", opts->mode); -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); if (opts->max < NR_UNIX98_PTY_MAX) seq_printf(seq, ",max=%d", opts->max); -#endif return 0; } @@ -410,40 +426,11 @@ fail: return -ENOMEM; } -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES -static int compare_init_pts_sb(struct super_block *s, void *p) -{ - if (devpts_mnt) - return devpts_mnt->mnt_sb == s; - return 0; -} - /* * devpts_mount() * - * If the '-o newinstance' mount option was specified, mount a new - * (private) instance of devpts. PTYs created in this instance are - * independent of the PTYs in other devpts instances. - * - * If the '-o newinstance' option was not specified, mount/remount the - * initial kernel mount of devpts. This type of mount gives the - * legacy, single-instance semantics. - * - * The 'newinstance' option is needed to support multiple namespace - * semantics in devpts while preserving backward compatibility of the - * current 'single-namespace' semantics. i.e all mounts of devpts - * without the 'newinstance' mount option should bind to the initial - * kernel mount, like mount_single(). - * - * Mounts with 'newinstance' option create a new, private namespace. 
- * - * NOTE: - * - * For single-mount semantics, devpts cannot use mount_single(), - * because mount_single()/sget() find and use the super-block from - * the most recent mount of devpts. But that recent mount may be a - * 'newinstance' mount and mount_single() would pick the newinstance - * super-block instead of the initial super-block. + * Mount a new (private) instance of devpts. PTYs created in this + * instance are independent of the PTYs in other devpts instances. */ static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) @@ -456,18 +443,7 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type, if (error) return ERR_PTR(error); - /* Require newinstance for all user namespace mounts to ensure - * the mount options are not changed. - */ - if ((current_user_ns() != &init_user_ns) && !opts.newinstance) - return ERR_PTR(-EINVAL); - - if (opts.newinstance) - s = sget(fs_type, NULL, set_anon_super, flags, NULL); - else - s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, - NULL); - + s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -491,18 +467,6 @@ out_undo_sget: return ERR_PTR(error); } -#else -/* - * This supports only the legacy single-instance semantics (no - * multiple-instance semantics) - */ -static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - return mount_single(fs_type, flags, data, devpts_fill_super); -} -#endif - static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); @@ -516,9 +480,7 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, -#endif }; /* @@ -531,16 +493,13 @@ int devpts_new_index(struct pts_fs_info *fsi) int index; int ida_ret; - if (!fsi) - return -ENODEV; - retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; mutex_lock(&allocated_ptys_lock); - if (pty_count >= pty_limit - - (fsi->mount_opts.newinstance ? pty_reserve : 0)) { + if (pty_count >= (pty_limit - + (fsi->mount_opts.reserve ? 
0 : pty_reserve))) { mutex_unlock(&allocated_ptys_lock); return -ENOSPC; } @@ -571,30 +530,6 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx) mutex_unlock(&allocated_ptys_lock); } -/* - * pty code needs to hold extra references in case of last /dev/tty close - */ -struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file) -{ - struct super_block *sb; - struct pts_fs_info *fsi; - - sb = pts_sb_from_inode(ptmx_inode); - if (!sb) - return NULL; - fsi = DEVPTS_SB(sb); - if (!fsi) - return NULL; - - atomic_inc(&sb->s_active); - return fsi; -} - -void devpts_put_ref(struct pts_fs_info *fsi) -{ - deactivate_super(fsi->sb); -} - /** * devpts_pty_new -- create a new inode in /dev/pts/ * @ptmx_inode: inode of the master @@ -607,16 +542,12 @@ void devpts_put_ref(struct pts_fs_info *fsi) struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { struct dentry *dentry; - struct super_block *sb; + struct super_block *sb = fsi->sb; struct inode *inode; struct dentry *root; struct pts_mount_opts *opts; char s[12]; - if (!fsi) - return ERR_PTR(-ENODEV); - - sb = fsi->sb; root = sb->s_root; opts = &fsi->mount_opts; @@ -676,20 +607,8 @@ void devpts_pty_kill(struct dentry *dentry) static int __init init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); - struct ctl_table_header *table; - if (!err) { - struct vfsmount *mnt; - - table = register_sysctl_table(pty_root_table); - mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - unregister_filesystem(&devpts_fs_type); - unregister_sysctl_table(table); - } else { - devpts_mnt = mnt; - } + register_sysctl_table(pty_root_table); } return err; } diff --git a/fs/namei.c b/fs/namei.c index 4c4f95ac8aa5..6a82fb7e2127 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1416,21 +1416,28 @@ static void follow_mount(struct path *path) } } +static int path_parent_directory(struct path *path) +{ + struct dentry *old = path->dentry; + /* rare case of legitimate dget_parent()... */ + path->dentry = dget_parent(path->dentry); + dput(old); + if (unlikely(!path_connected(path))) + return -ENOENT; + return 0; +} + static int follow_dotdot(struct nameidata *nd) { while(1) { - struct dentry *old = nd->path.dentry; - if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { break; } if (nd->path.dentry != nd->path.mnt->mnt_root) { - /* rare case of legitimate dget_parent()... */ - nd->path.dentry = dget_parent(nd->path.dentry); - dput(old); - if (unlikely(!path_connected(&nd->path))) - return -ENOENT; + int ret = path_parent_directory(&nd->path); + if (ret) + return ret; break; } if (!follow_up(&nd->path)) @@ -2514,6 +2521,34 @@ struct dentry *lookup_one_len_unlocked(const char *name, } EXPORT_SYMBOL(lookup_one_len_unlocked); +#ifdef CONFIG_UNIX98_PTYS +int path_pts(struct path *path) +{ + /* Find something mounted on "pts" in the same directory as + * the input path. 
+ */ + struct dentry *child, *parent; + struct qstr this; + int ret; + + ret = path_parent_directory(path); + if (ret) + return ret; + + parent = path->dentry; + this.name = "pts"; + this.len = 3; + child = d_hash_and_lookup(parent, &this); + if (!child) + return -ENOENT; + + path->dentry = child; + dput(parent); + follow_mount(path); + return 0; +} +#endif + int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 5871f292b596..277ab9af9ac2 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -15,13 +15,12 @@ #include -struct pts_fs_info; - #ifdef CONFIG_UNIX98_PTYS -/* Look up a pts fs info and get a ref to it */ -struct pts_fs_info *devpts_get_ref(struct inode *, struct file *); -void devpts_put_ref(struct pts_fs_info *); +struct pts_fs_info; + +struct pts_fs_info *devpts_acquire(struct file *); +void devpts_release(struct pts_fs_info *); int devpts_new_index(struct pts_fs_info *); void devpts_kill_index(struct pts_fs_info *, int); diff --git a/include/linux/namei.h b/include/linux/namei.h index ec5ec2818a28..d3d0398f2a1b 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -45,6 +45,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_ROOT 0x2000 #define LOOKUP_EMPTY 0x4000 +extern int path_pts(struct path *path); + extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); static inline int user_path_at(int dfd, const char __user *name, unsigned flags, -- cgit v1.2.3 From d3ae2bdeba9bad8cb95301451aeaf03ce31e82f0 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Sat, 4 Jun 2016 15:15:36 +0300 Subject: IB/mlx5: Fix pkey_index length in the QP path record Pkey index fields in the QP context path record are extended to 16 bits, as required by IB spec (version 1.3). This change affects all QP commands which include path records. To enable this change, moved the free adaptive routing flag bit (free_ar) to the most significant byte of the QP path record. Fixes: e126ba97dba9e ('mlx5: Add driver for Mellanox Connect-IB ...') Signed-off-by: Noa Osherovich Reviewed-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/qp.c | 15 ++++++++------- include/linux/mlx5/qp.h | 5 ++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 43c1441b6fb8..6b90bfdea830 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1859,7 +1859,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, int err; if (attr_mask & IB_QP_PKEY_INDEX) - path->pkey_index = attr->pkey_index; + path->pkey_index = cpu_to_be16(attr->pkey_index); if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= @@ -1879,9 +1879,9 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, ah->grh.sgid_index); path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4; } else { - path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; - path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : - 0; + path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->fl_free_ar |= + (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 
0x40 : 0; path->rlid = cpu_to_be16(ah->dlid); path->grh_mlid = ah->src_path_bits & 0x7f; if (ah->ah_flags & IB_AH_GRH) @@ -2266,7 +2266,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); if (attr_mask & IB_QP_PKEY_INDEX) - context->pri_path.pkey_index = attr->pkey_index; + context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index); /* todo implement counter_index functionality */ @@ -4015,11 +4015,12 @@ static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); - qp_attr->alt_pkey_index = context->alt_path.pkey_index & 0x7f; + qp_attr->alt_pkey_index = + be16_to_cpu(context->alt_path.pkey_index); qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; } - qp_attr->pkey_index = context->pri_path.pkey_index & 0x7f; + qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index); qp_attr->port_num = context->pri_path.port; /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 64221027bf1f..e4e29882fdfd 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -460,10 +460,9 @@ struct mlx5_core_qp { }; struct mlx5_qp_path { - u8 fl; + u8 fl_free_ar; u8 rsvd3; - u8 free_ar; - u8 pkey_index; + __be16 pkey_index; u8 rsvd0; u8 grh_mlid; __be16 rlid; -- cgit v1.2.3 From 1607f09c226d1378439c411baaaa020042750338 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 5 Jun 2016 23:14:14 +0200 Subject: coredump: fix dumping through pipes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The offset in the core file used to be tracked with ->written field of the coredump_params structure. The field was retired in favour of file->f_pos. However, ->f_pos is not maintained for pipes which leads to breakage. Restore explicit tracking of the offset in coredump_params. Introduce ->pos field for this purpose since ->written was already reused. Fixes: a00839395103 ("get rid of coredump_params->written"). 
Reported-by: Zbigniew Jędrzejewski-Szmek Signed-off-by: Mateusz Guzik Reviewed-by: Omar Sandoval Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/coredump.c | 2 +- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/coredump.c | 4 +++- include/linux/binfmts.h | 1 + 5 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index 84fb984f29c1..85c85eb3e245 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -172,7 +172,7 @@ static int spufs_arch_write_note(struct spu_context *ctx, int i, if (rc < 0) goto out; - skip = roundup(cprm->file->f_pos - total + sz, 4) - cprm->file->f_pos; + skip = roundup(cprm->pos - total + sz, 4) - cprm->pos; if (!dump_skip(cprm, skip)) goto Eio; out: diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e158b22ef32f..a7a28110dc80 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2275,7 +2275,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Align to page */ - if (!dump_skip(cprm, dataoff - cprm->file->f_pos)) + if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 71ade0e556b7..203589311bf8 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1787,7 +1787,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; } - if (!dump_skip(cprm, dataoff - cprm->file->f_pos)) + if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; if (!elf_fdpic_dump_segments(cprm)) diff --git a/fs/coredump.c b/fs/coredump.c index 38a7ab87e10a..281b768000e6 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -794,6 +794,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) return 0; file->f_pos = pos; cprm->written += n; + cprm->pos += n; nr -= n; } return 1; @@ -808,6 +809,7 @@ int dump_skip(struct coredump_params *cprm, size_t nr) if (dump_interrupted() || file->f_op->llseek(file, nr, SEEK_CUR) < 0) return 0; + cprm->pos += nr; return 1; } else { while (nr > PAGE_SIZE) { @@ -822,7 +824,7 @@ EXPORT_SYMBOL(dump_skip); int dump_align(struct coredump_params *cprm, int align) { - unsigned mod = cprm->file->f_pos & (align - 1); + unsigned mod = cprm->pos & (align - 1); if (align & (align - 1)) return 0; return mod ? dump_skip(cprm, align - mod) : 1; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 576e4639ca60..314b3caa701c 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -65,6 +65,7 @@ struct coredump_params { unsigned long limit; unsigned long mm_flags; loff_t written; + loff_t pos; }; /* -- cgit v1.2.3 From 7cfe749fad5158247282f2fee30773fd454029ab Mon Sep 17 00:00:00 2001 From: Tony Makkiel Date: Wed, 18 May 2016 17:22:45 +0100 Subject: leds: core: Fix brightness setting upon hardware blinking enabled Commit 76931edd54f8 ("leds: fix brightness changing when software blinking is active") changed the semantics of led_set_brightness(), which, according to the documentation, should disable blinking upon any brightness setting. Moreover, it made the behaviour different for the soft blink case, where it was possible to change the blink brightness, and for the hardware blink case, where setting any brightness greater than 0 was ignored.
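To make the two modes concrete, here is a stand-alone sketch of the semantics this patch establishes (illustrative only: the flag name mirrors the LED_BLINK_SW flag introduced below, everything else is invented for the example):

    #include <stdio.h>

    #define LED_BLINK_SW (1 << 17)  /* mirrors the flag introduced below */

    struct led_model { unsigned long flags; };

    /* Only software blinking defers the write; hardware blinking hands
     * the value straight to the driver, which applies its own semantics. */
    static void set_brightness_model(struct led_model *led, int value)
    {
            if (led->flags & LED_BLINK_SW)
                    printf("defer brightness %d to the blink timer\n", value);
            else
                    printf("pass brightness %d directly to the driver\n", value);
    }

    int main(void)
    {
            struct led_model led = { .flags = LED_BLINK_SW };

            set_brightness_model(&led, 255); /* soft blink: brightness changes, blink goes on */
            led.flags = 0;
            set_brightness_model(&led, 0);   /* hw blink: driver decides, usually stops blinking */
            return 0;
    }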
While the change itself contradicts the documentation, it was also driven by the fact that the timer trigger remained active after turning blinking off. Fixing that would have required major refactoring in the led-core, led-class, and led-triggers because of cyclic dependencies. Finally, it has been decided that allowing brightness changes during blinking is beneficial, as they can be accomplished without disturbing the blink rhythm. The change in brightness setting semantics will not affect existing LED class drivers that implement the blink_set op, thanks to the LED_BLINK_SW flag introduced by this patch. The flag state will from now on be checked in led_set_brightness(), which makes it possible to distinguish between software and hardware blink mode. In the latter case control is passed directly to the drivers, which apply their own semantics on brightness set; for most such drivers that means disabling the blinking. New drivers will apply the new semantics and just change the brightness while hardware blinking is on, if possible. The issue was masked by subsequent LED core improvements, which modified the code that originally introduced the problem. Fixes: f1e80c07416a ("leds: core: Add two new LED_BLINK_ flags") Signed-off-by: Tony Makkiel Signed-off-by: Jacek Anaszewski --- Documentation/leds/leds-class.txt | 4 ++-- drivers/leds/led-core.c | 9 ++++++--- include/linux/leds.h | 23 ++++++++++++----------- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/Documentation/leds/leds-class.txt b/Documentation/leds/leds-class.txt index d406d98339b2..44f5e6bccd97 100644 --- a/Documentation/leds/leds-class.txt +++ b/Documentation/leds/leds-class.txt @@ -74,8 +74,8 @@ blink_set() function (see ). To set an LED to blinking, however, it is better to use the API function led_blink_set(), as it will check and implement software fallback if necessary. -To turn off blinking again, use the API function led_brightness_set() -as that will not just set the LED brightness but also stop any software +To turn off blinking, use the API function led_brightness_set() +with brightness value LED_OFF, which should stop any software timers that may have been required for blinking. The blink_set() function should choose a user friendly blinking value diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c index 3495d5d6547f..3bce44893021 100644 --- a/drivers/leds/led-core.c +++ b/drivers/leds/led-core.c @@ -53,11 +53,12 @@ static void led_timer_function(unsigned long data) if (!led_cdev->blink_delay_on || !led_cdev->blink_delay_off) { led_set_brightness_nosleep(led_cdev, LED_OFF); + led_cdev->flags &= ~LED_BLINK_SW; return; } if (led_cdev->flags & LED_BLINK_ONESHOT_STOP) { - led_cdev->flags &= ~LED_BLINK_ONESHOT_STOP; + led_cdev->flags &= ~(LED_BLINK_ONESHOT_STOP | LED_BLINK_SW); return; } @@ -151,6 +152,7 @@ static void led_set_software_blink(struct led_classdev *led_cdev, return; } + led_cdev->flags |= LED_BLINK_SW; mod_timer(&led_cdev->blink_timer, jiffies + 1); } @@ -219,6 +221,7 @@ void led_stop_software_blink(struct led_classdev *led_cdev) del_timer_sync(&led_cdev->blink_timer); led_cdev->blink_delay_on = 0; led_cdev->blink_delay_off = 0; + led_cdev->flags &= ~LED_BLINK_SW; } EXPORT_SYMBOL_GPL(led_stop_software_blink); @@ -226,10 +229,10 @@ void led_set_brightness(struct led_classdev *led_cdev, enum led_brightness brightness) { /* - * In case blinking is on delay brightness setting + * If software blink is active, delay brightness setting * until the next timer tick.
*/ - if (led_cdev->blink_delay_on || led_cdev->blink_delay_off) { + if (led_cdev->flags & LED_BLINK_SW) { /* * If we need to disable soft blinking delegate this to the * work queue task to avoid problems in case we are called diff --git a/include/linux/leds.h b/include/linux/leds.h index d2b13066e781..e5e7f2e80a54 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -42,15 +42,16 @@ struct led_classdev { #define LED_UNREGISTERING (1 << 1) /* Upper 16 bits reflect control information */ #define LED_CORE_SUSPENDRESUME (1 << 16) -#define LED_BLINK_ONESHOT (1 << 17) -#define LED_BLINK_ONESHOT_STOP (1 << 18) -#define LED_BLINK_INVERT (1 << 19) -#define LED_BLINK_BRIGHTNESS_CHANGE (1 << 20) -#define LED_BLINK_DISABLE (1 << 21) -#define LED_SYSFS_DISABLE (1 << 22) -#define LED_DEV_CAP_FLASH (1 << 23) -#define LED_HW_PLUGGABLE (1 << 24) -#define LED_PANIC_INDICATOR (1 << 25) +#define LED_BLINK_SW (1 << 17) +#define LED_BLINK_ONESHOT (1 << 18) +#define LED_BLINK_ONESHOT_STOP (1 << 19) +#define LED_BLINK_INVERT (1 << 20) +#define LED_BLINK_BRIGHTNESS_CHANGE (1 << 21) +#define LED_BLINK_DISABLE (1 << 22) +#define LED_SYSFS_DISABLE (1 << 23) +#define LED_DEV_CAP_FLASH (1 << 24) +#define LED_HW_PLUGGABLE (1 << 25) +#define LED_PANIC_INDICATOR (1 << 26) /* Set LED brightness level * Must not sleep. Use brightness_set_blocking for drivers @@ -72,8 +73,8 @@ struct led_classdev { * and if both are zero then a sensible default should be chosen. * The call should adjust the timings in that case and if it can't * match the values specified exactly. - * Deactivate blinking again when the brightness is set to a fixed - * value via the brightness_set() callback. + * Deactivate blinking again when the brightness is set to LED_OFF + * via the brightness_set() callback. */ int (*blink_set)(struct led_classdev *led_cdev, unsigned long *delay_on, -- cgit v1.2.3 From 5c1d3310d84309330264e2a06e5000eb289a44ad Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 7 Jun 2016 18:44:48 +0100 Subject: drivers: of: Fix of_pci.h header guard The compilation of of_pci.c is governed by CONFIG_OF_PCI, but the corresponding declarations in of_pci.h are inconsistently guarded by CONFIG_OF, with the result that if CONFIG_PCI is disabled for an OF platform, the dangling external declarations are still active and the inline stub definitions not. So far this has managed to go unnoticed since it happens that the only references to these functions are from code which itself depends on CONFIG_PCI or CONFIG_OF_PCI. Fix this with the appropriate config guard so that any new callers outside PCI-specific code don't start unexpectedly breaking under certain configs. 
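The underlying rule generalizes beyond of_pci.h; a hypothetical miniature (CONFIG_FOO and foo_query() are invented for illustration) shows it:

    /* foo.c is compiled only when CONFIG_FOO=y, so the header must use
     * the same symbol; guarding on a broader symbol leaves dangling
     * externs whenever the two differ. */
    #ifdef CONFIG_FOO                       /* same symbol that builds foo.o */
    int foo_query(int id);
    #else
    static inline int foo_query(int id)
    {
            return -1;                      /* stub for CONFIG_FOO=n */
    }
    #endif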
Signed-off-by: Robin Murphy Signed-off-by: Rob Herring --- include/linux/of_pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h index f6e9e85164e8..b969e9443962 100644 --- a/include/linux/of_pci.h +++ b/include/linux/of_pci.h @@ -8,7 +8,7 @@ struct pci_dev; struct of_phandle_args; struct device_node; -#ifdef CONFIG_OF +#ifdef CONFIG_OF_PCI int of_irq_parse_pci(const struct pci_dev *pdev, struct of_phandle_args *out_irq); struct device_node *of_pci_find_child_device(struct device_node *parent, unsigned int devfn); -- cgit v1.2.3 From 06dfeef88573cf032e5c27e37f80ff5237b3318a Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 9 Jun 2016 11:38:34 +0100 Subject: drivers: of: add definition of early_init_dt_alloc_reserved_memory_arch The function early_init_dt_alloc_reserved_memory_arch is defined in drivers/of/of_reserved_mem.c but is not declared in any of the header files. Add a declaration for it to avoid the warning: drivers/of/of_reserved_mem.c:31:19: warning: symbol 'early_init_dt_alloc_reserved_memory_arch' was not declared. Should it be static? Signed-off-by: Ben Dooks [robh: drop extern from declaration] Signed-off-by: Rob Herring --- include/linux/of_reserved_mem.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index ad2f67054372..c201060e0c6d 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -31,6 +31,13 @@ typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem); int of_reserved_mem_device_init(struct device *dev); void of_reserved_mem_device_release(struct device *dev); +int early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, + phys_addr_t align, + phys_addr_t start, + phys_addr_t end, + bool nomap, + phys_addr_t *res_base); + void fdt_init_reserved_mem(void); void fdt_reserved_mem_save_node(unsigned long node, const char *uname, phys_addr_t base, phys_addr_t size); -- cgit v1.2.3 From 418f8399a8bedf376ec13eb01088f04a76ebdd6f Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Fri, 10 Jun 2016 00:07:28 +0300 Subject: net/mlx5: Fix the size of modify QP mailbox Add 16 reserved bytes at the end of mlx5_modify_qp_mbox_in to match the hardware spec definition. Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters') Signed-off-by: Majd Dibbiny Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/qp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 64221027bf1f..1532dcf6fc5e 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -560,6 +560,7 @@ struct mlx5_modify_qp_mbox_in { __be32 optparam; u8 rsvd0[4]; struct mlx5_qp_context ctx; + u8 rsvd2[16]; }; struct mlx5_modify_qp_mbox_out { -- cgit v1.2.3 From 86d56a1a6b7352542661d8a9463758c7f285fce3 Mon Sep 17 00:00:00 2001 From: Shahar Klein Date: Fri, 10 Jun 2016 00:07:30 +0300 Subject: net/mlx5: Fix MLX5_CMD_OP_MAX to be defined correctly Having MLX5_CMD_OP_MAX in another file causes us to repeatedly miss accounting for new commands added to the driver, and hence there are no entries for them in debugfs. To solve that, we integrate it into the commands enum as the last entry.
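A stand-alone miniature of the pattern (opcode names and values invented; the real list lives in mlx5_ifc.h):

    #include <stdio.h>

    enum cmd_op {
            CMD_OP_QUERY  = 0x100,
            CMD_OP_CREATE = 0x101,
            CMD_OP_MODIFY = 0x102,
            CMD_OP_MAX                      /* always last: follows the highest opcode */
    };

    int main(void)
    {
            /* debugfs-style accounting can size its table from the enum
             * itself, so a newly added opcode can no longer be missed */
            printf("need %d slots\n", CMD_OP_MAX);
            return 0;
    }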
Fixes: 34a40e689393 ('net/mlx5_core: Introduce modify flow table command') Signed-off-by: Shahar Klein Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 2 -- include/linux/mlx5/mlx5_ifc.h | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 035abdf62cfe..51f0caf299d8 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1240,8 +1240,6 @@ struct mlx5_destroy_psv_out { u8 rsvd[8]; }; -#define MLX5_CMD_OP_MAX 0x920 - enum { VPORT_STATE_DOWN = 0x0, VPORT_STATE_UP = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 9a05cd7e5890..986a615f623c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -205,7 +205,8 @@ enum { MLX5_CMD_OP_ALLOC_FLOW_COUNTER = 0x939, MLX5_CMD_OP_DEALLOC_FLOW_COUNTER = 0x93a, MLX5_CMD_OP_QUERY_FLOW_COUNTER = 0x93b, - MLX5_CMD_OP_MODIFY_FLOW_TABLE = 0x93c + MLX5_CMD_OP_MODIFY_FLOW_TABLE = 0x93c, + MLX5_CMD_OP_MAX }; struct mlx5_ifc_flow_table_fields_supported_bits { -- cgit v1.2.3 From 876d634d19e41603aab91455f2c52a78a28372d5 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Fri, 10 Jun 2016 00:07:32 +0300 Subject: net/mlx5: Fix flow steering NIC capabilities check The flow steering infrastructure is currently used only when the link layer is Ethernet; therefore, the driver should initialize flow steering only when the device link layer is Ethernet. In addition, add a missing capability check before initializing the namespace of NIC RX flow tables. Fixes: 2530236303d9 ('net/mlx5_core: Flow steering tree initialization') Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 10 +++++++++- include/linux/mlx5/device.h | 6 ++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index fa6fec1930f5..c1efa5517d17 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1767,6 +1767,9 @@ static void cleanup_root_ns(struct mlx5_core_dev *dev) void mlx5_cleanup_fs(struct mlx5_core_dev *dev) { + if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return; + cleanup_root_ns(dev); cleanup_single_prio_root_ns(dev, dev->priv.fdb_root_ns); cleanup_single_prio_root_ns(dev, dev->priv.esw_egress_root_ns); @@ -1828,15 +1831,20 @@ int mlx5_init_fs(struct mlx5_core_dev *dev) { int err = 0; + if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return 0; + err = mlx5_init_fc_stats(dev); if (err) return err; - if (MLX5_CAP_GEN(dev, nic_flow_table)) { + if (MLX5_CAP_GEN(dev, nic_flow_table) && + MLX5_CAP_FLOWTABLE_NIC_RX(dev, ft_support)) { err = init_root_ns(dev); if (err) goto err; } + if (MLX5_CAP_GEN(dev, eswitch_flow_table)) { err = init_fdb_root_ns(dev); if (err) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 51f0caf299d8..73a48479892d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1367,6 +1367,12 @@ enum mlx5_cap_type { #define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \ MLX5_GET(flow_table_nic_cap, mdev->hca_caps_max[MLX5_CAP_FLOW_TABLE], cap) +#define MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) \ + MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.cap) + +#define MLX5_CAP_FLOWTABLE_NIC_RX_MAX(mdev, cap) \ + MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_receive.cap) + #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ MLX5_GET(flow_table_eswitch_cap, \ mdev->hca_caps_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) -- cgit v1.2.3 From 23898c763f4af6f5c80b0230b1ea788a0ce3cf73 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Fri, 10 Jun 2016 00:07:37 +0300 Subject: net/mlx5: E-Switch, Modify node guid on vf set MAC In RoCE, the RDMA-CM needs the node guid to establish connection between nodes. Today, the node guid exposed to mlx5 Ethernet VFs is zero, therefore RDMA-CM on the VF is broken. Whenever the administrator sets a MAC for a VF, derive the node guid from it and set it as well in the following way: MAC: e4:1d:2d:b3:f4:01 -> node_guid: e4:1d:2d:ff:fe:b3:f4:01 Fixes: 77256579c6b43 ('net/mlx5: E-Switch, Introduce Vport...') Signed-off-by: Noa Osherovich Signed-off-by: Majd Dibbiny Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 23 ++++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 38 +++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 9 ++++-- include/linux/mlx5/vport.h | 2 ++ 4 files changed, 68 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index cfec20cffd26..9b1855b199a1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1725,11 +1725,24 @@ void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe) (esw && MLX5_CAP_GEN(esw->dev, vport_group_manager) && mlx5_core_is_pf(esw->dev)) #define LEGAL_VPORT(esw, vport) (vport >= 0 && vport < esw->total_vports) +static void node_guid_gen_from_mac(u64 *node_guid, u8 mac[ETH_ALEN]) +{ + ((u8 *)node_guid)[7] = mac[0]; + ((u8 *)node_guid)[6] = mac[1]; + ((u8 *)node_guid)[5] = mac[2]; + ((u8 *)node_guid)[4] = 0xff; + ((u8 *)node_guid)[3] = 0xfe; + ((u8 *)node_guid)[2] = mac[3]; + ((u8 *)node_guid)[1] = mac[4]; + ((u8 *)node_guid)[0] = mac[5]; +} + int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, int vport, u8 mac[ETH_ALEN]) { - int err = 0; struct mlx5_vport *evport; + u64 node_guid; + int err = 0; if (!ESW_ALLOWED(esw)) return -EPERM; @@ -1753,11 +1766,17 @@ int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, return err; } + node_guid_gen_from_mac(&node_guid, mac); + err = mlx5_modify_nic_vport_node_guid(esw->dev, vport, node_guid); + if (err) + mlx5_core_warn(esw->dev, + "Failed to set vport %d node guid, err = %d. RDMA_CM will not function properly for this VF.\n", + vport, err); + mutex_lock(&esw->state_lock); if (evport->enabled) err = esw_vport_ingress_config(esw, evport); mutex_unlock(&esw->state_lock); - return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index b69dadcfb897..daf44cd4c566 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -508,6 +508,44 @@ int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid); +int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, + u32 vport, u64 node_guid) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *nic_vport_context; + u8 *guid; + void *in; + int err; + + if (!vport) + return -EINVAL; + if (!MLX5_CAP_GEN(mdev, vport_group_manager)) + return -EACCES; + if (!MLX5_CAP_ESW(mdev, nic_vport_node_guid_modify)) + return -ENOTSUPP; + + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.node_guid, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(modify_nic_vport_context_in, in, other_vport, !!vport); + + nic_vport_context = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + guid = MLX5_ADDR_OF(nic_vport_context, nic_vport_context, + node_guid); + MLX5_SET64(nic_vport_context, nic_vport_context, node_guid, node_guid); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + + return err; +} + int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, u16 *qkey_viol_cntr) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 986a615f623c..e955a2859009 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ 
b/include/linux/mlx5/mlx5_ifc.h @@ -501,7 +501,9 @@ struct mlx5_ifc_e_switch_cap_bits { u8 vport_svlan_insert[0x1]; u8 vport_cvlan_insert_if_not_exist[0x1]; u8 vport_cvlan_insert_overwrite[0x1]; - u8 reserved_at_5[0x1b]; + u8 reserved_at_5[0x19]; + u8 nic_vport_node_guid_modify[0x1]; + u8 nic_vport_port_guid_modify[0x1]; u8 reserved_at_20[0x7e0]; }; @@ -4584,7 +4586,10 @@ struct mlx5_ifc_modify_nic_vport_context_out_bits { }; struct mlx5_ifc_modify_nic_vport_field_select_bits { - u8 reserved_at_0[0x19]; + u8 reserved_at_0[0x16]; + u8 node_guid[0x1]; + u8 port_guid[0x1]; + u8 reserved_at_18[0x1]; u8 mtu[0x1]; u8 change_event[0x1]; u8 promisc[0x1]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 301da4a5e6bf..6c16c198f680 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -50,6 +50,8 @@ int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); +int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, + u32 vport, u64 node_guid); int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, u16 *qkey_viol_cntr); int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, -- cgit v1.2.3 From ef2bf4997f7da6efa8540d9cf726c44bf2b863af Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 27 May 2016 09:45:49 -0700 Subject: pwm: Improve args checking in pwm_apply_state() It seems like in the process of refactoring pwm_config() to utilize the newly-introduced pwm_apply_state() API, some args/bounds checking was dropped. In particular, I noted that we are now allowing invalid period selections, e.g.:

    # echo 1 > /sys/class/pwm/pwmchip0/export
    # cat /sys/class/pwm/pwmchip0/pwm1/period
    100
    # echo 101 > /sys/class/pwm/pwmchip0/pwm1/duty_cycle
    [... driver may or may not reject the value, or trigger some logic bug ...]

It's better to see:

    # echo 1 > /sys/class/pwm/pwmchip0/export
    # cat /sys/class/pwm/pwmchip0/pwm1/period
    100
    # echo 101 > /sys/class/pwm/pwmchip0/pwm1/duty_cycle
    -bash: echo: write error: Invalid argument

This patch reintroduces some bounds checks in both pwm_config() (for its signed parameters; we don't want to convert negative values into large unsigned values) and in pwm_apply_state() (which fixes the above-described behavior, as well as other potential API misuses).
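The reintroduced checks can be modelled stand-alone (struct and field names simplified for illustration; the real code checks struct pwm_state and the signed pwm_config() parameters separately):

    #include <errno.h>
    #include <stdio.h>

    struct state_model { int period; int duty_cycle; };

    static int apply_state_model(const struct state_model *s)
    {
            /* reject NULL state, zero period, negative values and
             * duty cycles longer than the period */
            if (!s || s->period <= 0 || s->duty_cycle < 0 ||
                s->duty_cycle > s->period)
                    return -EINVAL;
            return 0;
    }

    int main(void)
    {
            struct state_model bad = { .period = 100, .duty_cycle = 101 };

            printf("%d\n", apply_state_model(&bad)); /* prints -22 (-EINVAL) */
            return 0;
    }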
Fixes: 5ec803edcb70 ("pwm: Add core infrastructure to allow atomic updates") Signed-off-by: Brian Norris Acked-by: Boris Brezillon Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 3 ++- include/linux/pwm.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index dba3843c53b8..ed337a8c34ab 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -457,7 +457,8 @@ int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state) { int err; - if (!pwm) + if (!pwm || !state || !state->period || + state->duty_cycle > state->period) return -EINVAL; if (!memcmp(state, &pwm->state, sizeof(*state))) diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 17018f3c066e..908b67c847cd 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -235,6 +235,9 @@ static inline int pwm_config(struct pwm_device *pwm, int duty_ns, if (!pwm) return -EINVAL; + if (duty_ns < 0 || period_ns < 0) + return -EINVAL; + pwm_get_state(pwm, &state); if (state.duty_cycle == duty_ns && state.period == period_ns) return 0; -- cgit v1.2.3 From ba65dc5ef16f82fba77869cecf7a7d515f61446b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Jun 2016 11:32:47 -0400 Subject: much milder d_walk() race d_walk() relies upon the tree not getting rearranged under it without rename_lock being touched. And we do grab rename_lock around the places that change the tree topology. Unfortunately, branch reordering is just as bad from d_walk() POV and we have two places that do it without touching rename_lock - one in handling of cursors (for ramfs-style directories) and another in autofs. autofs one is a separate story; this commit deals with the cursors. * mark cursor dentries explicitly at allocation time * make __dentry_kill() leave ->d_child.next pointing to the next non-cursor sibling, making sure that it won't be moved around unnoticed before the parent is relocked on ascend-to-parent path in d_walk(). * make d_walk() skip cursors explicitly; strictly speaking it's not necessary (all callbacks we pass to d_walk() are no-ops on cursors), but it makes analysis easier. Signed-off-by: Al Viro --- fs/dcache.c | 58 ++++++++++++++++++++++++++++++++++++++++++++------ fs/internal.h | 1 + fs/libfs.c | 4 +--- include/linux/dcache.h | 1 + 4 files changed, 55 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 817c243c1ff1..b7eddfd35aa5 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -507,6 +507,44 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); +static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) +{ + struct dentry *next; + /* + * Inform d_walk() and shrink_dentry_list() that we are no longer + * attached to the dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (unlikely(list_empty(&dentry->d_child))) + return; + __list_del_entry(&dentry->d_child); + /* + * Cursors can move around the list of children. While we'd been + * a normal list member, it didn't matter - ->d_child.next would've + * been updated. However, from now on it won't be and for the + * things like d_walk() it might end up with a nasty surprise. + * Normally d_walk() doesn't care about cursors moving around - + * ->d_lock on parent prevents that and since a cursor has no children + * of its own, we get through it without ever unlocking the parent. 
+ * There is one exception, though - if we ascend from a child that + * gets killed as soon as we unlock it, the next sibling is found + * using the value left in its ->d_child.next. And if _that_ + * pointed to a cursor, and cursor got moved (e.g. by lseek()) + * before d_walk() regains parent->d_lock, we'll end up skipping + * everything the cursor had been moved past. + * + * Solution: make sure that the pointer left behind in ->d_child.next + * points to something that won't be moving around. I.e. skip the + * cursors. + */ + while (dentry->d_child.next != &parent->d_subdirs) { + next = list_entry(dentry->d_child.next, struct dentry, d_child); + if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) + break; + dentry->d_child.next = next->d_child.next; + } +} + static void __dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; @@ -532,12 +570,7 @@ static void __dentry_kill(struct dentry *dentry) } /* if it was on the hash then remove it */ __d_drop(dentry); - __list_del_entry(&dentry->d_child); - /* - * Inform d_walk() that we are no longer attached to the - * dentry tree - */ - dentry->d_flags |= DCACHE_DENTRY_KILLED; + dentry_unlist(dentry, parent); if (parent) spin_unlock(&parent->d_lock); dentry_iput(dentry); @@ -1203,6 +1236,9 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_child); next = tmp->next; + if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) + continue; + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); @@ -1651,6 +1687,16 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) } EXPORT_SYMBOL(d_alloc); +struct dentry *d_alloc_cursor(struct dentry * parent) +{ + struct dentry *dentry = __d_alloc(parent->d_sb, NULL); + if (dentry) { + dentry->d_flags |= DCACHE_RCUACCESS | DCACHE_DENTRY_CURSOR; + dentry->d_parent = dget(parent); + } + return dentry; +} + /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock diff --git a/fs/internal.h b/fs/internal.h index b71deeecea17..f57ced528cde 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -130,6 +130,7 @@ extern int invalidate_inodes(struct super_block *, bool); extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); +extern struct dentry *d_alloc_cursor(struct dentry *); /* * read_write.c diff --git a/fs/libfs.c b/fs/libfs.c index 3db2721144c2..cedeacbae303 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -71,9 +71,7 @@ EXPORT_SYMBOL(simple_lookup); int dcache_dir_open(struct inode *inode, struct file *file) { - static struct qstr cursor_name = QSTR_INIT(".", 1); - - file->private_data = d_alloc(file->f_path.dentry, &cursor_name); + file->private_data = d_alloc_cursor(file->f_path.dentry); return file->private_data ? 0 : -ENOMEM; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 484c8792da82..bcd0c64e3ed8 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -212,6 +212,7 @@ struct dentry_operations { #define DCACHE_OP_REAL 0x08000000 #define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */ +#define DCACHE_DENTRY_CURSOR 0x20000000 extern seqlock_t rename_lock; -- cgit v1.2.3 From d50039ea5ee63c589b0434baa5ecf6e5075bb6f9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 16 May 2016 17:03:42 -0400 Subject: nfsd4/rpc: move backchannel create logic into rpc code Also simplify the logic a bit. 
Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields Acked-by: Trond Myklebust --- fs/nfsd/nfs4callback.c | 18 +----------------- include/linux/sunrpc/clnt.h | 2 -- net/sunrpc/clnt.c | 12 ++++++++++-- 3 files changed, 11 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7389cb1d7409..04c68d900324 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -710,22 +710,6 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc } } -static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) -{ - struct rpc_xprt *xprt; - - if (args->protocol != XPRT_TRANSPORT_BC_TCP) - return rpc_create(args); - - xprt = args->bc_xprt->xpt_bc_xprt; - if (xprt) { - xprt_get(xprt); - return rpc_create_xprt(args, xprt); - } - - return rpc_create(args); -} - static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { int maxtime = max_cb_time(clp->net); @@ -768,7 +752,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = ses->se_cb_sec.flavor; } /* Create RPC client */ - client = create_backchannel_client(&args); + client = rpc_create(&args); if (IS_ERR(client)) { dprintk("NFSD: couldn't create callback client: %ld\n", PTR_ERR(client)); diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 19c659d1c0f8..b6810c92b8bb 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -137,8 +137,6 @@ struct rpc_create_args { #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) struct rpc_clnt *rpc_create(struct rpc_create_args *args); -struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, - struct rpc_xprt *xprt); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, const struct rpc_program *, u32); struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 173c5dd2d751..b33721d41adc 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -446,7 +446,7 @@ out_no_rpciod: return ERR_PTR(err); } -struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, +static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, struct rpc_xprt *xprt) { struct rpc_clnt *clnt = NULL; @@ -484,7 +484,6 @@ struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, return clnt; } -EXPORT_SYMBOL_GPL(rpc_create_xprt); /** * rpc_create - create an RPC client and transport with one call @@ -510,6 +509,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) }; char servername[48]; + if (args->bc_xprt) { + WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP); + xprt = args->bc_xprt->xpt_bc_xprt; + if (xprt) { + xprt_get(xprt); + return rpc_create_xprt(args, xprt); + } + } + if (args->flags & RPC_CLNT_CREATE_INFINITE_SLOTS) xprtargs.flags |= XPRT_CREATE_INFINITE_SLOTS; if (args->flags & RPC_CLNT_CREATE_NO_IDLE_TIMEOUT) -- cgit v1.2.3 From 39a9beab5acb83176e8b9a4f0778749a09341f1f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 May 2016 12:38:21 -0400 Subject: rpc: share one xps between all backchannels The spec allows backchannels for multiple clients to share the same tcp connection. When that happens, we need to use the same xprt for all of them. Similarly, we need the same xps. This fixes list corruption introduced by the multipath code. Cc: stable@vger.kernel.org Signed-off-by: J. 
Bruce Fields Acked-by: Trond Myklebust --- include/linux/sunrpc/svc_xprt.h | 1 + include/linux/sunrpc/xprt.h | 1 + net/sunrpc/clnt.c | 18 ++++++++++++++---- net/sunrpc/svc_xprt.c | 2 ++ net/sunrpc/xprtsock.c | 1 + 5 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index b7dabc4baafd..79ba50856707 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -84,6 +84,7 @@ struct svc_xprt { struct net *xpt_net; struct rpc_xprt *xpt_bc_xprt; /* NFSv4.1 backchannel */ + struct rpc_xprt_switch *xpt_bc_xps; /* NFSv4.1 backchannel */ }; static inline void unregister_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 5aa3834619a8..5e3e1b63dbb3 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -297,6 +297,7 @@ struct xprt_create { size_t addrlen; const char *servername; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ + struct rpc_xprt_switch *bc_xps; unsigned int flags; }; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b33721d41adc..2808d550d273 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -452,10 +452,20 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, struct rpc_clnt *clnt = NULL; struct rpc_xprt_switch *xps; - xps = xprt_switch_alloc(xprt, GFP_KERNEL); - if (xps == NULL) { - xprt_put(xprt); - return ERR_PTR(-ENOMEM); + if (args->bc_xprt && args->bc_xprt->xpt_bc_xps) { + WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP); + xps = args->bc_xprt->xpt_bc_xps; + xprt_switch_get(xps); + } else { + xps = xprt_switch_alloc(xprt, GFP_KERNEL); + if (xps == NULL) { + xprt_put(xprt); + return ERR_PTR(-ENOMEM); + } + if (xprt->bc_xprt) { + xprt_switch_get(xps); + xprt->bc_xprt->xpt_bc_xps = xps; + } } clnt = rpc_new_client(args, xps, xprt, NULL); if (IS_ERR(clnt)) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index f5572e31d518..4f01f63102ee 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -136,6 +136,8 @@ static void svc_xprt_free(struct kref *kref) /* See comment on corresponding get in xs_setup_bc_tcp(): */ if (xprt->xpt_bc_xprt) xprt_put(xprt->xpt_bc_xprt); + if (xprt->xpt_bc_xps) + xprt_switch_put(xprt->xpt_bc_xps); xprt->xpt_ops->xpo_free(xprt); module_put(owner); } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2d3e0c42361e..7e2b2fa189c3 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -3057,6 +3057,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) return xprt; args->bc_xprt->xpt_bc_xprt = NULL; + args->bc_xprt->xpt_bc_xps = NULL; xprt_put(xprt); ret = ERR_PTR(-EINVAL); out_err: -- cgit v1.2.3 From 3a4955111ad46a022f05b51f91306d864f989625 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Fri, 27 May 2016 18:08:27 -0400 Subject: isa: Allow ISA-style drivers on modern systems Several modern devices, such as PC/104 cards, are expected to run on modern systems via an ISA bus interface. Since ISA is a legacy interface for most modern architectures, ISA support should remain disabled in general. Support for ISA-style drivers should be enabled on a per driver basis. To allow ISA-style drivers on modern systems, this patch introduces the ISA_BUS_API and ISA_BUS Kconfig options. The ISA bus driver will now build conditionally on the ISA_BUS_API Kconfig option, which defaults to the legacy ISA Kconfig option. 
The ISA_BUS Kconfig option allows the ISA_BUS_API Kconfig option to be selected on architectures which do not enable ISA (e.g. X86_64). The ISA_BUS Kconfig option is currently only implemented for X86 architectures. Other architectures may have their own ISA_BUS Kconfig options added as required. Reviewed-by: Guenter Roeck Signed-off-by: William Breathitt Gray Acked-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 9 +++++++++ drivers/base/Makefile | 2 +- include/linux/isa.h | 2 +- 4 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index d794384a0404..e9734796531f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -606,6 +606,9 @@ config HAVE_ARCH_HASH file which provides platform-specific implementations of some functions in or fs/namei.c. +config ISA_BUS_API + def_bool ISA + # # ABI hall of shame # diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0a7b885964ba..d9a94da0c29f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2439,6 +2439,15 @@ config PCI_CNB20LE_QUIRK source "drivers/pci/Kconfig" +config ISA_BUS + bool "ISA-style bus support on modern systems" if EXPERT + select ISA_BUS_API + help + Enables ISA-style drivers on modern systems. This is necessary to + support PC/104 devices on X86_64 platforms. + + If unsure, say N. + # x86_64 have no ISA slots, but can have ISA-style DMA. config ISA_DMA_API bool "ISA-style DMA support" if (X86_64 && EXPERT) diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 6b2a84e7f2be..2609ba20b396 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_DMA_CMA) += dma-contiguous.o obj-y += power/ obj-$(CONFIG_HAS_DMA) += dma-mapping.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o -obj-$(CONFIG_ISA) += isa.o +obj-$(CONFIG_ISA_BUS_API) += isa.o obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o diff --git a/include/linux/isa.h b/include/linux/isa.h index 5ab85281230b..384ab9b7d79a 100644 --- a/include/linux/isa.h +++ b/include/linux/isa.h @@ -22,7 +22,7 @@ struct isa_driver { #define to_isa_driver(x) container_of((x), struct isa_driver, driver) -#ifdef CONFIG_ISA +#ifdef CONFIG_ISA_BUS_API int isa_register_driver(struct isa_driver *, unsigned int); void isa_unregister_driver(struct isa_driver *); #else -- cgit v1.2.3 From 5e25db870ec983be138b343a3d04c79a5c1f1703 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Mon, 9 May 2016 09:39:50 -0400 Subject: isa: Dummy isa_register_driver should return error code The inline isa_register_driver stub simply allows compilation on systems with CONFIG_ISA disabled; the dummy isa_register_driver does not register an isa_driver at all. The inline isa_register_driver should return -ENODEV to indicate lack of support when attempting to register an isa_driver on such a system with CONFIG_ISA disabled. 
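The effect on a caller can be modelled in user space (all names here are invented; only the return convention matters):

    #include <errno.h>
    #include <stdio.h>

    struct isa_driver_model { const char *name; };

    /* stub semantics after this patch: fail loudly instead of
     * pretending the driver was registered */
    static inline int isa_register_driver_model(struct isa_driver_model *d,
                                                unsigned int ndev)
    {
            (void)d;
            (void)ndev;
            return -ENODEV;         /* bus support compiled out */
    }

    int main(void)
    {
            struct isa_driver_model drv = { .name = "demo" };
            int ret = isa_register_driver_model(&drv, 1);

            if (ret)
                    fprintf(stderr, "demo: ISA registration failed (%d)\n", ret);
            return 0;
    }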
Cc: Matthew Wilcox Reported-by: Sasha Levin Tested-by: Ye Xiaolong Signed-off-by: William Breathitt Gray Signed-off-by: Greg Kroah-Hartman --- include/linux/isa.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/isa.h b/include/linux/isa.h index 384ab9b7d79a..f2d0258414cf 100644 --- a/include/linux/isa.h +++ b/include/linux/isa.h @@ -6,6 +6,7 @@ #define __LINUX_ISA_H #include <linux/device.h> +#include <linux/errno.h> #include <linux/kobject.h> struct isa_driver { @@ -28,7 +29,7 @@ void isa_unregister_driver(struct isa_driver *); #else static inline int isa_register_driver(struct isa_driver *d, unsigned int i) { - return 0; + return -ENODEV; } static inline void isa_unregister_driver(struct isa_driver *d) -- cgit v1.2.3 From c9b254955b9f8814966f5dabd34c39d0e0a2b437 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 22 Jun 2016 17:27:26 +0300 Subject: IB/mlx5: Fix post send fence logic If the caller specified IB_SEND_FENCE in the send flags of the work request and no previous work request stated that the successive one should be fenced, the work request would be executed without a fence. This could result in failures of RDMA read or atomic operations due to an MR being invalidated. Fix this by adding the mlx5 enumeration for fencing RDMA/atomic operations and fixing the logic that applies it. Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters') Signed-off-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/qp.c | 7 ++++--- include/linux/mlx5/qp.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ce434228a5ea..ce0a7ab35a22 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3332,10 +3332,11 @@ static u8 get_fence(u8 fence, struct ib_send_wr *wr) return MLX5_FENCE_MODE_SMALL_AND_FENCE; else return fence; - - } else { - return 0; + } else if (unlikely(wr->send_flags & IB_SEND_FENCE)) { + return MLX5_FENCE_MODE_FENCE; } + + return 0; } static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index e4e29882fdfd..630f66a186b7 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -172,6 +172,7 @@ enum { enum { MLX5_FENCE_MODE_NONE = 0 << 5, MLX5_FENCE_MODE_INITIATOR_SMALL = 1 << 5, + MLX5_FENCE_MODE_FENCE = 2 << 5, MLX5_FENCE_MODE_STRONG_ORDERING = 3 << 5, MLX5_FENCE_MODE_SMALL_AND_FENCE = 4 << 5, }; -- cgit v1.2.3 From 4c5ea0a9cd02d6aa8adc86e100b2a4cff8d614ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Jun 2016 18:52:17 +0200 Subject: locking/static_key: Fix concurrent static_key_slow_inc() The following scenario is possible:

    CPU 1                                  CPU 2

    static_key_slow_inc()
     atomic_inc_not_zero()
      -> key.enabled == 0, no increment
     jump_label_lock()
     atomic_inc_return()
      -> key.enabled == 1 now
                                           static_key_slow_inc()
                                            atomic_inc_not_zero()
                                             -> key.enabled == 1, inc to 2
                                            return
                                           ** static key is wrong!
     jump_label_update()
     jump_label_unlock()

Testing the static key at the point marked by (**) will follow the wrong path for jumps that have not been patched yet.
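The protocol of the fix below can be modelled in user space with C11 atomics (a single-threaded sketch; the real code additionally serializes the slow path with jump_label_lock()):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int enabled;      /* models key.enabled */

    static void slow_inc_model(void)
    {
            int v = atomic_load(&enabled);

            /* fast path only once the key is fully enabled (> 0);
             * 0 and -1 both fall through to the slow path */
            while (v > 0) {
                    if (atomic_compare_exchange_weak(&enabled, &v, v + 1))
                            return;
            }

            /* slow path: jump_label_lock() would be taken here */
            if (atomic_load(&enabled) == 0) {
                    atomic_store(&enabled, -1);     /* patching in progress */
                    /* ... jump_label_update() runs here ... */
                    atomic_store(&enabled, 1);
            } else {
                    atomic_fetch_add(&enabled, 1);
            }
    }

    int main(void)
    {
            slow_inc_model();
            slow_inc_model();
            printf("enabled = %d\n", atomic_load(&enabled)); /* 2 */
            return 0;
    }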
This can actually happen when creating many KVM virtual machines with userspace LAPIC emulation; just run several copies of the following program:

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/kvm.h>

    int main(void)
    {
            for (;;) {
                    int kvmfd = open("/dev/kvm", O_RDONLY);
                    int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);

                    close(ioctl(vmfd, KVM_CREATE_VCPU, 1));
                    close(vmfd);
                    close(kvmfd);
            }
            return 0;
    }

Every KVM_CREATE_VCPU ioctl will attempt a static_key_slow_inc() call. The static key's purpose is to skip NULL pointer checks and indeed one of the processes eventually dereferences NULL. As explained in the commit that introduced the bug: 706249c222f6 ("locking/static_keys: Rework update logic") jump_label_update() needs key.enabled to be true. The solution adopted here is to temporarily make key.enabled == -1, and to go down the slow path when key.enabled <= 0. Reported-by: Dmitry Vyukov Signed-off-by: Paolo Bonzini Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org # v4.3+ Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 706249c222f6 ("locking/static_keys: Rework update logic") Link: http://lkml.kernel.org/r/1466527937-69798-1-git-send-email-pbonzini@redhat.com [ Small stylistic edits to the changelog and the code. ] Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 16 +++++++++++++--- kernel/jump_label.c | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 0536524bb9eb..68904469fba1 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -117,13 +117,18 @@ struct module; #include <linux/atomic.h> +#ifdef HAVE_JUMP_LABEL + static inline int static_key_count(struct static_key *key) { - return atomic_read(&key->enabled); + /* + * -1 means the first static_key_slow_inc() is in progress. + * static_key_enabled() must return true, so return 1 here. + */ + int n = atomic_read(&key->enabled); + return n >= 0 ? n : 1; } -#ifdef HAVE_JUMP_LABEL - #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_MASK 1UL @@ -162,6 +167,11 @@ extern void jump_label_apply_nops(struct module *mod); #else /* !HAVE_JUMP_LABEL */ +static inline int static_key_count(struct static_key *key) +{ + return atomic_read(&key->enabled); +} + static __always_inline void jump_label_init(void) { static_key_initialized = true; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..4b353e0be121 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { + int v, v1; + STATIC_KEY_CHECK_USE(); - if (atomic_inc_not_zero(&key->enabled)) - return; + + /* + * Careful if we get concurrent static_key_slow_inc() calls; + * later calls must wait for the first one to _finish_ the + * jump_label_update() process. At the same time, however, + * the jump_label_update() call below wants to see + * static_key_enabled(&key) for jumps to be updated properly. + * + * So give a special meaning to negative key->enabled: it sends + * static_key_slow_inc() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). Note that + * atomic_inc_unless_negative() checks >= 0, so roll our own.
+ */ + for (v = atomic_read(&key->enabled); v > 0; v = v1) { + v1 = atomic_cmpxchg(&key->enabled, v, v + 1); + if (likely(v1 == v)) + return; + } jump_label_lock(); - if (atomic_inc_return(&key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); jump_label_update(key); + atomic_set(&key->enabled, 1); + } else { + atomic_inc(&key->enabled); + } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + /* + * The negative count check is valid even when a negative + * key->enabled is in use by static_key_slow_inc(); a + * __static_key_slow_dec() before the first static_key_slow_inc() + * returns is unbalanced, because all other static_key_slow_inc() + * instances block while the update is in progress. + */ if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); -- cgit v1.2.3 From b235beea9e996a4d36fed6cfef4801a3e7d7a9a5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 24 Jun 2016 15:09:37 -0700 Subject: Clarify naming of thread info/stack allocators We've had the thread info allocated together with the thread stack for most architectures for a long time (since the thread_info was split off from the task struct), but that is about to change. But the patches that move the thread info to be off-stack (and a part of the task struct instead) made it clear how confused the allocator and freeing functions are. Because the common case was that we share an allocation with the thread stack and the thread_info, the two pointers were identical. That identity then meant that we would have things like ti = alloc_thread_info_node(tsk, node); ... tsk->stack = ti; which certainly _worked_ (since stack and thread_info have the same value), but is rather confusing: why are we assigning a thread_info to the stack? And if we move the thread_info away, the "confusing" code just gets to be entirely bogus. So remove all this confusion, and make it clear that we are doing the stack allocation by renaming and clarifying the function names to be about the stack. The fact that the thread_info then shares the allocation is an implementation detail, and not really about the allocation itself. This is a pure renaming and type fix: we pass in the same pointer, it's just that we clarify what the pointer means. The ia64 code that actually only has one single allocation (for all of task_struct, thread_info and kernel thread stack) now looks a bit odd, but since "tsk->stack" is actually not even used there, that oddity doesn't matter. It would be a separate thing to clean that up, I intentionally left the ia64 changes as a pure brute-force renaming and type change. 
Acked-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- arch/Kconfig | 4 +-- arch/ia64/Kconfig | 2 +- arch/ia64/include/asm/thread_info.h | 8 +++--- arch/mn10300/include/asm/thread_info.h | 2 +- arch/mn10300/kernel/kgdb.c | 3 +- arch/tile/include/asm/thread_info.h | 2 +- arch/tile/kernel/process.c | 3 +- include/linux/sched.h | 2 +- init/main.c | 4 +-- kernel/fork.c | 50 +++++++++++++++++----------------- 10 files changed, 41 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index e9734796531f..15996290fed4 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -226,8 +226,8 @@ config ARCH_INIT_TASK config ARCH_TASK_STRUCT_ALLOCATOR bool -# Select if arch has its private alloc_thread_info() function -config ARCH_THREAD_INFO_ALLOCATOR +# Select if arch has its private alloc_thread_stack() function +config ARCH_THREAD_STACK_ALLOCATOR bool # Select if arch wants to size task_struct dynamically via arch_task_struct_size: diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index f80758cb7157..e109ee95e919 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -45,7 +45,7 @@ config IA64 select GENERIC_SMP_IDLE_THREAD select ARCH_INIT_TASK select ARCH_TASK_STRUCT_ALLOCATOR - select ARCH_THREAD_INFO_ALLOCATOR + select ARCH_THREAD_STACK_ALLOCATOR select ARCH_CLOCKSOURCE_DATA select GENERIC_TIME_VSYSCALL_OLD select SYSCTL_ARCH_UNALIGN_NO_WARN diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index aa995b67c3f5..d1212b84fb83 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -48,15 +48,15 @@ struct thread_info { #ifndef ASM_OFFSETS_C /* how to get the thread information struct from C */ #define current_thread_info() ((struct thread_info *) ((char *) current + IA64_TASK_SIZE)) -#define alloc_thread_info_node(tsk, node) \ - ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) +#define alloc_thread_stack_node(tsk, node) \ + ((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE)) #define task_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) #else #define current_thread_info() ((struct thread_info *) 0) -#define alloc_thread_info_node(tsk, node) ((struct thread_info *) 0) +#define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) #define task_thread_info(tsk) ((struct thread_info *) 0) #endif -#define free_thread_info(ti) /* nothing */ +#define free_thread_stack(ti) /* nothing */ #define task_stack_page(tsk) ((void *)(tsk)) #define __HAVE_THREAD_FUNCTIONS diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index 4861a78c7160..f5f90bbf019d 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -115,7 +115,7 @@ static inline unsigned long current_stack_pointer(void) } #ifndef CONFIG_KGDB -void arch_release_thread_info(struct thread_info *ti); +void arch_release_thread_stack(unsigned long *stack); #endif #define get_thread_info(ti) get_task_struct((ti)->task) #define put_thread_info(ti) put_task_struct((ti)->task) diff --git a/arch/mn10300/kernel/kgdb.c b/arch/mn10300/kernel/kgdb.c index 99770823451a..2d7986c386fe 100644 --- a/arch/mn10300/kernel/kgdb.c +++ b/arch/mn10300/kernel/kgdb.c @@ -397,8 +397,9 @@ static bool kgdb_arch_undo_singlestep(struct pt_regs *regs) * single-step state is cleared. At this point the breakpoints should have * been removed by __switch_to(). 
*/ -void arch_release_thread_info(struct thread_info *ti) +void arch_release_thread_stack(unsigned long *stack) { + struct thread_info *ti = (void *)stack; if (kgdb_sstep_thread == ti) { kgdb_sstep_thread = NULL; diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 4b7cef9e94e0..c1467ac59ce6 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -78,7 +78,7 @@ struct thread_info { #ifndef __ASSEMBLY__ -void arch_release_thread_info(struct thread_info *info); +void arch_release_thread_stack(unsigned long *stack); /* How to get the thread information struct from C. */ register unsigned long stack_pointer __asm__("sp"); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 6b705ccc9cc1..a465d8372edd 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -73,8 +73,9 @@ void arch_cpu_idle(void) /* * Release a thread_info structure */ -void arch_release_thread_info(struct thread_info *info) +void arch_release_thread_stack(unsigned long *stack) { + struct thread_info *info = (void *)stack; struct single_step_state *step_state = info->step_state; if (step_state) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e42ada26345..253538f29ade 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3007,7 +3007,7 @@ static inline int object_is_on_stack(void *obj) return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } -extern void thread_info_cache_init(void); +extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE static inline unsigned long stack_not_used(struct task_struct *p) diff --git a/init/main.c b/init/main.c index 4c17fda5c2ff..826fd57fa3aa 100644 --- a/init/main.c +++ b/init/main.c @@ -453,7 +453,7 @@ void __init __weak smp_setup_processor_id(void) } # if THREAD_SIZE >= PAGE_SIZE -void __init __weak thread_info_cache_init(void) +void __init __weak thread_stack_cache_init(void) { } #endif @@ -627,7 +627,7 @@ asmlinkage __visible void __init start_kernel(void) /* Should be run before the first non-init thread is created */ init_espfix_bsp(); #endif - thread_info_cache_init(); + thread_stack_cache_init(); cred_init(); fork_init(); proc_caches_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 5c2c355aa97f..37b9439b8c07 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -148,18 +148,18 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack) { } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, @@ -172,33 +172,33 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, return page ? 
page_address(page) : NULL; } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack) { - struct page *page = virt_to_page(ti); + struct page *page = virt_to_page(stack); memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, -(1 << THREAD_SIZE_ORDER)); __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); + return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_info(struct thread_info *ti) +static void free_stack(unsigned long *stack) { - kmem_cache_free(thread_info_cache, ti); + kmem_cache_free(thread_stack_cache, stack); } -void thread_info_cache_init(void) +void thread_stack_cache_init(void) { - thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, NULL); - BUG_ON(thread_info_cache == NULL); + BUG_ON(thread_stack_cache == NULL); } # endif #endif @@ -221,9 +221,9 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(ti)); + struct zone *zone = page_zone(virt_to_page(stack)); mod_zone_page_state(zone, NR_KERNEL_STACK, account); } @@ -231,8 +231,8 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); - arch_release_thread_info(tsk->stack); - free_thread_info(tsk->stack); + arch_release_thread_stack(tsk->stack); + free_thread_stack(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -343,7 +343,7 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - struct thread_info *ti; + unsigned long *stack; int err; if (node == NUMA_NO_NODE) @@ -352,15 +352,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - ti = alloc_thread_info_node(tsk, node); - if (!ti) + stack = alloc_thread_stack_node(tsk, node); + if (!stack) goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto free_ti; + goto free_stack; - tsk->stack = ti; + tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -392,14 +392,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(ti, 1); + account_kernel_stack(stack, 1); kcov_task_init(tsk); return tsk; -free_ti: - free_thread_info(ti); +free_stack: + free_thread_stack(stack); free_tsk: free_task_struct(tsk); return NULL; -- cgit v1.2.3 From 7f1a00b6fcd0e3c19beba2e92d157dc0c2cf3494 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 24 Jun 2016 17:07:33 -0700 Subject: fix up initial thread stack pointer vs thread_info confusion The INIT_TASK() initializer was similarly confused about the stack vs thread_info allocation 
that the allocators had, and that was fixed in commit b235beea9e99 ("Clarify naming of thread info/stack allocators"). The task ->stack pointer only incidentally ends up having the same value as the thread_info, and in fact that will change. So fix the initial task struct initializer to point to 'init_stack' instead of 'init_thread_info', and make sure the ia64 definition for that exists. This actually makes the ia64 tsk->stack pointer sensible for the initial task, but not for any other task. As mentioned in commit b235beea9e99, that whole pointer isn't actually used on ia64, since task_stack_page() there just points to the (single) allocation. All the other architectures seem to have copied the 'init_stack' definition, even though it tended to be generally unused. Signed-off-by: Linus Torvalds --- arch/ia64/kernel/init_task.c | 1 + include/linux/init_task.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c index f9efe9739d3f..0eaa89f3defd 100644 --- a/arch/ia64/kernel/init_task.c +++ b/arch/ia64/kernel/init_task.c @@ -26,6 +26,7 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * handled. This is done by having a special ".data..init_task" section... */ #define init_thread_info init_task_mem.s.thread_info +#define init_stack init_task_mem.stack union { struct { diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f2cb8d45513d..f8834f820ec2 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -190,7 +190,7 @@ extern struct task_group root_task_group; #define INIT_TASK(tsk) \ { \ .state = 0, \ - .stack = &init_thread_info, \ + .stack = init_stack, \ .usage = ATOMIC_INIT(2), \ .flags = PF_KTHREAD, \ .prio = MAX_PRIO-20, \ -- cgit v1.2.3 From 9b75a867cc9ddbafcaf35029358ac500f2635ff3 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 24 Jun 2016 14:49:34 -0700 Subject: mm: mempool: kasan: don't put mempool objects in quarantine Currently we may put elements reserved by a mempool into quarantine via kasan_kfree(). This is wrong, since the quarantine may really free these objects: when the mempool later tries to use such an element, a use-after-free will happen. Or the mempool may decide that it no longer needs the element and double-free it. So don't put the object into quarantine in kasan_kfree(); just poison it. Rename kasan_kfree() to kasan_poison_kfree() to reflect that. Also, we shouldn't use kasan_slab_alloc()/kasan_krealloc() in kasan_unpoison_element(), because those functions may update the allocation stacktrace. This would be wrong for most of the remove_element() call sites. (The only call site where we may want to update the alloc stacktrace is mempool_alloc(). Kmemleak solves this by calling kmemleak_update_trace(), so we could do something similar. But that is out of the scope of this patch.)
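
To make the failure mode concrete, here is a minimal sketch of the broken
sequence (illustrative only, not code from this patch; it assumes 'pool' is
a kmalloc-backed mempool that is down to its reserved elements):

	void *element = mempool_alloc(pool, GFP_KERNEL);
	/* Freeing refills the pool's reserve... */
	mempool_free(element, pool);
	/* ...but the old kasan_kfree() also placed 'element' into the
	 * KASAN quarantine, which may later genuinely kfree() it. */
	element = mempool_alloc(pool, GFP_KERNEL);
	/* The reserve now hands back freed memory: use-after-free. */
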
Fixes: 55834c59098d ("mm: kasan: initial memory quarantine implementation") Link: http://lkml.kernel.org/r/575977C3.1010905@virtuozzo.com Signed-off-by: Andrey Ryabinin Reported-by: Kuthonuzo Luruo Acked-by: Alexander Potapenko Cc: Dmitriy Vyukov Cc: Kostya Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 11 +++++++---- mm/kasan/kasan.c | 6 +++--- mm/mempool.c | 12 ++++-------- 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 611927f5870d..ac4b3c46a84d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -59,14 +59,13 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object); void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); void kasan_kfree_large(const void *ptr); -void kasan_kfree(void *ptr); +void kasan_poison_kfree(void *ptr); void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags); void kasan_krealloc(const void *object, size_t new_size, gfp_t flags); void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); bool kasan_slab_free(struct kmem_cache *s, void *object); -void kasan_poison_slab_free(struct kmem_cache *s, void *object); struct kasan_cache { int alloc_meta_offset; @@ -76,6 +75,9 @@ struct kasan_cache { int kasan_module_alloc(void *addr, size_t size); void kasan_free_shadow(const struct vm_struct *vm); +size_t ksize(const void *); +static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } + #else /* CONFIG_KASAN */ static inline void kasan_unpoison_shadow(const void *address, size_t size) {} @@ -102,7 +104,7 @@ static inline void kasan_poison_object_data(struct kmem_cache *cache, static inline void kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) {} static inline void kasan_kfree_large(const void *ptr) {} -static inline void kasan_kfree(void *ptr) {} +static inline void kasan_poison_kfree(void *ptr) {} static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags) {} static inline void kasan_krealloc(const void *object, size_t new_size, @@ -114,11 +116,12 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object) { return false; } -static inline void kasan_poison_slab_free(struct kmem_cache *s, void *object) {} static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} +static inline void kasan_unpoison_slab(const void *ptr) { } + #endif /* CONFIG_KASAN */ #endif /* LINUX_KASAN_H */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 28439acda6ec..6845f9294696 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -508,7 +508,7 @@ void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) kasan_kmalloc(cache, object, cache->object_size, flags); } -void kasan_poison_slab_free(struct kmem_cache *cache, void *object) +static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) { unsigned long size = cache->object_size; unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); @@ -626,7 +626,7 @@ void kasan_krealloc(const void *object, size_t size, gfp_t flags) kasan_kmalloc(page->slab_cache, object, size, flags); } -void kasan_kfree(void *ptr) +void kasan_poison_kfree(void *ptr) { struct page *page; @@ -636,7 +636,7 @@ void kasan_kfree(void *ptr) kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), KASAN_FREE_PAGE); else - 
kasan_slab_free(page->slab_cache, ptr); + kasan_poison_slab_free(page->slab_cache, ptr); } void kasan_kfree_large(const void *ptr) diff --git a/mm/mempool.c b/mm/mempool.c index 9e075f829d0d..8f65464da5de 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -104,20 +104,16 @@ static inline void poison_element(mempool_t *pool, void *element) static void kasan_poison_element(mempool_t *pool, void *element) { - if (pool->alloc == mempool_alloc_slab) - kasan_poison_slab_free(pool->pool_data, element); - if (pool->alloc == mempool_kmalloc) - kasan_kfree(element); + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + kasan_poison_kfree(element); if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) { - if (pool->alloc == mempool_alloc_slab) - kasan_slab_alloc(pool->pool_data, element, flags); - if (pool->alloc == mempool_kmalloc) - kasan_krealloc(element, (size_t)pool->pool_data, flags); + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + kasan_unpoison_slab(element); if (pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } -- cgit v1.2.3 From 315d09bf30c2b436a1fdac86d31c24380cd56c4f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 24 Jun 2016 14:49:45 -0700 Subject: Revert "mm: make faultaround produce old ptes" This reverts commit 5c0a85fad949212b3e059692deecdeed74ae7ec7. The commit causes a ~6% regression in unixbench. Let's revert it for now and consider another solution for the reclaim problem later. Link: http://lkml.kernel.org/r/1465893750-44080-2-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reported-by: "Huang, Ying" Cc: Linus Torvalds Cc: Rik van Riel Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Vinayak Menon Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/filemap.c | 2 +- mm/memory.c | 23 +++++------------------ 3 files changed, 7 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5df5feb49575..ece042dfe23c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -602,7 +602,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) } void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon, bool old); + struct page *page, pte_t *pte, bool write, bool anon); #endif /* diff --git a/mm/filemap.c b/mm/filemap.c index 00ae878b2a38..20f3b1f33f0e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2186,7 +2186,7 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; - do_set_pte(vma, addr, page, pte, false, false, true); + do_set_pte(vma, addr, page, pte, false, false); unlock_page(page); goto next; unlock: diff --git a/mm/memory.c b/mm/memory.c index 15322b73636b..61fe7e7b56bf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2877,7 +2877,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, * vm_ops->map_pages.
*/ void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon, bool old) + struct page *page, pte_t *pte, bool write, bool anon) { pte_t entry; @@ -2885,8 +2885,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (old) - entry = pte_mkold(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address, false); @@ -3032,20 +3030,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!pte_same(*pte, orig_pte)) - goto unlock_out; do_fault_around(vma, address, pte, pgoff, flags); - /* Check if the fault is handled by faultaround */ - if (!pte_same(*pte, orig_pte)) { - /* - * Faultaround produce old pte, but the pte we've - * handler fault for should be young. - */ - pte_t entry = pte_mkyoung(*pte); - if (ptep_set_access_flags(vma, address, pte, entry, 0)) - update_mmu_cache(vma, address, pte); + if (!pte_same(*pte, orig_pte)) goto unlock_out; - } pte_unmap_unlock(pte, ptl); } @@ -3060,7 +3047,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, false, false, false); + do_set_pte(vma, address, fault_page, pte, false, false); unlock_page(fault_page); unlock_out: pte_unmap_unlock(pte, ptl); @@ -3111,7 +3098,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } goto uncharge_out; } - do_set_pte(vma, address, new_page, pte, true, true, false); + do_set_pte(vma, address, new_page, pte, true, true); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); @@ -3164,7 +3151,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, true, false, false); + do_set_pte(vma, address, fault_page, pte, true, false); pte_unmap_unlock(pte, ptl); if (set_page_dirty(fault_page)) -- cgit v1.2.3
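
As a footnote to the revert above: the dropped 'old' flag made do_set_pte()
install pre-aged page table entries for faultaround, so pages mapped
speculatively but never touched would not look recently used to reclaim.
A hedged sketch of the mechanics (not kernel source; it assumes an
architecture where mk_pte() yields a young entry):

	pte_t entry = mk_pte(page, vma->vm_page_prot);	/* "young" entry */
	/* The reverted commit aged faultaround entries up front: */
	entry = pte_mkold(entry);	/* clear the accessed bit */
	set_pte_at(vma->vm_mm, address, pte, entry);
	update_mmu_cache(vma, address, pte);

Keeping the entries young (the behaviour restored here) avoids the reported
unixbench regression, at the cost of making faultaround pages look the same
to reclaim as genuinely accessed ones.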