From 723c038475b78edc9327eb952f95f9881cc9d79d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 30 Oct 2016 11:42:02 -0500
Subject: fs: remove the never implemented aio_fsync file operation

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 16d2b6e874d6..ff7bcd9e8398 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1709,7 +1709,6 @@ struct file_operations {
 	int (*flush) (struct file *, fl_owner_t id);
 	int (*release) (struct inode *, struct file *);
 	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
-	int (*aio_fsync) (struct kiocb *, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
-- 
cgit v1.2.3


From 70fe2f48152e60664809e2fed76bbb50c9fa2aa3 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 30 Oct 2016 11:42:04 -0500
Subject: aio: fix freeze protection of aio writes

Currently we dropped freeze protection of aio writes just after IO was
submitted. Thus aio write could be in flight while the filesystem was
frozen and that could result in unexpected situation like aio completion
wanting to convert extent type on frozen filesystem. Testcase from
Dmitry triggering this is like:

for ((i=0;i<60;i++));do fsfreeze -f /mnt ;sleep 1;fsfreeze -u /mnt;done &
fio --bs=4k --ioengine=libaio --iodepth=128 --size=1g --direct=1 \
    --runtime=60 --filename=/mnt/file --name=rand-write --rw=randwrite

Fix the problem by dropping freeze protection only once IO is completed
in aio_complete().

Reported-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Jan Kara <jack@suse.cz>
[hch: forward ported on top of various VFS and aio changes]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/aio.c           | 19 ++++++++++++++++++-
 include/linux/fs.h |  1 +
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/aio.c b/fs/aio.c
index c19755187ca5..428484f2f841 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1078,6 +1078,17 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
 	unsigned tail, pos, head;
 	unsigned long	flags;
 
+	if (kiocb->ki_flags & IOCB_WRITE) {
+		struct file *file = kiocb->ki_filp;
+
+		/*
+		 * Tell lockdep we inherited freeze protection from submission
+		 * thread.
+		 */
+		__sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+		file_end_write(file);
+	}
+
 	/*
 	 * Special case handling for sync iocbs:
 	 *  - events go directly into the iocb for fast handling
@@ -1473,9 +1484,15 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
 		return ret;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {
+		req->ki_flags |= IOCB_WRITE;
 		file_start_write(file);
 		ret = aio_ret(req, file->f_op->write_iter(req, &iter));
-		file_end_write(file);
+		/*
+		 * We release freeze protection in aio_complete().  Fool lockdep
+		 * by telling it the lock got released so that it doesn't
+		 * complain about held lock when we return to userspace.
+		 */
+		__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
 	}
 	kfree(iovec);
 	return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ff7bcd9e8398..dc0478c07b2a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -321,6 +321,7 @@ struct writeback_control;
 #define IOCB_HIPRI		(1 << 3)
 #define IOCB_DSYNC		(1 << 4)
 #define IOCB_SYNC		(1 << 5)
+#define IOCB_WRITE		(1 << 6)
 
 struct kiocb {
 	struct file		*ki_filp;
-- 
cgit v1.2.3


From da96786e26c3ae47316db2b92046b11268c4379c Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 2 Nov 2016 12:08:25 -0700
Subject: net: tcp: check skb is non-NULL for exact match on lookups

Andrey reported the following error report while running the syzkaller
fuzzer:

general protection fault: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 648 Comm: syz-executor Not tainted 4.9.0-rc3+ #333
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
task: ffff8800398c4480 task.stack: ffff88003b468000
RIP: 0010:[<ffffffff83091106>]  [<     inline     >]
inet_exact_dif_match include/net/tcp.h:808
RIP: 0010:[<ffffffff83091106>]  [<ffffffff83091106>]
__inet_lookup_listener+0xb6/0x500 net/ipv4/inet_hashtables.c:219
RSP: 0018:ffff88003b46f270  EFLAGS: 00010202
RAX: 0000000000000004 RBX: 0000000000004242 RCX: 0000000000000001
RDX: 0000000000000000 RSI: ffffc90000e3c000 RDI: 0000000000000054
RBP: ffff88003b46f2d8 R08: 0000000000004000 R09: ffffffff830910e7
R10: 0000000000000000 R11: 000000000000000a R12: ffffffff867fa0c0
R13: 0000000000004242 R14: 0000000000000003 R15: dffffc0000000000
FS:  00007fb135881700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020cc3000 CR3: 000000006d56a000 CR4: 00000000000006f0
Stack:
 0000000000000000 000000000601a8c0 0000000000000000 ffffffff00004242
 424200003b9083c2 ffff88003def4041 ffffffff84e7e040 0000000000000246
 ffff88003a0911c0 0000000000000000 ffff88003a091298 ffff88003b9083ae
Call Trace:
 [<ffffffff831100f4>] tcp_v4_send_reset+0x584/0x1700 net/ipv4/tcp_ipv4.c:643
 [<ffffffff83115b1b>] tcp_v4_rcv+0x198b/0x2e50 net/ipv4/tcp_ipv4.c:1718
 [<ffffffff83069d22>] ip_local_deliver_finish+0x332/0xad0
net/ipv4/ip_input.c:216
...

MD5 has a code path that calls __inet_lookup_listener with a null skb,
so inet{6}_exact_dif_match needs to check skb against null before pulling
the flag.

Fixes: a04a480d4392 ("net: Require exact match for TCP socket lookups if
       dif is l3mdev")
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Tested-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 2 +-
 include/net/tcp.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ca1ad9ebbc92..a0649973ee5b 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -149,7 +149,7 @@ static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb)
 {
 #if defined(CONFIG_NET_L3_MASTER_DEV)
 	if (!net->ipv4.sysctl_tcp_l3mdev_accept &&
-	    ipv6_l3mdev_skb(IP6CB(skb)->flags))
+	    skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
 		return true;
 #endif
 	return false;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5b82d4d94834..304a8e17bc87 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -805,7 +805,7 @@ static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
 	if (!net->ipv4.sysctl_tcp_l3mdev_accept &&
-	    ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
+	    skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
 		return true;
 #endif
 	return false;
-- 
cgit v1.2.3


From 98430c7aad6a3fdedcc78a0d6780dabb6580dc38 Mon Sep 17 00:00:00 2001
From: Randy Li <ayaka@soulik.info>
Date: Tue, 25 Oct 2016 22:15:34 +0800
Subject: phy: Add reset callback for not generic phy

Add a dummy function for phy_reset in case the CONFIG_GENERIC_PHY
is disabled.

Signed-off-by: Randy Li <ayaka@soulik.info>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 include/linux/phy/phy.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h
index ee1bed7dbfc6..78bb0d7f6b11 100644
--- a/include/linux/phy/phy.h
+++ b/include/linux/phy/phy.h
@@ -253,6 +253,13 @@ static inline int phy_set_mode(struct phy *phy, enum phy_mode mode)
 	return -ENOSYS;
 }
 
+static inline int phy_reset(struct phy *phy)
+{
+	if (!phy)
+		return 0;
+	return -ENOSYS;
+}
+
 static inline int phy_get_bus_width(struct phy *phy)
 {
 	return -ENOSYS;
-- 
cgit v1.2.3


From 1571875beecd5de9657f73931449bda1b1329b6f Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Thu, 3 Nov 2016 16:21:26 +0200
Subject: ACPI / platform: Add support for build-in properties
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have a couple of drivers, acpi_apd.c and acpi_lpss.c,
that need to pass extra build-in properties to the devices
they create. Previously the drivers added those properties
to the struct device which is member of the struct
acpi_device, but that does not work. Those properties need
to be assigned to the struct device of the platform device
instead in order for them to become available to the
drivers.

To fix this, this patch changes acpi_create_platform_device
function to take struct property_entry pointer as parameter.

Fixes: 20a875e2e86e (serial: 8250_dw: Add quirk for APM X-Gene SoC)
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Tested-by: Yazen Ghannam <yazen.ghannam@amd.com>
Tested-by: Jérôme de Bretagne <jerome.debretagne@gmail.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_apd.c             | 10 ++--------
 drivers/acpi/acpi_lpss.c            | 10 ++--------
 drivers/acpi/acpi_platform.c        |  5 ++++-
 drivers/acpi/dptf/int340x_thermal.c |  4 ++--
 drivers/acpi/scan.c                 |  2 +-
 drivers/platform/x86/intel-hid.c    |  2 +-
 drivers/platform/x86/intel-vbtn.c   |  2 +-
 include/linux/acpi.h                |  3 ++-
 8 files changed, 15 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c
index 5f112d811e42..5959ed996a75 100644
--- a/drivers/acpi/acpi_apd.c
+++ b/drivers/acpi/acpi_apd.c
@@ -117,7 +117,7 @@ static int acpi_apd_create_device(struct acpi_device *adev,
 	int ret;
 
 	if (!dev_desc) {
-		pdev = acpi_create_platform_device(adev);
+		pdev = acpi_create_platform_device(adev, NULL);
 		return IS_ERR_OR_NULL(pdev) ? PTR_ERR(pdev) : 1;
 	}
 
@@ -134,14 +134,8 @@ static int acpi_apd_create_device(struct acpi_device *adev,
 			goto err_out;
 	}
 
-	if (dev_desc->properties) {
-		ret = device_add_properties(&adev->dev, dev_desc->properties);
-		if (ret)
-			goto err_out;
-	}
-
 	adev->driver_data = pdata;
-	pdev = acpi_create_platform_device(adev);
+	pdev = acpi_create_platform_device(adev, dev_desc->properties);
 	if (!IS_ERR_OR_NULL(pdev))
 		return 1;
 
diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
index 552010288135..373657f7e35a 100644
--- a/drivers/acpi/acpi_lpss.c
+++ b/drivers/acpi/acpi_lpss.c
@@ -395,7 +395,7 @@ static int acpi_lpss_create_device(struct acpi_device *adev,
 
 	dev_desc = (const struct lpss_device_desc *)id->driver_data;
 	if (!dev_desc) {
-		pdev = acpi_create_platform_device(adev);
+		pdev = acpi_create_platform_device(adev, NULL);
 		return IS_ERR_OR_NULL(pdev) ? PTR_ERR(pdev) : 1;
 	}
 	pdata = kzalloc(sizeof(*pdata), GFP_KERNEL);
@@ -451,14 +451,8 @@ static int acpi_lpss_create_device(struct acpi_device *adev,
 		goto err_out;
 	}
 
-	if (dev_desc->properties) {
-		ret = device_add_properties(&adev->dev, dev_desc->properties);
-		if (ret)
-			goto err_out;
-	}
-
 	adev->driver_data = pdata;
-	pdev = acpi_create_platform_device(adev);
+	pdev = acpi_create_platform_device(adev, dev_desc->properties);
 	if (!IS_ERR_OR_NULL(pdev)) {
 		return 1;
 	}
diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
index 159f7f19abce..56e70da83c6c 100644
--- a/drivers/acpi/acpi_platform.c
+++ b/drivers/acpi/acpi_platform.c
@@ -33,6 +33,7 @@ static const struct acpi_device_id forbidden_id_list[] = {
 /**
  * acpi_create_platform_device - Create platform device for ACPI device node
  * @adev: ACPI device node to create a platform device for.
+ * @properties: Optional collection of build-in properties.
  *
  * Check if the given @adev can be represented as a platform device and, if
  * that's the case, create and register a platform device, populate its common
@@ -40,7 +41,8 @@ static const struct acpi_device_id forbidden_id_list[] = {
  *
  * Name of the platform device will be the same as @adev's.
  */
-struct platform_device *acpi_create_platform_device(struct acpi_device *adev)
+struct platform_device *acpi_create_platform_device(struct acpi_device *adev,
+					struct property_entry *properties)
 {
 	struct platform_device *pdev = NULL;
 	struct platform_device_info pdevinfo;
@@ -88,6 +90,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev)
 	pdevinfo.res = resources;
 	pdevinfo.num_res = count;
 	pdevinfo.fwnode = acpi_fwnode_handle(adev);
+	pdevinfo.properties = properties;
 
 	if (acpi_dma_supported(adev))
 		pdevinfo.dma_mask = DMA_BIT_MASK(32);
diff --git a/drivers/acpi/dptf/int340x_thermal.c b/drivers/acpi/dptf/int340x_thermal.c
index 33505c651f62..86364097e236 100644
--- a/drivers/acpi/dptf/int340x_thermal.c
+++ b/drivers/acpi/dptf/int340x_thermal.c
@@ -34,11 +34,11 @@ static int int340x_thermal_handler_attach(struct acpi_device *adev,
 					const struct acpi_device_id *id)
 {
 	if (IS_ENABLED(CONFIG_INT340X_THERMAL))
-		acpi_create_platform_device(adev);
+		acpi_create_platform_device(adev, NULL);
 	/* Intel SoC DTS thermal driver needs INT3401 to set IRQ descriptor */
 	else if (IS_ENABLED(CONFIG_INTEL_SOC_DTS_THERMAL) &&
 		 id->driver_data == INT3401_DEVICE)
-		acpi_create_platform_device(adev);
+		acpi_create_platform_device(adev, NULL);
 	return 1;
 }
 
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index ad9fc84a8601..3d31ae3a482d 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1734,7 +1734,7 @@ static void acpi_default_enumeration(struct acpi_device *device)
 			       &is_spi_i2c_slave);
 	acpi_dev_free_resource_list(&resource_list);
 	if (!is_spi_i2c_slave) {
-		acpi_create_platform_device(device);
+		acpi_create_platform_device(device, NULL);
 		acpi_device_set_enumerated(device);
 	} else {
 		blocking_notifier_call_chain(&acpi_reconfig_chain,
diff --git a/drivers/platform/x86/intel-hid.c b/drivers/platform/x86/intel-hid.c
index ed5874217ee7..12dbb5063376 100644
--- a/drivers/platform/x86/intel-hid.c
+++ b/drivers/platform/x86/intel-hid.c
@@ -264,7 +264,7 @@ check_acpi_dev(acpi_handle handle, u32 lvl, void *context, void **rv)
 		return AE_OK;
 
 	if (acpi_match_device_ids(dev, ids) == 0)
-		if (acpi_create_platform_device(dev))
+		if (acpi_create_platform_device(dev, NULL))
 			dev_info(&dev->dev,
 				 "intel-hid: created platform device\n");
 
diff --git a/drivers/platform/x86/intel-vbtn.c b/drivers/platform/x86/intel-vbtn.c
index 146d02f8c9bc..78080763df51 100644
--- a/drivers/platform/x86/intel-vbtn.c
+++ b/drivers/platform/x86/intel-vbtn.c
@@ -164,7 +164,7 @@ check_acpi_dev(acpi_handle handle, u32 lvl, void *context, void **rv)
 		return AE_OK;
 
 	if (acpi_match_device_ids(dev, ids) == 0)
-		if (acpi_create_platform_device(dev))
+		if (acpi_create_platform_device(dev, NULL))
 			dev_info(&dev->dev,
 				 "intel-vbtn: created platform device\n");
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 632ec16a855e..c09936f55166 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -546,7 +546,8 @@ int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *);
 int acpi_device_modalias(struct device *, char *, int);
 void acpi_walk_dep_device_list(acpi_handle handle);
 
-struct platform_device *acpi_create_platform_device(struct acpi_device *);
+struct platform_device *acpi_create_platform_device(struct acpi_device *,
+						    struct property_entry *);
 #define ACPI_PTR(_ptr)	(_ptr)
 
 static inline void acpi_device_set_enumerated(struct acpi_device *adev)
-- 
cgit v1.2.3


From 264048afab27d7c27eedf5394714e0b396d787f7 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 8 Nov 2016 15:15:24 +0100
Subject: libceph: initialize last_linger_id with a large integer

osdc->last_linger_id is a counter for lreq->linger_id, which is used
for watch cookies.  Starting with a large integer should ease the task
of telling apart kernel and userspace clients.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 2 ++
 net/ceph/osd_client.c           | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 96337b15a60d..a8e66344bacc 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -258,6 +258,8 @@ struct ceph_watch_item {
 	struct ceph_entity_addr addr;
 };
 
+#define CEPH_LINGER_ID_START	0xffff000000000000ULL
+
 struct ceph_osd_client {
 	struct ceph_client     *client;
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d9bf7a1d0a58..e6ae15bc41b7 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -4094,6 +4094,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	osd_init(&osdc->homeless_osd);
 	osdc->homeless_osd.o_osdc = osdc;
 	osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
+	osdc->last_linger_id = CEPH_LINGER_ID_START;
 	osdc->linger_requests = RB_ROOT;
 	osdc->map_checks = RB_ROOT;
 	osdc->linger_map_checks = RB_ROOT;
-- 
cgit v1.2.3


From 5e322beefc8699b5747cfb35539a9496034e4296 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 10 Nov 2016 10:46:07 -0800
Subject: mm, frontswap: make sure allocated frontswap map is assigned

Christian Borntraeger reports:

With commit 8ea1d2a1985a ("mm, frontswap: convert frontswap_enabled to
static key") kmemleak complains about a memory leak in swapon

    unreferenced object 0x3e09ba56000 (size 32112640):
      comm "swapon", pid 7852, jiffies 4294968787 (age 1490.770s)
      hex dump (first 32 bytes):
        00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
        00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
      backtrace:
         __vmalloc_node_range+0x194/0x2d8
         vzalloc+0x58/0x68
         SyS_swapon+0xd60/0x12f8
         system_call+0xd6/0x270

Turns out kmemleak is right.  We now allocate the frontswap map
depending on the kernel config (and no longer on the enablement)

  swapfile.c:
  [...]
      if (IS_ENABLED(CONFIG_FRONTSWAP))
                frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));

but later on this is passed along
  --> enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);

and ignored if frontswap is disabled
  --> frontswap_init(p->type, frontswap_map);

  static inline void frontswap_init(unsigned type, unsigned long *map)
  {
        if (frontswap_enabled())
                __frontswap_init(type, map);
  }

Thing is, that frontswap map is never freed.

The leakage is relatively not that bad, because swapon is an infrequent
and privileged operation.  However, if the first frontswap backend is
registered after a swap type has been already enabled, it will WARN_ON
in frontswap_register_ops() and frontswap will not be available for the
swap type.

Fix this by making sure the map is assigned by frontswap_init() as long
as CONFIG_FRONTSWAP is enabled.

Fixes: 8ea1d2a1985a ("mm, frontswap: convert frontswap_enabled to static key")
Link: http://lkml.kernel.org/r/20161026134220.2566-1-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/frontswap.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index c46d2aa16d81..1d18af034554 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -106,8 +106,9 @@ static inline void frontswap_invalidate_area(unsigned type)
 
 static inline void frontswap_init(unsigned type, unsigned long *map)
 {
-	if (frontswap_enabled())
-		__frontswap_init(type, map);
+#ifdef CONFIG_FRONTSWAP
+	__frontswap_init(type, map);
+#endif
 }
 
 #endif /* _LINUX_FRONTSWAP_H */
-- 
cgit v1.2.3


From c6c7d83b9c9e6a8b3e6d84c820ac61fbffc9e396 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 10 Nov 2016 10:46:26 -0800
Subject: Revert "console: don't prefer first registered if DT specifies
 stdout-path"

This reverts commit 05fd007e4629 ("console: don't prefer first
registered if DT specifies stdout-path").

The reverted commit changes existing behavior on which many ARM boards
rely.  Many ARM small-board-computers, like e.g.  the Raspberry Pi have
both a video output and a serial console.  Depending on whether the user
is using the device as a more regular computer; or as a headless device
we need to have the console on either one or the other.

Many users rely on the kernel behavior of the console being present on
both outputs, before the reverted commit the console setup with no
console= kernel arguments on an ARM board which sets stdout-path in dt
would look like this:

  [root@localhost ~]# cat /proc/consoles
  ttyS0                -W- (EC p a)    4:64
  tty0                 -WU (E  p  )    4:1

Where as after the reverted commit, it looks like this:

  [root@localhost ~]# cat /proc/consoles
  ttyS0                -W- (EC p a)    4:64

This commit reverts commit 05fd007e4629 ("console: don't prefer first
registered if DT specifies stdout-path") restoring the original
behavior.

Fixes: 05fd007e4629 ("console: don't prefer first registered if DT specifies stdout-path")
Link: http://lkml.kernel.org/r/20161104121135.4780-2-hdegoede@redhat.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: Thorsten Leemhuis <regressions@leemhuis.info>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/of/base.c       |  2 --
 include/linux/console.h |  6 ------
 kernel/printk/printk.c  | 13 +------------
 3 files changed, 1 insertion(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index d687e6de24a0..a0bccb54a9bd 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -2077,8 +2077,6 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align))
 			name = of_get_property(of_aliases, "stdout", NULL);
 		if (name)
 			of_stdout = of_find_node_opts_by_path(name, &of_stdout_options);
-		if (of_stdout)
-			console_set_by_of();
 	}
 
 	if (!of_aliases)
diff --git a/include/linux/console.h b/include/linux/console.h
index 3672809234a7..d530c4627e54 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -173,12 +173,6 @@ static inline void console_sysfs_notify(void)
 #endif
 extern bool console_suspend_enabled;
 
-#ifdef CONFIG_OF
-extern void console_set_by_of(void);
-#else
-static inline void console_set_by_of(void) {}
-#endif
-
 /* Suspend and resume console messages over PM events */
 extern void suspend_console(void);
 extern void resume_console(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index de08fc90baaf..5028f4fd504a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -253,17 +253,6 @@ static int preferred_console = -1;
 int console_set_on_cmdline;
 EXPORT_SYMBOL(console_set_on_cmdline);
 
-#ifdef CONFIG_OF
-static bool of_specified_console;
-
-void console_set_by_of(void)
-{
-	of_specified_console = true;
-}
-#else
-# define of_specified_console false
-#endif
-
 /* Flag: console code may call schedule() */
 static int console_may_schedule;
 
@@ -2657,7 +2646,7 @@ void register_console(struct console *newcon)
 	 *	didn't select a console we take the first one
 	 *	that registers here.
 	 */
-	if (preferred_console < 0 && !of_specified_console) {
+	if (preferred_console < 0) {
 		if (newcon->index < 0)
 			newcon->index = 0;
 		if (newcon->setup == NULL ||
-- 
cgit v1.2.3


From 4e3264d21b90984c2165e8fe5a7b64cf25bc2c2d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 9 Nov 2016 15:36:33 -0800
Subject: bpf: Fix bpf_redirect to an ipip/ip6tnl dev

If the bpf program calls bpf_redirect(dev, 0) and dev is
an ipip/ip6tnl, it currently includes the mac header.
e.g. If dev is ipip, the end result is IP-EthHdr-IP instead
of IP-IP.

The fix is to pull the mac header.  At ingress, skb_postpull_rcsum()
is not needed because the ethhdr should have been pulled once already
and then got pushed back just before calling the bpf_prog.
At egress, this patch calls skb_postpull_rcsum().

If bpf_redirect(dev, BPF_F_INGRESS) is called,
it also fails now because it calls dev_forward_skb() which
eventually calls eth_type_trans(skb, dev).  The eth_type_trans()
will set skb->type = PACKET_OTHERHOST because the mac address
does not match the redirecting dev->dev_addr.  The PACKET_OTHERHOST
will eventually cause the ip_rcv() errors out.  To fix this,
____dev_forward_skb() is added.

Joint work with Daniel Borkmann.

Fixes: cfc7381b3002 ("ip_tunnel: add collect_md mode to IPIP tunnel")
Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels")
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 15 +++++++++++
 net/core/dev.c            | 17 +++++-------
 net/core/filter.c         | 68 +++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 81 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 91ee3643ccc8..bf04a46f6d5b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3354,6 +3354,21 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 bool is_skb_forwardable(const struct net_device *dev,
 			const struct sk_buff *skb);
 
+static __always_inline int ____dev_forward_skb(struct net_device *dev,
+					       struct sk_buff *skb)
+{
+	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
+	    unlikely(!is_skb_forwardable(dev, skb))) {
+		atomic_long_inc(&dev->rx_dropped);
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	skb_scrub_packet(skb, true);
+	skb->priority = 0;
+	return 0;
+}
+
 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
 
 extern int		netdev_budget;
diff --git a/net/core/dev.c b/net/core/dev.c
index eaad4c28069f..6666b28b6815 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1766,19 +1766,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 {
-	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
-	    unlikely(!is_skb_forwardable(dev, skb))) {
-		atomic_long_inc(&dev->rx_dropped);
-		kfree_skb(skb);
-		return NET_RX_DROP;
-	}
+	int ret = ____dev_forward_skb(dev, skb);
 
-	skb_scrub_packet(skb, true);
-	skb->priority = 0;
-	skb->protocol = eth_type_trans(skb, dev);
-	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+	if (likely(!ret)) {
+		skb->protocol = eth_type_trans(skb, dev);
+		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+	}
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(__dev_forward_skb);
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 00351cdf7d0c..b391209838ef 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1628,6 +1628,19 @@ static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
 	return dev_forward_skb(dev, skb);
 }
 
+static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
+				      struct sk_buff *skb)
+{
+	int ret = ____dev_forward_skb(dev, skb);
+
+	if (likely(!ret)) {
+		skb->dev = dev;
+		ret = netif_rx(skb);
+	}
+
+	return ret;
+}
+
 static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
 {
 	int ret;
@@ -1647,6 +1660,51 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
 	return ret;
 }
 
+static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
+				 u32 flags)
+{
+	/* skb->mac_len is not set on normal egress */
+	unsigned int mlen = skb->network_header - skb->mac_header;
+
+	__skb_pull(skb, mlen);
+
+	/* At ingress, the mac header has already been pulled once.
+	 * At egress, skb_pospull_rcsum has to be done in case that
+	 * the skb is originated from ingress (i.e. a forwarded skb)
+	 * to ensure that rcsum starts at net header.
+	 */
+	if (!skb_at_tc_ingress(skb))
+		skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
+	skb_pop_mac_header(skb);
+	skb_reset_mac_len(skb);
+	return flags & BPF_F_INGRESS ?
+	       __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
+}
+
+static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
+				 u32 flags)
+{
+	bpf_push_mac_rcsum(skb);
+	return flags & BPF_F_INGRESS ?
+	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+}
+
+static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
+			  u32 flags)
+{
+	switch (dev->type) {
+	case ARPHRD_TUNNEL:
+	case ARPHRD_TUNNEL6:
+	case ARPHRD_SIT:
+	case ARPHRD_IPGRE:
+	case ARPHRD_VOID:
+	case ARPHRD_NONE:
+		return __bpf_redirect_no_mac(skb, dev, flags);
+	default:
+		return __bpf_redirect_common(skb, dev, flags);
+	}
+}
+
 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
 {
 	struct net_device *dev;
@@ -1675,10 +1733,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
 		return -ENOMEM;
 	}
 
-	bpf_push_mac_rcsum(clone);
-
-	return flags & BPF_F_INGRESS ?
-	       __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone);
+	return __bpf_redirect(clone, dev, flags);
 }
 
 static const struct bpf_func_proto bpf_clone_redirect_proto = {
@@ -1722,10 +1777,7 @@ int skb_do_redirect(struct sk_buff *skb)
 		return -EINVAL;
 	}
 
-	bpf_push_mac_rcsum(skb);
-
-	return ri->flags & BPF_F_INGRESS ?
-	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+	return __bpf_redirect(skb, dev, ri->flags);
 }
 
 static const struct bpf_func_proto bpf_redirect_proto = {
-- 
cgit v1.2.3


From ea08e39230e898844d9de5b60cdbb30067cebfe7 Mon Sep 17 00:00:00 2001
From: Scott Mayhew <smayhew@redhat.com>
Date: Fri, 11 Nov 2016 13:16:22 -0500
Subject: sunrpc: svc_age_temp_xprts_now should not call setsockopt non-tcp
 transports

This fixes the following panic that can occur with NFSoRDMA.

general protection fault: 0000 [#1] SMP
Modules linked in: rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi
scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp
scsi_tgt ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm
mlx5_ib ib_core intel_powerclamp coretemp kvm_intel kvm sg ioatdma
ipmi_devintf ipmi_ssif dcdbas iTCO_wdt iTCO_vendor_support pcspkr
irqbypass sb_edac shpchp dca crc32_pclmul ghash_clmulni_intel edac_core
lpc_ich aesni_intel lrw gf128mul glue_helper ablk_helper mei_me mei
ipmi_si cryptd wmi ipmi_msghandler acpi_pad acpi_power_meter nfsd
auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod
crc_t10dif crct10dif_generic mgag200 i2c_algo_bit drm_kms_helper
syscopyarea sysfillrect sysimgblt ahci fb_sys_fops ttm libahci mlx5_core
tg3 crct10dif_pclmul drm crct10dif_common
ptp i2c_core libata crc32c_intel pps_core fjes dm_mirror dm_region_hash
dm_log dm_mod
CPU: 1 PID: 120 Comm: kworker/1:1 Not tainted 3.10.0-514.el7.x86_64 #1
Hardware name: Dell Inc. PowerEdge R320/0KM5PX, BIOS 2.4.2 01/29/2015
Workqueue: events check_lifetime
task: ffff88031f506dd0 ti: ffff88031f584000 task.ti: ffff88031f584000
RIP: 0010:[<ffffffff8168d847>]  [<ffffffff8168d847>]
_raw_spin_lock_bh+0x17/0x50
RSP: 0018:ffff88031f587ba8  EFLAGS: 00010206
RAX: 0000000000020000 RBX: 20041fac02080072 RCX: ffff88031f587fd8
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 20041fac02080072
RBP: ffff88031f587bb0 R08: 0000000000000008 R09: ffffffff8155be77
R10: ffff880322a59b00 R11: ffffea000bf39f00 R12: 20041fac02080072
R13: 000000000000000d R14: ffff8800c4fbd800 R15: 0000000000000001
FS:  0000000000000000(0000) GS:ffff880322a40000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f3c52d4547e CR3: 00000000019ba000 CR4: 00000000001407e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Stack:
20041fac02080002 ffff88031f587bd0 ffffffff81557830 20041fac02080002
ffff88031f587c78 ffff88031f587c40 ffffffff8155ae08 000000010157df32
0000000800000001 ffff88031f587c20 ffffffff81096acb ffffffff81aa37d0
Call Trace:
[<ffffffff81557830>] lock_sock_nested+0x20/0x50
[<ffffffff8155ae08>] sock_setsockopt+0x78/0x940
[<ffffffff81096acb>] ? lock_timer_base.isra.33+0x2b/0x50
[<ffffffff8155397d>] kernel_setsockopt+0x4d/0x50
[<ffffffffa0386284>] svc_age_temp_xprts_now+0x174/0x1e0 [sunrpc]
[<ffffffffa03b681d>] nfsd_inetaddr_event+0x9d/0xd0 [nfsd]
[<ffffffff81691ebc>] notifier_call_chain+0x4c/0x70
[<ffffffff810b687d>] __blocking_notifier_call_chain+0x4d/0x70
[<ffffffff810b68b6>] blocking_notifier_call_chain+0x16/0x20
[<ffffffff815e8538>] __inet_del_ifa+0x168/0x2d0
[<ffffffff815e8cef>] check_lifetime+0x25f/0x270
[<ffffffff810a7f3b>] process_one_work+0x17b/0x470
[<ffffffff810a8d76>] worker_thread+0x126/0x410
[<ffffffff810a8c50>] ? rescuer_thread+0x460/0x460
[<ffffffff810b052f>] kthread+0xcf/0xe0
[<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140
[<ffffffff81696418>] ret_from_fork+0x58/0x90
[<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140
Code: ca 75 f1 5d c3 0f 1f 80 00 00 00 00 eb d9 66 0f 1f 44 00 00 0f 1f
44 00 00 55 48 89 e5 53 48 89 fb e8 7e 04 a0 ff b8 00 00 02 00 <f0> 0f
c1 03 89 c2 c1 ea 10 66 39 c2 75 03 5b 5d c3 83 e2 fe 0f
RIP  [<ffffffff8168d847>] _raw_spin_lock_bh+0x17/0x50
RSP <ffff88031f587ba8>

Signed-off-by: Scott Mayhew <smayhew@redhat.com>
Fixes: c3d4879e ("sunrpc: Add a function to close temporary transports immediately")
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_xprt.h          |  1 +
 net/sunrpc/svc_xprt.c                    | 11 +----------
 net/sunrpc/svcsock.c                     | 21 +++++++++++++++++++++
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  6 ++++++
 4 files changed, 29 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index ab02a457da1f..e5d193440374 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -25,6 +25,7 @@ struct svc_xprt_ops {
 	void		(*xpo_detach)(struct svc_xprt *);
 	void		(*xpo_free)(struct svc_xprt *);
 	int		(*xpo_secure_port)(struct svc_rqst *);
+	void		(*xpo_kill_temp_xprt)(struct svc_xprt *);
 };
 
 struct svc_xprt_class {
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c3f652395a80..3bc1d61694cb 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -1002,14 +1002,8 @@ static void svc_age_temp_xprts(unsigned long closure)
 void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr)
 {
 	struct svc_xprt *xprt;
-	struct svc_sock *svsk;
-	struct socket *sock;
 	struct list_head *le, *next;
 	LIST_HEAD(to_be_closed);
-	struct linger no_linger = {
-		.l_onoff = 1,
-		.l_linger = 0,
-	};
 
 	spin_lock_bh(&serv->sv_lock);
 	list_for_each_safe(le, next, &serv->sv_tempsocks) {
@@ -1027,10 +1021,7 @@ void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr)
 		list_del_init(le);
 		xprt = list_entry(le, struct svc_xprt, xpt_list);
 		dprintk("svc_age_temp_xprts_now: closing %p\n", xprt);
-		svsk = container_of(xprt, struct svc_sock, sk_xprt);
-		sock = svsk->sk_sock;
-		kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
-				  (char *)&no_linger, sizeof(no_linger));
+		xprt->xpt_ops->xpo_kill_temp_xprt(xprt);
 		svc_close_xprt(xprt);
 	}
 }
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 57625f64efd5..a4bc98265d88 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -438,6 +438,21 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt)
 	return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
 }
 
+static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt)
+{
+	struct svc_sock *svsk;
+	struct socket *sock;
+	struct linger no_linger = {
+		.l_onoff = 1,
+		.l_linger = 0,
+	};
+
+	svsk = container_of(xprt, struct svc_sock, sk_xprt);
+	sock = svsk->sk_sock;
+	kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
+			  (char *)&no_linger, sizeof(no_linger));
+}
+
 /*
  * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo
  */
@@ -648,6 +663,10 @@ static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
 	return NULL;
 }
 
+static void svc_udp_kill_temp_xprt(struct svc_xprt *xprt)
+{
+}
+
 static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
 				       struct net *net,
 				       struct sockaddr *sa, int salen,
@@ -667,6 +686,7 @@ static struct svc_xprt_ops svc_udp_ops = {
 	.xpo_has_wspace = svc_udp_has_wspace,
 	.xpo_accept = svc_udp_accept,
 	.xpo_secure_port = svc_sock_secure_port,
+	.xpo_kill_temp_xprt = svc_udp_kill_temp_xprt,
 };
 
 static struct svc_xprt_class svc_udp_class = {
@@ -1242,6 +1262,7 @@ static struct svc_xprt_ops svc_tcp_ops = {
 	.xpo_has_wspace = svc_tcp_has_wspace,
 	.xpo_accept = svc_tcp_accept,
 	.xpo_secure_port = svc_sock_secure_port,
+	.xpo_kill_temp_xprt = svc_tcp_kill_temp_xprt,
 };
 
 static struct svc_xprt_class svc_tcp_class = {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 6864fb967038..1334de2715c2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -67,6 +67,7 @@ static void svc_rdma_detach(struct svc_xprt *xprt);
 static void svc_rdma_free(struct svc_xprt *xprt);
 static int svc_rdma_has_wspace(struct svc_xprt *xprt);
 static int svc_rdma_secure_port(struct svc_rqst *);
+static void svc_rdma_kill_temp_xprt(struct svc_xprt *);
 
 static struct svc_xprt_ops svc_rdma_ops = {
 	.xpo_create = svc_rdma_create,
@@ -79,6 +80,7 @@ static struct svc_xprt_ops svc_rdma_ops = {
 	.xpo_has_wspace = svc_rdma_has_wspace,
 	.xpo_accept = svc_rdma_accept,
 	.xpo_secure_port = svc_rdma_secure_port,
+	.xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt,
 };
 
 struct svc_xprt_class svc_rdma_class = {
@@ -1317,6 +1319,10 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp)
 	return 1;
 }
 
+static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt)
+{
+}
+
 int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 {
 	struct ib_send_wr *bad_wr, *n_wr;
-- 
cgit v1.2.3


From f23cc643f9baec7f71f2b74692da3cf03abbbfda Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Mon, 14 Nov 2016 15:45:36 -0500
Subject: bpf: fix range arithmetic for bpf map access

I made some invalid assumptions with BPF_AND and BPF_MOD that could result in
invalid accesses to bpf map entries.  Fix this up by doing a few things

1) Kill BPF_MOD support.  This doesn't actually get used by the compiler in real
life and just adds extra complexity.

2) Fix the logic for BPF_AND, don't allow AND of negative numbers and set the
minimum value to 0 for positive AND's.

3) Don't do operations on the ranges if they are set to the limits, as they are
by definition undefined, and allowing arithmetic operations on those values
could make them appear valid when they really aren't.

This fixes the testcase provided by Jann as well as a few other theoretical
problems.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h |  5 ++--
 kernel/bpf/verifier.c        | 70 +++++++++++++++++++++++++++++---------------
 2 files changed, 50 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7035b997aaa5..6aaf425cebc3 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -14,7 +14,7 @@
   * are obviously wrong for any sort of memory access.
   */
 #define BPF_REGISTER_MAX_RANGE (1024 * 1024 * 1024)
-#define BPF_REGISTER_MIN_RANGE -(1024 * 1024 * 1024)
+#define BPF_REGISTER_MIN_RANGE -1
 
 struct bpf_reg_state {
 	enum bpf_reg_type type;
@@ -22,7 +22,8 @@ struct bpf_reg_state {
 	 * Used to determine if any memory access using this register will
 	 * result in a bad access.
 	 */
-	u64 min_value, max_value;
+	s64 min_value;
+	u64 max_value;
 	union {
 		/* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
 		s64 imm;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 99a7e5b388f2..6a936159c6e0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -216,8 +216,8 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 				reg->map_ptr->key_size,
 				reg->map_ptr->value_size);
 		if (reg->min_value != BPF_REGISTER_MIN_RANGE)
-			verbose(",min_value=%llu",
-				(unsigned long long)reg->min_value);
+			verbose(",min_value=%lld",
+				(long long)reg->min_value);
 		if (reg->max_value != BPF_REGISTER_MAX_RANGE)
 			verbose(",max_value=%llu",
 				(unsigned long long)reg->max_value);
@@ -758,7 +758,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			 * index'es we need to make sure that whatever we use
 			 * will have a set floor within our range.
 			 */
-			if ((s64)reg->min_value < 0) {
+			if (reg->min_value < 0) {
 				verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
 					regno);
 				return -EACCES;
@@ -1468,7 +1468,8 @@ static void check_reg_overflow(struct bpf_reg_state *reg)
 {
 	if (reg->max_value > BPF_REGISTER_MAX_RANGE)
 		reg->max_value = BPF_REGISTER_MAX_RANGE;
-	if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE)
+	if (reg->min_value < BPF_REGISTER_MIN_RANGE ||
+	    reg->min_value > BPF_REGISTER_MAX_RANGE)
 		reg->min_value = BPF_REGISTER_MIN_RANGE;
 }
 
@@ -1476,7 +1477,8 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				    struct bpf_insn *insn)
 {
 	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
-	u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE;
+	s64 min_val = BPF_REGISTER_MIN_RANGE;
+	u64 max_val = BPF_REGISTER_MAX_RANGE;
 	bool min_set = false, max_set = false;
 	u8 opcode = BPF_OP(insn->code);
 
@@ -1512,22 +1514,43 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 		return;
 	}
 
+	/* If one of our values was at the end of our ranges then we can't just
+	 * do our normal operations to the register, we need to set the values
+	 * to the min/max since they are undefined.
+	 */
+	if (min_val == BPF_REGISTER_MIN_RANGE)
+		dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+	if (max_val == BPF_REGISTER_MAX_RANGE)
+		dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+
 	switch (opcode) {
 	case BPF_ADD:
-		dst_reg->min_value += min_val;
-		dst_reg->max_value += max_val;
+		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+			dst_reg->min_value += min_val;
+		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+			dst_reg->max_value += max_val;
 		break;
 	case BPF_SUB:
-		dst_reg->min_value -= min_val;
-		dst_reg->max_value -= max_val;
+		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+			dst_reg->min_value -= min_val;
+		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+			dst_reg->max_value -= max_val;
 		break;
 	case BPF_MUL:
-		dst_reg->min_value *= min_val;
-		dst_reg->max_value *= max_val;
+		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+			dst_reg->min_value *= min_val;
+		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+			dst_reg->max_value *= max_val;
 		break;
 	case BPF_AND:
-		/* & is special since it could end up with 0 bits set. */
-		dst_reg->min_value &= min_val;
+		/* Disallow AND'ing of negative numbers, ain't nobody got time
+		 * for that.  Otherwise the minimum is 0 and the max is the max
+		 * value we could AND against.
+		 */
+		if (min_val < 0)
+			dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+		else
+			dst_reg->min_value = 0;
 		dst_reg->max_value = max_val;
 		break;
 	case BPF_LSH:
@@ -1537,24 +1560,25 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 		 */
 		if (min_val > ilog2(BPF_REGISTER_MAX_RANGE))
 			dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
-		else
+		else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
 			dst_reg->min_value <<= min_val;
 
 		if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
 			dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
-		else
+		else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
 			dst_reg->max_value <<= max_val;
 		break;
 	case BPF_RSH:
-		dst_reg->min_value >>= min_val;
-		dst_reg->max_value >>= max_val;
-		break;
-	case BPF_MOD:
-		/* % is special since it is an unsigned modulus, so the floor
-		 * will always be 0.
+		/* RSH by a negative number is undefined, and the BPF_RSH is an
+		 * unsigned shift, so make the appropriate casts.
 		 */
-		dst_reg->min_value = 0;
-		dst_reg->max_value = max_val - 1;
+		if (min_val < 0 || dst_reg->min_value < 0)
+			dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+		else
+			dst_reg->min_value =
+				(u64)(dst_reg->min_value) >> min_val;
+		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+			dst_reg->max_value >>= max_val;
 		break;
 	default:
 		reset_reg_range_values(regs, insn->dst_reg);
-- 
cgit v1.2.3


From 5d1904204c99596b50a700f092fe49d78edba400 Mon Sep 17 00:00:00 2001
From: Aaron Lu <aaron.lu@intel.com>
Date: Thu, 10 Nov 2016 17:16:33 +0800
Subject: mremap: fix race between mremap() and page cleanning

Prior to 3.15, there was a race between zap_pte_range() and
page_mkclean() where writes to a page could be lost.  Dave Hansen
discovered by inspection that there is a similar race between
move_ptes() and page_mkclean().

We've been able to reproduce the issue by enlarging the race window with
a msleep(), but have not been able to hit it without modifying the code.
So, we think it's a real issue, but is difficult or impossible to hit in
practice.

The zap_pte_range() issue is fixed by commit 1cf35d47712d("mm: split
'tlb_flush_mmu()' into tlb flushing and memory freeing parts").  And
this patch is to fix the race between page_mkclean() and mremap().

Here is one possible way to hit the race: suppose a process mmapped a
file with READ | WRITE and SHARED, it has two threads and they are bound
to 2 different CPUs, e.g.  CPU1 and CPU2.  mmap returned X, then thread
1 did a write to addr X so that CPU1 now has a writable TLB for addr X
on it.  Thread 2 starts mremaping from addr X to Y while thread 1
cleaned the page and then did another write to the old addr X again.
The 2nd write from thread 1 could succeed but the value will get lost.

        thread 1                           thread 2
     (bound to CPU1)                    (bound to CPU2)

  1: write 1 to addr X to get a
     writeable TLB on this CPU

                                        2: mremap starts

                                        3: move_ptes emptied PTE for addr X
                                           and setup new PTE for addr Y and
                                           then dropped PTL for X and Y

  4: page laundering for N by doing
     fadvise FADV_DONTNEED. When done,
     pageframe N is deemed clean.

  5: *write 2 to addr X

                                        6: tlb flush for addr X

  7: munmap (Y, pagesize) to make the
     page unmapped

  8: fadvise with FADV_DONTNEED again
     to kick the page off the pagecache

  9: pread the page from file to verify
     the value. If 1 is there, it means
     we have lost the written 2.

  *the write may or may not cause segmentation fault, it depends on
  if the TLB is still on the CPU.

Please note that this is only one specific way of how the race could
occur, it didn't mean that the race could only occur in exact the above
config, e.g. more than 2 threads could be involved and fadvise() could
be done in another thread, etc.

For anonymous pages, they could race between mremap() and page reclaim:
THP: a huge PMD is moved by mremap to a new huge PMD, then the new huge
PMD gets unmapped/splitted/pagedout before the flush tlb happened for
the old huge PMD in move_page_tables() and we could still write data to
it.  The normal anonymous page has similar situation.

To fix this, check for any dirty PTE in move_ptes()/move_huge_pmd() and
if any, did the flush before dropping the PTL.  If we did the flush for
every move_ptes()/move_huge_pmd() call then we do not need to do the
flush in move_pages_tables() for the whole range.  But if we didn't, we
still need to do the whole range flush.

Alternatively, we can track which part of the range is flushed in
move_ptes()/move_huge_pmd() and which didn't to avoid flushing the whole
range in move_page_tables().  But that would require multiple tlb
flushes for the different sub-ranges and should be less efficient than
the single whole range flush.

KBuild test on my Sandybridge desktop doesn't show any noticeable change.
v4.9-rc4:
  real    5m14.048s
  user    32m19.800s
  sys     4m50.320s

With this commit:
  real    5m13.888s
  user    32m19.330s
  sys     4m51.200s

Reported-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h |  2 +-
 mm/huge_memory.c        |  9 ++++++++-
 mm/mremap.c             | 30 +++++++++++++++++++++---------
 3 files changed, 30 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 9b9f65d99873..e35e6de633b9 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -22,7 +22,7 @@ extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned char *vec);
 extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 			 unsigned long new_addr, unsigned long old_end,
-			 pmd_t *old_pmd, pmd_t *new_pmd);
+			 pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, pgprot_t newprot,
 			int prot_numa);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cdcd25cb30fe..eff3de359d50 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1426,11 +1426,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, unsigned long old_end,
-		  pmd_t *old_pmd, pmd_t *new_pmd)
+		  pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
 {
 	spinlock_t *old_ptl, *new_ptl;
 	pmd_t pmd;
 	struct mm_struct *mm = vma->vm_mm;
+	bool force_flush = false;
 
 	if ((old_addr & ~HPAGE_PMD_MASK) ||
 	    (new_addr & ~HPAGE_PMD_MASK) ||
@@ -1455,6 +1456,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		new_ptl = pmd_lockptr(mm, new_pmd);
 		if (new_ptl != old_ptl)
 			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+		if (pmd_present(*old_pmd) && pmd_dirty(*old_pmd))
+			force_flush = true;
 		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
 
@@ -1467,6 +1470,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
 		if (new_ptl != old_ptl)
 			spin_unlock(new_ptl);
+		if (force_flush)
+			flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+		else
+			*need_flush = true;
 		spin_unlock(old_ptl);
 		return true;
 	}
diff --git a/mm/mremap.c b/mm/mremap.c
index da22ad2a5678..6ccecc03f56a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -104,11 +104,13 @@ static pte_t move_soft_dirty_pte(pte_t pte)
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		unsigned long old_addr, unsigned long old_end,
 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
-		unsigned long new_addr, bool need_rmap_locks)
+		unsigned long new_addr, bool need_rmap_locks, bool *need_flush)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
+	bool force_flush = false;
+	unsigned long len = old_end - old_addr;
 
 	/*
 	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
@@ -146,6 +148,14 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 				   new_pte++, new_addr += PAGE_SIZE) {
 		if (pte_none(*old_pte))
 			continue;
+
+		/*
+		 * We are remapping a dirty PTE, make sure to
+		 * flush TLB before we drop the PTL for the
+		 * old PTE or we may race with page_mkclean().
+		 */
+		if (pte_present(*old_pte) && pte_dirty(*old_pte))
+			force_flush = true;
 		pte = ptep_get_and_clear(mm, old_addr, old_pte);
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		pte = move_soft_dirty_pte(pte);
@@ -156,6 +166,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
 	pte_unmap(new_pte - 1);
+	if (force_flush)
+		flush_tlb_range(vma, old_end - len, old_end);
+	else
+		*need_flush = true;
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (need_rmap_locks)
 		drop_rmap_locks(vma);
@@ -201,13 +215,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 				if (need_rmap_locks)
 					take_rmap_locks(vma);
 				moved = move_huge_pmd(vma, old_addr, new_addr,
-						    old_end, old_pmd, new_pmd);
+						    old_end, old_pmd, new_pmd,
+						    &need_flush);
 				if (need_rmap_locks)
 					drop_rmap_locks(vma);
-				if (moved) {
-					need_flush = true;
+				if (moved)
 					continue;
-				}
 			}
 			split_huge_pmd(vma, old_pmd, old_addr);
 			if (pmd_trans_unstable(old_pmd))
@@ -220,11 +233,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 			extent = next - new_addr;
 		if (extent > LATENCY_LIMIT)
 			extent = LATENCY_LIMIT;
-		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-			  new_vma, new_pmd, new_addr, need_rmap_locks);
-		need_flush = true;
+		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
+			  new_pmd, new_addr, need_rmap_locks, &need_flush);
 	}
-	if (likely(need_flush))
+	if (need_flush)
 		flush_tlb_range(vma, old_end-len, old_addr);
 
 	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
-- 
cgit v1.2.3


From 910170442944e1f8674fd5ddbeeb8ccd1877ea98 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 12 Sep 2016 10:49:11 +0800
Subject: iommu/vt-d: Fix PASID table allocation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Somehow I ended up with an off-by-three error in calculating the size of
the PASID and PASID State tables, which triggers allocations failures as
those tables unfortunately have to be physically contiguous.

In fact, even the *correct* maximum size of 8MiB is problematic and is
wont to lead to allocation failures. Since I have extracted a promise
that this *will* be fixed in hardware, I'm happy to limit it on the
current hardware to a maximum of 0x20000 PASIDs, which gives us 1MiB
tables — still not ideal, but better than before.

Reported by Mika Kuoppala <mika.kuoppala@linux.intel.com> and also by
Xunlei Pang <xlpang@redhat.com> who submitted a simpler patch to fix
only the allocation (and not the free) to the "correct" limit... which
was still problematic.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Cc: stable@vger.kernel.org
---
 drivers/iommu/intel-svm.c   | 28 +++++++++++++++++-----------
 include/linux/intel-iommu.h |  1 +
 2 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 8ebb3530afa7..cb72e0011310 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -39,10 +39,18 @@ int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu)
 	struct page *pages;
 	int order;
 
-	order = ecap_pss(iommu->ecap) + 7 - PAGE_SHIFT;
-	if (order < 0)
-		order = 0;
-
+	/* Start at 2 because it's defined as 2^(1+PSS) */
+	iommu->pasid_max = 2 << ecap_pss(iommu->ecap);
+
+	/* Eventually I'm promised we will get a multi-level PASID table
+	 * and it won't have to be physically contiguous. Until then,
+	 * limit the size because 8MiB contiguous allocations can be hard
+	 * to come by. The limit of 0x20000, which is 1MiB for each of
+	 * the PASID and PASID-state tables, is somewhat arbitrary. */
+	if (iommu->pasid_max > 0x20000)
+		iommu->pasid_max = 0x20000;
+
+	order = get_order(sizeof(struct pasid_entry) * iommu->pasid_max);
 	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
 	if (!pages) {
 		pr_warn("IOMMU: %s: Failed to allocate PASID table\n",
@@ -53,6 +61,8 @@ int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu)
 	pr_info("%s: Allocated order %d PASID table.\n", iommu->name, order);
 
 	if (ecap_dis(iommu->ecap)) {
+		/* Just making it explicit... */
+		BUILD_BUG_ON(sizeof(struct pasid_entry) != sizeof(struct pasid_state_entry));
 		pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
 		if (pages)
 			iommu->pasid_state_table = page_address(pages);
@@ -68,11 +78,7 @@ int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu)
 
 int intel_svm_free_pasid_tables(struct intel_iommu *iommu)
 {
-	int order;
-
-	order = ecap_pss(iommu->ecap) + 7 - PAGE_SHIFT;
-	if (order < 0)
-		order = 0;
+	int order = get_order(sizeof(struct pasid_entry) * iommu->pasid_max);
 
 	if (iommu->pasid_table) {
 		free_pages((unsigned long)iommu->pasid_table, order);
@@ -371,8 +377,8 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
 		}
 		svm->iommu = iommu;
 
-		if (pasid_max > 2 << ecap_pss(iommu->ecap))
-			pasid_max = 2 << ecap_pss(iommu->ecap);
+		if (pasid_max > iommu->pasid_max)
+			pasid_max = iommu->pasid_max;
 
 		/* Do not use PASID 0 in caching mode (virtualised IOMMU) */
 		ret = idr_alloc(&iommu->pasid_idr, svm,
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 2d9b650047a5..d49e26c6cdc7 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -429,6 +429,7 @@ struct intel_iommu {
 	struct page_req_dsc *prq;
 	unsigned char prq_name[16];    /* Name for PRQ interrupt */
 	struct idr pasid_idr;
+	u32 pasid_max;
 #endif
 	struct q_inval  *qi;            /* Queued invalidation info */
 	u32 *iommu_state; /* Store iommu states between suspend and resume.*/
-- 
cgit v1.2.3


From 8e5bfa8c1f8471aa4a2d30be631ef2b50e10abaf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 14 Nov 2016 19:46:12 +0100
Subject: sched/autogroup: Do not use autogroup->tg in zombie threads

Exactly because for_each_thread() in autogroup_move_group() can't see it
and update its ->sched_task_group before _put() and possibly free().

So the exiting task needs another sched_move_task() before exit_notify()
and we need to re-introduce the PF_EXITING (or similar) check removed by
the previous change for another reason.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: hartsjc@redhat.com
Cc: vbendel@redhat.com
Cc: vlovejoy@redhat.com
Link: http://lkml.kernel.org/r/20161114184612.GA15968@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h     |  2 ++
 kernel/exit.c             |  1 +
 kernel/sched/auto_group.c | 19 +++++++++++++++++++
 3 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b0ec92..e9c009dc3a4a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2567,6 +2567,7 @@ extern void sched_autogroup_create_attach(struct task_struct *p);
 extern void sched_autogroup_detach(struct task_struct *p);
 extern void sched_autogroup_fork(struct signal_struct *sig);
 extern void sched_autogroup_exit(struct signal_struct *sig);
+extern void sched_autogroup_exit_task(struct task_struct *p);
 #ifdef CONFIG_PROC_FS
 extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
 extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
@@ -2576,6 +2577,7 @@ static inline void sched_autogroup_create_attach(struct task_struct *p) { }
 static inline void sched_autogroup_detach(struct task_struct *p) { }
 static inline void sched_autogroup_fork(struct signal_struct *sig) { }
 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit_task(struct task_struct *p) { }
 #endif
 
 extern int yield_to(struct task_struct *p, bool preempt);
diff --git a/kernel/exit.c b/kernel/exit.c
index 9d68c45ebbe3..3076f3089919 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -836,6 +836,7 @@ void __noreturn do_exit(long code)
 	 */
 	perf_event_exit_task(tsk);
 
+	sched_autogroup_exit_task(tsk);
 	cgroup_exit(tsk);
 
 	/*
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index ad2b19ad6ca0..f1c8fd566246 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -115,10 +115,26 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 	 * If we race with autogroup_move_group() the caller can use the old
 	 * value of signal->autogroup but in this case sched_move_task() will
 	 * be called again before autogroup_kref_put().
+	 *
+	 * However, there is no way sched_autogroup_exit_task() could tell us
+	 * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
 	 */
+	if (p->flags & PF_EXITING)
+		return false;
+
 	return true;
 }
 
+void sched_autogroup_exit_task(struct task_struct *p)
+{
+	/*
+	 * We are going to call exit_notify() and autogroup_move_group() can't
+	 * see this thread after that: we can no longer use signal->autogroup.
+	 * See the PF_EXITING check in task_wants_autogroup().
+	 */
+	sched_move_task(p);
+}
+
 static void
 autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 {
@@ -142,6 +158,9 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 	 * In the latter case for_each_thread() can not miss a migrating thread,
 	 * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
 	 * can't be removed from thread list, we hold ->siglock.
+	 *
+	 * If an exiting thread was already removed from thread list we rely on
+	 * sched_autogroup_exit_task().
 	 */
 	for_each_thread(p, t)
 		sched_move_task(t);
-- 
cgit v1.2.3


From e784930bd645e7df78c66e7872fec282b0620075 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn@suse.de>
Date: Wed, 2 Nov 2016 16:35:51 -0600
Subject: PCI: Export pcie_find_root_port

Export pcie_find_root_port() so we can use it outside of PCIe-AER error
injection.

Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pcie/aer/aer_inject.c | 14 --------------
 include/linux/pci.h               | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pcie/aer/aer_inject.c b/drivers/pci/pcie/aer/aer_inject.c
index db553dc22c8e..2b6a59266689 100644
--- a/drivers/pci/pcie/aer/aer_inject.c
+++ b/drivers/pci/pcie/aer/aer_inject.c
@@ -307,20 +307,6 @@ out:
 	return 0;
 }
 
-static struct pci_dev *pcie_find_root_port(struct pci_dev *dev)
-{
-	while (1) {
-		if (!pci_is_pcie(dev))
-			break;
-		if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
-			return dev;
-		if (!dev->bus->self)
-			break;
-		dev = dev->bus->self;
-	}
-	return NULL;
-}
-
 static int find_aer_device_iter(struct device *device, void *data)
 {
 	struct pcie_device **result = data;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 0e49f70dbd9b..a38772a85588 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1928,6 +1928,20 @@ static inline int pci_pcie_type(const struct pci_dev *dev)
 	return (pcie_caps_reg(dev) & PCI_EXP_FLAGS_TYPE) >> 4;
 }
 
+static inline struct pci_dev *pcie_find_root_port(struct pci_dev *dev)
+{
+	while (1) {
+		if (!pci_is_pcie(dev))
+			break;
+		if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
+			return dev;
+		if (!dev->bus->self)
+			break;
+		dev = dev->bus->self;
+	}
+	return NULL;
+}
+
 void pci_request_acs(void);
 bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags);
 bool pci_acs_path_enabled(struct pci_dev *start,
-- 
cgit v1.2.3


From 920c1cd36642ac21a7b2fdc47ab44b9634d570f9 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 21 Nov 2016 18:28:36 -0800
Subject: netdevice.h: fix kernel-doc warning

Fix kernel-doc warning in <linux/netdevice.h> (missing ':'):

..//include/linux/netdevice.h:1904: warning: No description found for parameter 'prio_tc_map[TC_BITMASK + 1]'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bf04a46f6d5b..e16a2a980ea8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1619,7 +1619,7 @@ enum netdev_priv_flags {
  *	@dcbnl_ops:	Data Center Bridging netlink ops
  *	@num_tc:	Number of traffic classes in the net device
  *	@tc_to_txq:	XXX: need comments on this one
- *	@prio_tc_map	XXX: need comments on this one
+ *	@prio_tc_map:	XXX: need comments on this one
  *
  *	@fcoe_ddp_xid:	Max exchange id for FCoE LRO by ddp
  *
-- 
cgit v1.2.3


From b4353708f5a1c084fd73f1b6fd243b142157b173 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Sun, 27 Nov 2016 19:20:51 +0200
Subject: Revert "net/mlx4_en: Avoid unregister_netdev at shutdown flow"

This reverts commit 9d76931180557270796f9631e2c79b9c7bb3c9fb.

Using unregister_netdev at shutdown flow prevents calling
the netdev's ndos or trying to access its freed resources.

This fixes crashes like the following:
 Call Trace:
  [<ffffffff81587a6e>] dev_get_phys_port_id+0x1e/0x30
  [<ffffffff815a36ce>] rtnl_fill_ifinfo+0x4be/0xff0
  [<ffffffff815a53f3>] rtmsg_ifinfo_build_skb+0x73/0xe0
  [<ffffffff815a5476>] rtmsg_ifinfo.part.27+0x16/0x50
  [<ffffffff815a54c8>] rtmsg_ifinfo+0x18/0x20
  [<ffffffff8158a6c6>] netdev_state_change+0x46/0x50
  [<ffffffff815a5e78>] linkwatch_do_dev+0x38/0x50
  [<ffffffff815a6165>] __linkwatch_run_queue+0xf5/0x170
  [<ffffffff815a6205>] linkwatch_event+0x25/0x30
  [<ffffffff81099a82>] process_one_work+0x152/0x400
  [<ffffffff8109a325>] worker_thread+0x125/0x4b0
  [<ffffffff8109a200>] ? rescuer_thread+0x350/0x350
  [<ffffffff8109fc6a>] kthread+0xca/0xe0
  [<ffffffff8109fba0>] ? kthread_park+0x60/0x60
  [<ffffffff816a1285>] ret_from_fork+0x25/0x30

Fixes: 9d7693118055 ("net/mlx4_en: Avoid unregister_netdev at shutdown flow")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Reported-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Reported-by: Steve Wise <swise@opengridcomputing.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 17 ++---------------
 drivers/net/ethernet/mellanox/mlx4/main.c      |  5 +----
 include/linux/mlx4/device.h                    |  1 -
 3 files changed, 3 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index a60f635da78b..fb8bb027b69c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2079,13 +2079,6 @@ err:
 	return -ENOMEM;
 }
 
-static void mlx4_en_shutdown(struct net_device *dev)
-{
-	rtnl_lock();
-	netif_device_detach(dev);
-	mlx4_en_close(dev);
-	rtnl_unlock();
-}
 
 static int mlx4_en_copy_priv(struct mlx4_en_priv *dst,
 			     struct mlx4_en_priv *src,
@@ -2162,8 +2155,6 @@ void mlx4_en_destroy_netdev(struct net_device *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_en_dev *mdev = priv->mdev;
-	bool shutdown = mdev->dev->persist->interface_state &
-					    MLX4_INTERFACE_STATE_SHUTDOWN;
 
 	en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port);
 
@@ -2171,10 +2162,7 @@ void mlx4_en_destroy_netdev(struct net_device *dev)
 	if (priv->registered) {
 		devlink_port_type_clear(mlx4_get_devlink_port(mdev->dev,
 							      priv->port));
-		if (shutdown)
-			mlx4_en_shutdown(dev);
-		else
-			unregister_netdev(dev);
+		unregister_netdev(dev);
 	}
 
 	if (priv->allocated)
@@ -2203,8 +2191,7 @@ void mlx4_en_destroy_netdev(struct net_device *dev)
 	kfree(priv->tx_ring);
 	kfree(priv->tx_cq);
 
-	if (!shutdown)
-		free_netdev(dev);
+	free_netdev(dev);
 }
 
 static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 6f4e67bc3538..75d07fa9d0b1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -4147,11 +4147,8 @@ static void mlx4_shutdown(struct pci_dev *pdev)
 
 	mlx4_info(persist->dev, "mlx4_shutdown was called\n");
 	mutex_lock(&persist->interface_state_mutex);
-	if (persist->interface_state & MLX4_INTERFACE_STATE_UP) {
-		/* Notify mlx4 clients that the kernel is being shut down */
-		persist->interface_state |= MLX4_INTERFACE_STATE_SHUTDOWN;
+	if (persist->interface_state & MLX4_INTERFACE_STATE_UP)
 		mlx4_unload_one(pdev);
-	}
 	mutex_unlock(&persist->interface_state_mutex);
 }
 
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 3be7abd6e722..c9f379689dd0 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -476,7 +476,6 @@ enum {
 enum {
 	MLX4_INTERFACE_STATE_UP		= 1 << 0,
 	MLX4_INTERFACE_STATE_DELETION	= 1 << 1,
-	MLX4_INTERFACE_STATE_SHUTDOWN	= 1 << 2,
 };
 
 #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \
-- 
cgit v1.2.3


From 3f65047c853a2a5abcd8ac1984af3452b5df4ada Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Mon, 28 Nov 2016 19:24:55 +0100
Subject: of_mdio: add helper to deregister fixed-link PHYs

Add helper to deregister fixed-link PHYs registered using
of_phy_register_fixed_link().

Convert the two drivers that care to deregister their fixed-link PHYs to
use the new helper, but note that most drivers currently fail to do so.

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/cpsw.c | 16 ++--------------
 drivers/of/of_mdio.c           | 15 +++++++++++++++
 include/linux/of_mdio.h        |  4 ++++
 net/dsa/dsa.c                  | 12 ++----------
 4 files changed, 23 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 58947aae31c7..9f0646512624 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -2459,20 +2459,8 @@ static void cpsw_remove_dt(struct platform_device *pdev)
 		if (strcmp(slave_node->name, "slave"))
 			continue;
 
-		if (of_phy_is_fixed_link(slave_node)) {
-			struct phy_device *phydev;
-
-			phydev = of_phy_find_device(slave_node);
-			if (phydev) {
-				fixed_phy_unregister(phydev);
-				/* Put references taken by
-				 * of_phy_find_device() and
-				 * of_phy_register_fixed_link().
-				 */
-				phy_device_free(phydev);
-				phy_device_free(phydev);
-			}
-		}
+		if (of_phy_is_fixed_link(slave_node))
+			of_phy_deregister_fixed_link(slave_node);
 
 		of_node_put(slave_data->phy_node);
 
diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index 5a3145a02547..262281bd68fa 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -490,3 +490,18 @@ int of_phy_register_fixed_link(struct device_node *np)
 	return -ENODEV;
 }
 EXPORT_SYMBOL(of_phy_register_fixed_link);
+
+void of_phy_deregister_fixed_link(struct device_node *np)
+{
+	struct phy_device *phydev;
+
+	phydev = of_phy_find_device(np);
+	if (!phydev)
+		return;
+
+	fixed_phy_unregister(phydev);
+
+	put_device(&phydev->mdio.dev);	/* of_phy_find_device() */
+	phy_device_free(phydev);	/* fixed_phy_register() */
+}
+EXPORT_SYMBOL(of_phy_deregister_fixed_link);
diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h
index 2ab233661ae5..a58cca8bcb29 100644
--- a/include/linux/of_mdio.h
+++ b/include/linux/of_mdio.h
@@ -29,6 +29,7 @@ struct phy_device *of_phy_attach(struct net_device *dev,
 extern struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np);
 extern int of_mdio_parse_addr(struct device *dev, const struct device_node *np);
 extern int of_phy_register_fixed_link(struct device_node *np);
+extern void of_phy_deregister_fixed_link(struct device_node *np);
 extern bool of_phy_is_fixed_link(struct device_node *np);
 
 #else /* CONFIG_OF */
@@ -83,6 +84,9 @@ static inline int of_phy_register_fixed_link(struct device_node *np)
 {
 	return -ENOSYS;
 }
+static inline void of_phy_deregister_fixed_link(struct device_node *np)
+{
+}
 static inline bool of_phy_is_fixed_link(struct device_node *np)
 {
 	return false;
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index cb0091b99592..7899919cd9f0 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -506,16 +506,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
 
 void dsa_cpu_dsa_destroy(struct device_node *port_dn)
 {
-	struct phy_device *phydev;
-
-	if (of_phy_is_fixed_link(port_dn)) {
-		phydev = of_phy_find_device(port_dn);
-		if (phydev) {
-			fixed_phy_unregister(phydev);
-			put_device(&phydev->mdio.dev);
-			phy_device_free(phydev);
-		}
-	}
+	if (of_phy_is_fixed_link(port_dn))
+		of_phy_deregister_fixed_link(port_dn);
 }
 
 static void dsa_switch_destroy(struct dsa_switch *ds)
-- 
cgit v1.2.3


From 045d599a286bc01daa3510d59272440a17b23c2e Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Wed, 30 Nov 2016 15:54:13 -0800
Subject: kasan: update kasan_global for gcc 7

kasan_global struct is part of compiler/runtime ABI.  gcc revision
241983 has added a new field to kasan_global struct.  Update kernel
definition of kasan_global struct to include the new field.

Without this patch KASAN is broken with gcc 7.

Link: http://lkml.kernel.org/r/1479219743-28682-1-git-send-email-dvyukov@google.com
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: <stable@vger.kernel.org>	[4.0+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h | 4 +++-
 mm/kasan/kasan.h             | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 432f5c97e18f..928e5ca0caee 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -263,7 +263,9 @@
 #endif
 #endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP && !__CHECKER__ */
 
-#if GCC_VERSION >= 50000
+#if GCC_VERSION >= 70000
+#define KASAN_ABI_VERSION 5
+#elif GCC_VERSION >= 50000
 #define KASAN_ABI_VERSION 4
 #elif GCC_VERSION >= 40902
 #define KASAN_ABI_VERSION 3
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index e5c2181fee6f..03f4545b103d 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -53,6 +53,9 @@ struct kasan_global {
 #if KASAN_ABI_VERSION >= 4
 	struct kasan_source_location *location;
 #endif
+#if KASAN_ABI_VERSION >= 5
+	char *odr_indicator;
+#endif
 };
 
 /**
-- 
cgit v1.2.3


From 5cbc198ae08d84bd416b672ad8bd1222acd0855c Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 30 Nov 2016 15:54:19 -0800
Subject: mm: fix false-positive WARN_ON() in truncate/invalidate for hugetlb

Hugetlb pages have ->index in size of the huge pages (PMD_SIZE or
PUD_SIZE), not in PAGE_SIZE as other types of pages.  This means we
cannot user page_to_pgoff() to check whether we've got the right page
for the radix-tree index.

Let's introduce page_to_index() which would return radix-tree index for
given page.

We will be able to get rid of this once hugetlb will be switched to
multi-order entries.

Fixes: fc127da085c2 ("truncate: handle file thp")
Link: http://lkml.kernel.org/r/20161123093053.mjbnvn5zwxw5e6lk@black.fi.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Doug Nelson <doug.nelson@intel.com>
Tested-by: Doug Nelson <doug.nelson@intel.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: <stable@vger.kernel.org>	[4.8+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 21 +++++++++++++++------
 mm/truncate.c           |  8 ++++----
 2 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index dd15d39e1985..7dbe9148b2f8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -374,16 +374,13 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
 }
 
 /*
- * Get the offset in PAGE_SIZE.
- * (TODO: hugepage should have ->index in PAGE_SIZE)
+ * Get index of the page with in radix-tree
+ * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
  */
-static inline pgoff_t page_to_pgoff(struct page *page)
+static inline pgoff_t page_to_index(struct page *page)
 {
 	pgoff_t pgoff;
 
-	if (unlikely(PageHeadHuge(page)))
-		return page->index << compound_order(page);
-
 	if (likely(!PageTransTail(page)))
 		return page->index;
 
@@ -396,6 +393,18 @@ static inline pgoff_t page_to_pgoff(struct page *page)
 	return pgoff;
 }
 
+/*
+ * Get the offset in PAGE_SIZE.
+ * (TODO: hugepage should have ->index in PAGE_SIZE)
+ */
+static inline pgoff_t page_to_pgoff(struct page *page)
+{
+	if (unlikely(PageHeadHuge(page)))
+		return page->index << compound_order(page);
+
+	return page_to_index(page);
+}
+
 /*
  * Return byte-offset into filesystem object for page.
  */
diff --git a/mm/truncate.c b/mm/truncate.c
index a01cce450a26..8d8c62d89e6d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -283,7 +283,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 
 			if (!trylock_page(page))
 				continue;
-			WARN_ON(page_to_pgoff(page) != index);
+			WARN_ON(page_to_index(page) != index);
 			if (PageWriteback(page)) {
 				unlock_page(page);
 				continue;
@@ -371,7 +371,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			}
 
 			lock_page(page);
-			WARN_ON(page_to_pgoff(page) != index);
+			WARN_ON(page_to_index(page) != index);
 			wait_on_page_writeback(page);
 			truncate_inode_page(mapping, page);
 			unlock_page(page);
@@ -492,7 +492,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 			if (!trylock_page(page))
 				continue;
 
-			WARN_ON(page_to_pgoff(page) != index);
+			WARN_ON(page_to_index(page) != index);
 
 			/* Middle of THP: skip */
 			if (PageTransTail(page)) {
@@ -612,7 +612,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 			}
 
 			lock_page(page);
-			WARN_ON(page_to_pgoff(page) != index);
+			WARN_ON(page_to_index(page) != index);
 			if (page->mapping != mapping) {
 				unlock_page(page);
 				continue;
-- 
cgit v1.2.3