From 7b785645e8f13e17cbce492708cf6e7039d32e46 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 24 May 2019 10:12:46 -0400 Subject: mm: fix page cache convergence regression Since a28334862993 ("page cache: Finish XArray conversion"), on most major Linux distributions, the page cache doesn't correctly transition when the hot data set is changing, and leaves the new pages thrashing indefinitely instead of kicking out the cold ones. On a freshly booted, freshly ssh'd into virtual machine with 1G RAM running stock Arch Linux: [root@ham ~]# ./reclaimtest.sh + dd of=workingset-a bs=1M count=0 seek=600 + cat workingset-a + cat workingset-a + cat workingset-a + cat workingset-a + cat workingset-a + cat workingset-a + cat workingset-a + cat workingset-a + ./mincore workingset-a 153600/153600 workingset-a + dd of=workingset-b bs=1M count=0 seek=600 + cat workingset-b + cat workingset-b + cat workingset-b + cat workingset-b + ./mincore workingset-a workingset-b 104029/153600 workingset-a 120086/153600 workingset-b + cat workingset-b + cat workingset-b + cat workingset-b + cat workingset-b + ./mincore workingset-a workingset-b 104029/153600 workingset-a 120268/153600 workingset-b workingset-b is a 600M file on a 1G host that is otherwise entirely idle. No matter how often it's being accessed, it won't get cached. While investigating, I noticed that the non-resident information gets aggressively reclaimed - /proc/vmstat::workingset_nodereclaim. This is a problem because a workingset transition like this relies on the non-resident information tracked in the page cache tree of evicted file ranges: when the cache faults are refaults of recently evicted cache, we challenge the existing active set, and that allows a new workingset to establish itself. Tracing the shrinker that maintains this memory revealed that all page cache tree nodes were allocated to the root cgroup. This is a problem, because 1) the shrinker sizes the amount of non-resident information it keeps to the size of the cgroup's other memory and 2) on most major Linux distributions, only kernel threads live in the root cgroup and everything else gets put into services or session groups: [root@ham ~]# cat /proc/self/cgroup 0::/user.slice/user-0.slice/session-c1.scope As a result, we basically maintain no non-resident information for the workloads running on the system, thus breaking the caching algorithm. Looking through the code, I found the culprit in the above-mentioned patch: when switching from the radix tree to xarray, it dropped the __GFP_ACCOUNT flag from the tree node allocations - the flag that makes sure the allocated memory gets charged to and tracked by the cgroup of the calling process - in this case, the one doing the fault. To fix this, allow xarray users to specify a per-tree flag that makes xarray allocate nodes using __GFP_ACCOUNT. Then restore the page cache tree annotation to request such cgroup tracking for the cache nodes.
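For illustration, a minimal sketch of how a tree opts into the new cgroup-accounted node allocations. (The page cache side of the fix is trimmed from the diff below by the 'include/linux' filter, so this stand-alone xarray user and its init function are assumptions, not part of the patch.)

	#include <linux/xarray.h>

	static struct xarray accounted_tree;	/* hypothetical user */

	static void accounted_tree_init(void)
	{
		/*
		 * XA_FLAGS_ACCOUNT makes the internal xa_node allocations
		 * use __GFP_ACCOUNT, charging them to the cgroup of the
		 * allocating task - e.g. the process taking a page fault.
		 */
		xa_init_flags(&accounted_tree, XA_FLAGS_ACCOUNT);
	}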
With this patch applied, the page cache correctly converges on new workingsets again after just a few iterations: [root@ham ~]# ./reclaimtest.sh + dd of=workingset-a bs=1M count=0 seek=600 + cat workingset-a + cat workingset-a + cat workingset-a + cat workingset-a + cat workingset-a + cat workingset-a + cat workingset-a + cat workingset-a + ./mincore workingset-a 153600/153600 workingset-a + dd of=workingset-b bs=1M count=0 seek=600 + cat workingset-b + ./mincore workingset-a workingset-b 124607/153600 workingset-a 87876/153600 workingset-b + cat workingset-b + ./mincore workingset-a workingset-b 81313/153600 workingset-a 133321/153600 workingset-b + cat workingset-b + ./mincore workingset-a workingset-b 63036/153600 workingset-a 153600/153600 workingset-b Cc: stable@vger.kernel.org # 4.20+ Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/xarray.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 0e01e6129145..5921599b6dc4 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -265,6 +265,7 @@ enum xa_lock_type { #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) +#define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) -- cgit v1.2.3 From 191f5c2ed4b6fabacf1f3500242047bd844d0c3a Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Mon, 10 Jun 2019 06:24:04 +0000 Subject: mtd: spi-nor: use 16-bit WRR command when QE is set on spansion flashes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SPI memory devices from different manufacturers have widely different configurations for Status, Control and Configuration registers. JEDEC 216C defines a new map for these common register bits and their functions, and describes how the individual bits may be accessed for a specific device. For the JEDEC 216B compliant flashes, we can partially deduce Status and Configuration register functions by inspecting the 16th DWORD of BFPT. Older flashes that don't declare the SFDP tables (SPANSION FL512SAIFG1 311QQ063 A ©11 SPANSION) let the software decide how to interact with these registers. The commit dcb4b22eeaf4 ("spi-nor: s25fl512s supports region locking") uncovered a probe error for s25fl512s, when the Quad Enable bit CR[1] was set to one in the bootloader. When this bit is one, only the Write Status (01h) command with two data bytes may be used; the 01h command with one data byte is not recognized, hence the error when trying to clear the block protection bits. Fix the above by using the Write Status (01h) command with two data bytes when the Quad Enable bit is one. Backward compatibility should be fine. The newly introduced spi_nor_spansion_clear_sr_bp() is tightly coupled with the spansion_quad_enable() function. Both assume that the Write Register with 16 bits, together with the Read Configuration Register (35h) instructions are supported.
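For reference, a minimal sketch of the 16-bit Write Status Register cycle described above - a single Write Status (01h) instruction carrying SR[7:0] followed by CR[7:0]. It mirrors the driver's existing write_sr_cr() helper; the write_enable() helper and the nor->write_reg() op signature are assumed to match spi-nor.c of this era:

	static int wrr_16bit_sketch(struct spi_nor *nor, u8 sr, u8 cr)
	{
		u8 sr_cr[2] = { sr, cr };	/* SR first, then CR */

		write_enable(nor);
		/*
		 * One 01h cycle with two data bytes. While CR[1] (Quad
		 * Enable) is set, a one-byte 01h is not recognized.
		 */
		return nor->write_reg(nor, SPINOR_OP_WRSR, sr_cr, 2);
	}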
Fixes: dcb4b22eeaf44f91 ("spi-nor: s25fl512s supports region locking") Reported-by: Geert Uytterhoeven Signed-off-by: Tudor Ambarus Tested-by: Jonas Bonn Tested-by: Geert Uytterhoeven Reviewed-by: Vignesh Raghavendra Tested-by: Vignesh Raghavendra Signed-off-by: Miquel Raynal --- drivers/mtd/spi-nor/spi-nor.c | 119 ++++++++++++++++++++++++++++++++++++++---- include/linux/mtd/spi-nor.h | 3 ++ 2 files changed, 111 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 73172d7f512b..0c2ec1c21434 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -1636,6 +1636,95 @@ static int sr2_bit7_quad_enable(struct spi_nor *nor) return 0; } +/** + * spi_nor_clear_sr_bp() - clear the Status Register Block Protection bits. + * @nor: pointer to a 'struct spi_nor' + * + * Read-modify-write function that clears the Block Protection bits from the + * Status Register without affecting other bits. + * + * Return: 0 on success, -errno otherwise. + */ +static int spi_nor_clear_sr_bp(struct spi_nor *nor) +{ + int ret; + u8 mask = SR_BP2 | SR_BP1 | SR_BP0; + + ret = read_sr(nor); + if (ret < 0) { + dev_err(nor->dev, "error while reading status register\n"); + return ret; + } + + write_enable(nor); + + ret = write_sr(nor, ret & ~mask); + if (ret) { + dev_err(nor->dev, "write to status register failed\n"); + return ret; + } + + ret = spi_nor_wait_till_ready(nor); + if (ret) + dev_err(nor->dev, "timeout while writing status register\n"); + return ret; +} + +/** + * spi_nor_spansion_clear_sr_bp() - clear the Status Register Block Protection + * bits on spansion flashes. + * @nor: pointer to a 'struct spi_nor' + * + * Read-modify-write function that clears the Block Protection bits from the + * Status Register without affecting other bits. The function is tightly + * coupled with the spansion_quad_enable() function. Both assume that the Write + * Register with 16 bits, together with the Read Configuration Register (35h) + * instructions are supported. + * + * Return: 0 on success, -errno otherwise. + */ +static int spi_nor_spansion_clear_sr_bp(struct spi_nor *nor) +{ + int ret; + u8 mask = SR_BP2 | SR_BP1 | SR_BP0; + u8 sr_cr[2] = {0}; + + /* Check current Quad Enable bit value. */ + ret = read_cr(nor); + if (ret < 0) { + dev_err(nor->dev, + "error while reading configuration register\n"); + return ret; + } + + /* + * When the configuration register Quad Enable bit is one, only the + * Write Status (01h) command with two data bytes may be used. + */ + if (ret & CR_QUAD_EN_SPAN) { + sr_cr[1] = ret; + + ret = read_sr(nor); + if (ret < 0) { + dev_err(nor->dev, + "error while reading status register\n"); + return ret; + } + sr_cr[0] = ret & ~mask; + + ret = write_sr_cr(nor, sr_cr); + if (ret) + dev_err(nor->dev, "16-bit write register failed\n"); + return ret; + } + + /* + * If the Quad Enable bit is zero, use the Write Status (01h) command + * with one data byte. + */ + return spi_nor_clear_sr_bp(nor); +} + /* Used when the "_ext_id" is two bytes at most */ #define INFO(_jedec_id, _ext_id, _sector_size, _n_sectors, _flags) \ .id = { \ @@ -3660,6 +3749,8 @@ static int spi_nor_init_params(struct spi_nor *nor, default: /* Kept only for backward compatibility purpose. 
*/ params->quad_enable = spansion_quad_enable; + if (nor->clear_sr_bp) + nor->clear_sr_bp = spi_nor_spansion_clear_sr_bp; break; } @@ -3912,17 +4003,13 @@ static int spi_nor_init(struct spi_nor *nor) { int err; - /* - * Atmel, SST, Intel/Numonyx, and others serial NOR tend to power up - * with the software protection bits set - */ - if (JEDEC_MFR(nor->info) == SNOR_MFR_ATMEL || - JEDEC_MFR(nor->info) == SNOR_MFR_INTEL || - JEDEC_MFR(nor->info) == SNOR_MFR_SST || - nor->info->flags & SPI_NOR_HAS_LOCK) { - write_enable(nor); - write_sr(nor, 0); - spi_nor_wait_till_ready(nor); + if (nor->clear_sr_bp) { + err = nor->clear_sr_bp(nor); + if (err) { + dev_err(nor->dev, + "fail to clear block protection bits\n"); + return err; + } } if (nor->quad_enable) { @@ -4047,6 +4134,16 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, if (info->flags & SPI_S3AN) nor->flags |= SNOR_F_READY_XSR_RDY; + /* + * Atmel, SST, Intel/Numonyx, and others serial NOR tend to power up + * with the software protection bits set. + */ + if (JEDEC_MFR(nor->info) == SNOR_MFR_ATMEL || + JEDEC_MFR(nor->info) == SNOR_MFR_INTEL || + JEDEC_MFR(nor->info) == SNOR_MFR_SST || + nor->info->flags & SPI_NOR_HAS_LOCK) + nor->clear_sr_bp = spi_nor_clear_sr_bp; + /* Parse the Serial Flash Discoverable Parameters table. */ ret = spi_nor_init_params(nor, &params); if (ret) diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index b3d360b0ee3d..9f57cdfcc93d 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -373,6 +373,8 @@ struct flash_info; * @flash_unlock: [FLASH-SPECIFIC] unlock a region of the SPI NOR * @flash_is_locked: [FLASH-SPECIFIC] check if a region of the SPI NOR is * @quad_enable: [FLASH-SPECIFIC] enables SPI NOR quad mode + * @clear_sr_bp: [FLASH-SPECIFIC] clears the Block Protection Bits from + * the SPI NOR Status Register. * completely locked * @priv: the private data */ @@ -410,6 +412,7 @@ struct spi_nor { int (*flash_unlock)(struct spi_nor *nor, loff_t ofs, uint64_t len); int (*flash_is_locked)(struct spi_nor *nor, loff_t ofs, uint64_t len); int (*quad_enable)(struct spi_nor *nor); + int (*clear_sr_bp)(struct spi_nor *nor); void *priv; }; -- cgit v1.2.3 From e321d02db87af7840da29ef833a2a71fc0eab198 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 28 May 2019 15:08:30 -0700 Subject: perf/x86: Disable extended registers for non-supported PMUs The perf fuzzer caused a Skylake machine to crash: [ 9680.085831] Call Trace: [ 9680.088301] <IRQ> [ 9680.090363] perf_output_sample_regs+0x43/0xa0 [ 9680.094928] perf_output_sample+0x3aa/0x7a0 [ 9680.099181] perf_event_output_forward+0x53/0x80 [ 9680.103917] __perf_event_overflow+0x52/0xf0 [ 9680.108266] ? perf_trace_run_bpf_submit+0xc0/0xc0 [ 9680.113108] perf_swevent_hrtimer+0xe2/0x150 [ 9680.117475] ? check_preempt_wakeup+0x181/0x230 [ 9680.122091] ? check_preempt_curr+0x62/0x90 [ 9680.126361] ? ttwu_do_wakeup+0x19/0x140 [ 9680.130355] ? try_to_wake_up+0x54/0x460 [ 9680.134366] ? reweight_entity+0x15b/0x1a0 [ 9680.138559] ? __queue_work+0x103/0x3f0 [ 9680.142472] ? update_dl_rq_load_avg+0x1cd/0x270 [ 9680.147194] ? timerqueue_del+0x1e/0x40 [ 9680.151092] ? __remove_hrtimer+0x35/0x70 [ 9680.155191] __hrtimer_run_queues+0x100/0x280 [ 9680.159658] hrtimer_interrupt+0x100/0x220 [ 9680.163835] smp_apic_timer_interrupt+0x6a/0x140 [ 9680.168555] apic_timer_interrupt+0xf/0x20 [ 9680.172756] </IRQ> The XMM registers can only be collected by PEBS hardware events on the platforms with PEBS baseline support, e.g. Icelake, not software/probe events.
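To make the failure mode concrete, a hypothetical userspace sketch (illustrative only, not taken from the report) of the kind of request the fuzzer generates - a software event asked to sample XMM registers, which only PEBS hardware events can supply:

	#include <linux/perf_event.h>
	#include <asm/perf_regs.h>

	struct perf_event_attr attr = {
		.type		  = PERF_TYPE_SOFTWARE,
		.config		  = PERF_COUNT_SW_CPU_CLOCK,
		.size		  = sizeof(struct perf_event_attr),
		.sample_type	  = PERF_SAMPLE_REGS_INTR,
		/* XMM0 sits above the GPRs in the x86 sample-regs ABI */
		.sample_regs_intr = 1ULL << PERF_REG_X86_XMM0,
	};

With the change below, perf_event_open() rejects such an attr with EOPNOTSUPP on any PMU that does not advertise PERF_PMU_CAP_EXTENDED_REGS, instead of letting the sample output path touch register state the event can never have.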
Add the capability flag PERF_PMU_CAP_EXTENDED_REGS to indicate PMUs that support extended registers. For X86, the extended registers are XMM registers. Add has_extended_regs() to check if extended registers are requested. The generic code defines the mask of extended registers as 0 if the arch headers haven't overridden it. Originally-by: Peter Zijlstra (Intel) Reported-by: Vince Weaver Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Fixes: 878068ea270e ("perf/x86: Support outputting XMM registers") Link: https://lkml.kernel.org/r/1559081314-9714-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 1 + arch/x86/include/uapi/asm/perf_regs.h | 3 +++ include/linux/perf_event.h | 1 + include/linux/perf_regs.h | 8 ++++++++ kernel/events/core.c | 18 ++++++++++++++---- 5 files changed, 27 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7acc526b4ad2..6cb38ab02c8a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2020,6 +2020,7 @@ void __init intel_ds_init(void) PERF_SAMPLE_TIME; x86_pmu.flags |= PMU_FL_PEBS_ALL; pebs_qual = "-baseline"; + x86_get_pmu()->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; } else { /* Only basic record supported */ x86_pmu.pebs_no_xmm_regs = 1; diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h index ac67bbea10ca..7c9d2bb3833b 100644 --- a/arch/x86/include/uapi/asm/perf_regs.h +++ b/arch/x86/include/uapi/asm/perf_regs.h @@ -52,4 +52,7 @@ enum perf_event_x86_regs { /* These include both GPRs and XMMX registers */ PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2, }; + +#define PERF_REG_EXTENDED_MASK (~((1ULL << PERF_REG_X86_XMM0) - 1)) + #endif /* _ASM_X86_PERF_REGS_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0ab99c7b652d..2bca72f3028b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -241,6 +241,7 @@ struct perf_event; #define PERF_PMU_CAP_NO_INTERRUPT 0x01 #define PERF_PMU_CAP_NO_NMI 0x02 #define PERF_PMU_CAP_AUX_NO_SG 0x04 +#define PERF_PMU_CAP_EXTENDED_REGS 0x08 #define PERF_PMU_CAP_EXCLUSIVE 0x10 #define PERF_PMU_CAP_ITRACE 0x20 #define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x40 diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h index 476747456bca..2d12e97d5e7b 100644 --- a/include/linux/perf_regs.h +++ b/include/linux/perf_regs.h @@ -11,6 +11,11 @@ struct perf_regs { #ifdef CONFIG_HAVE_PERF_REGS #include <asm/perf_regs.h> + +#ifndef PERF_REG_EXTENDED_MASK +#define PERF_REG_EXTENDED_MASK 0 +#endif + u64 perf_reg_value(struct pt_regs *regs, int idx); int perf_reg_validate(u64 mask); u64 perf_reg_abi(struct task_struct *task); @@ -18,6 +23,9 @@ void perf_get_regs_user(struct perf_regs *regs_user, struct pt_regs *regs, struct pt_regs *regs_user_copy); #else + +#define PERF_REG_EXTENDED_MASK 0 + static inline u64 perf_reg_value(struct pt_regs *regs, int idx) { return 0; diff --git a/kernel/events/core.c b/kernel/events/core.c index 8d1c62df20a7..f85929ce13be 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10036,6 +10036,12 @@ void perf_pmu_unregister(struct pmu *pmu) } EXPORT_SYMBOL_GPL(perf_pmu_unregister); +static inline bool has_extended_regs(struct perf_event *event) +{ + return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); +} + static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; @@ -10067,12 +10073,16 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) perf_event_ctx_unlock(event->group_leader, ctx); if (!ret) { + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) + ret = -EOPNOTSUPP; + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && - event_has_any_exclude_flag(event)) { - if (event->destroy) - event->destroy(event); + event_has_any_exclude_flag(event)) ret = -EINVAL; - } + + if (ret && event->destroy) + event->destroy(event); } if (ret) -- cgit v1.2.3 From b12bbdc5dd883f6575f57e529af26cd2c521b320 Mon Sep 17 00:00:00 2001 From: Hyungwoo Yang Date: Wed, 5 Jun 2019 21:52:27 -0700 Subject: HID: intel-ish-hid: fix wrong driver_data usage Currently, in suspend() and resume(), ishtp client drivers are using driver_data to get the "struct ishtp_cl_device" object which is set by the bus driver. This is wrong since driver_data should not be owned by the bus; driver_data should be owned by the corresponding ishtp client driver. Due to this, an ishtp client driver like cros_ec_ishtp, which uses its driver_data to transfer its data to its child, doesn't work correctly. So this patch removes setting driver_data in the bus driver and, since "struct device" is embedded in "struct ishtp_cl_device", introduces a helper function that returns the "struct ishtp_cl_device" from a "struct device". Signed-off-by: Hyungwoo Yang Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ishtp-hid-client.c | 4 ++-- drivers/hid/intel-ish-hid/ishtp/bus.c | 15 ++++++++++++++- include/linux/intel-ish-client-if.h | 1 + 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/intel-ish-hid/ishtp-hid-client.c b/drivers/hid/intel-ish-hid/ishtp-hid-client.c index 56777a43e69c..19102a3be4ca 100644 --- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c +++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c @@ -899,7 +899,7 @@ static int hid_ishtp_cl_reset(struct ishtp_cl_device *cl_device) */ static int hid_ishtp_cl_suspend(struct device *device) { - struct ishtp_cl_device *cl_device = dev_get_drvdata(device); + struct ishtp_cl_device *cl_device = ishtp_dev_to_cl_device(device); struct ishtp_cl *hid_ishtp_cl = ishtp_get_drvdata(cl_device); struct ishtp_cl_data *client_data = ishtp_get_client_data(hid_ishtp_cl); @@ -920,7 +920,7 @@ static int hid_ishtp_cl_suspend(struct device *device) */ static int hid_ishtp_cl_resume(struct device *device) { - struct ishtp_cl_device *cl_device = dev_get_drvdata(device); + struct ishtp_cl_device *cl_device = ishtp_dev_to_cl_device(device); struct ishtp_cl *hid_ishtp_cl = ishtp_get_drvdata(cl_device); struct ishtp_cl_data *client_data = ishtp_get_client_data(hid_ishtp_cl); diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c index fb8ca12955b4..4b4a6047dc72 100644 --- a/drivers/hid/intel-ish-hid/ishtp/bus.c +++ b/drivers/hid/intel-ish-hid/ishtp/bus.c @@ -479,7 +479,6 @@ static struct ishtp_cl_device *ishtp_bus_add_device(struct ishtp_device *dev, } ishtp_device_ready = true; - dev_set_drvdata(&device->dev, device); return device; } @@ -647,6 +646,20 @@ void *ishtp_get_drvdata(struct ishtp_cl_device *cl_device) } EXPORT_SYMBOL(ishtp_get_drvdata); +/** + * ishtp_dev_to_cl_device() - get
ishtp_cl_device instance from device instance + * @device: device instance + * + * Get ish_cl_device instance which embeds device instance in it. + * + * Return: pointer to ishtp_cl_device instance + */ +struct ishtp_cl_device *ishtp_dev_to_cl_device(struct device *device) +{ + return to_ishtp_cl_device(device); +} +EXPORT_SYMBOL(ishtp_dev_to_cl_device); + /** * ishtp_bus_new_client() - Create a new client * @dev: ISHTP device instance diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index 16255c2ca2f4..0d6b4bc191c5 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -103,6 +103,7 @@ void ishtp_put_device(struct ishtp_cl_device *cl_dev); void ishtp_get_device(struct ishtp_cl_device *cl_dev); void ishtp_set_drvdata(struct ishtp_cl_device *cl_device, void *data); void *ishtp_get_drvdata(struct ishtp_cl_device *cl_device); +struct ishtp_cl_device *ishtp_dev_to_cl_device(struct device *dev); int ishtp_register_event_cb(struct ishtp_cl_device *device, void (*read_cb)(struct ishtp_cl_device *)); struct ishtp_fw_client *ishtp_fw_cl_get_client(struct ishtp_device *dev, -- cgit v1.2.3 From 471a739a47aa7d582f0cdf9d392957d04632bae2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Jun 2019 00:20:23 +0200 Subject: PCI: PM: Avoid skipping bus-level PM on platforms without ACPI There are platforms that do not call pm_set_suspend_via_firmware(), so pm_suspend_via_firmware() returns 'false' on them, but the power states of PCI devices (PCIe ports in particular) are changed as a result of powering down core platform components during system-wide suspend. Thus the pm_suspend_via_firmware() checks in pci_pm_suspend_noirq() and pci_pm_resume_noirq() introduced by commit 3e26c5feed2a ("PCI: PM: Skip devices in D0 for suspend-to-idle") are not sufficient to determine that devices left in D0 during suspend will remain in D0 during resume and so the bus-level power management can be skipped for them. For this reason, introduce a new global suspend flag, PM_SUSPEND_FLAG_NO_PLATFORM, set it for suspend-to-idle only and replace the pm_suspend_via_firmware() checks mentioned above with checks against this flag. Fixes: 3e26c5feed2a ("PCI: PM: Skip devices in D0 for suspend-to-idle") Reported-by: Jon Hunter Tested-by: Jon Hunter Signed-off-by: Rafael J. Wysocki Tested-by: Mika Westerberg Reviewed-by: Mika Westerberg --- drivers/pci/pci-driver.c | 8 ++++---- include/linux/suspend.h | 26 ++++++++++++++++++++++++-- kernel/power/suspend.c | 3 +++ 3 files changed, 31 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 98af9ecd4a90..ca3793002e2f 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -859,7 +859,7 @@ static int pci_pm_suspend_noirq(struct device *dev) pci_dev->bus->self->skip_bus_pm = true; } - if (pci_dev->skip_bus_pm && !pm_suspend_via_firmware()) { + if (pci_dev->skip_bus_pm && pm_suspend_no_platform()) { dev_dbg(dev, "PCI PM: Skipped\n"); goto Fixup; } @@ -914,10 +914,10 @@ static int pci_pm_resume_noirq(struct device *dev) /* * In the suspend-to-idle case, devices left in D0 during suspend will * stay in D0, so it is not necessary to restore or update their - * configuration here and attempting to put them into D0 again may - * confuse some firmware, so avoid doing that. + * configuration here and attempting to put them into D0 again is + * pointless, so avoid doing that.
*/ - if (!pci_dev->skip_bus_pm || pm_suspend_via_firmware()) + if (!(pci_dev->skip_bus_pm && pm_suspend_no_platform())) pci_pm_default_resume_early(pci_dev); pci_fixup_device(pci_fixup_resume_early, pci_dev); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 8594001e8be8..f0d262ad7b78 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -209,8 +209,9 @@ extern int suspend_valid_only_mem(suspend_state_t state); extern unsigned int pm_suspend_global_flags; -#define PM_SUSPEND_FLAG_FW_SUSPEND (1 << 0) -#define PM_SUSPEND_FLAG_FW_RESUME (1 << 1) +#define PM_SUSPEND_FLAG_FW_SUSPEND BIT(0) +#define PM_SUSPEND_FLAG_FW_RESUME BIT(1) +#define PM_SUSPEND_FLAG_NO_PLATFORM BIT(2) static inline void pm_suspend_clear_flags(void) { @@ -227,6 +228,11 @@ static inline void pm_set_resume_via_firmware(void) pm_suspend_global_flags |= PM_SUSPEND_FLAG_FW_RESUME; } +static inline void pm_set_suspend_no_platform(void) +{ + pm_suspend_global_flags |= PM_SUSPEND_FLAG_NO_PLATFORM; +} + /** * pm_suspend_via_firmware - Check if platform firmware will suspend the system. * @@ -268,6 +274,22 @@ static inline bool pm_resume_via_firmware(void) return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_FW_RESUME); } +/** + * pm_suspend_no_platform - Check if platform may change device power states. + * + * To be called during system-wide power management transitions to sleep states + * or during the subsequent system-wide transitions back to the working state. + * + * Return 'true' if the power states of devices remain under full control of the + * kernel throughout the system-wide suspend and resume cycle in progress (that + * is, if a device is put into a certain power state during suspend, it can be + * expected to remain in that state during resume). + */ +static inline bool pm_suspend_no_platform(void) +{ + return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_NO_PLATFORM); +} + /* Suspend-to-idle state machnine. */ enum s2idle_states { S2IDLE_STATE_NONE, /* Not suspended/suspending. */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 9505101ed2bc..096211299c07 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -493,6 +493,9 @@ int suspend_devices_and_enter(suspend_state_t state) pm_suspend_target_state = state; + if (state == PM_SUSPEND_TO_IDLE) + pm_set_suspend_no_platform(); + error = platform_suspend_begin(state); if (error) goto Close; -- cgit v1.2.3 From 36d6cb73d5e60cdb2724c21ccba93c964e6a16f9 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 28 Jun 2019 12:06:37 -0700 Subject: mm/dev_pfn: exclude MEMORY_DEVICE_PRIVATE while computing virtual address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The presence of struct page does not guarantee linear mapping for the pfn physical range. Device private memory which is non-coherent is excluded from linear mapping during devm_memremap_pages() though they will still have struct page coverage. Change pfn_t_to_virt() to just check for device private memory before giving out virtual address for a given pfn. pfn_t_to_virt() actually has no callers. Let's fix it for the 5.2 kernel and remove it in 5.3. 
Link: http://lkml.kernel.org/r/1558089514-25067-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Dan Williams Cc: Jérôme Glisse Cc: Laurent Dufour Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn_t.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 7bb77850c65a..3c202a11a79e 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -68,7 +68,7 @@ static inline phys_addr_t pfn_t_to_phys(pfn_t pfn) static inline void *pfn_t_to_virt(pfn_t pfn) { - if (pfn_t_has_page(pfn)) + if (pfn_t_has_page(pfn) && !is_device_private_page(pfn_t_to_page(pfn))) return __va(pfn_t_to_phys(pfn)); return NULL; } -- cgit v1.2.3 From 97abc889ee296faf95ca0e978340fb7b942a3e32 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 28 Jun 2019 12:06:50 -0700 Subject: signal: remove the wrong signal_pending() check in restore_user_sigmask() This is the minimal fix for stable, I'll send cleanups later. Commit 854a6ed56839 ("signal: Add restore_user_sigmask()") introduced the visible change which breaks user-space: a signal temporarily unblocked by set_user_sigmask() can be delivered even if the caller returns success or timeout. Change restore_user_sigmask() to accept the additional "interrupted" argument which should be used instead of the signal_pending() check, and update the callers. Eric said: : For clarity. I don't think this is required by posix, or fundamentally to : remove the races in select. It is what linux has always done and we have : applications who care so I agree this fix is needed. : : Further in any case where the semantic change that this patch rolls back : (aka where allowing a signal to be delivered and the select like call to : complete) would be advantage we can do as well if not better by using : signalfd. : : Michael is there any chance we can get this guarantee of the linux : implementation of pselect and friends clearly documented. The guarantee : that if the system call completes successfully we are guaranteed that no : signal that is unblocked by using sigmask will be delivered? Link: http://lkml.kernel.org/r/20190604134117.GA29963@redhat.com Fixes: 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()") Signed-off-by: Oleg Nesterov Reported-by: Eric Wong Tested-by: Eric Wong Acked-by: "Eric W. Biederman" Acked-by: Arnd Bergmann Acked-by: Deepa Dinamani Cc: Michael Kerrisk Cc: Jens Axboe Cc: Davidlohr Bueso Cc: Jason Baron Cc: Thomas Gleixner Cc: Al Viro Cc: David Laight Cc: [5.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 28 ++++++++++++++++++++-------- fs/eventpoll.c | 4 ++-- fs/io_uring.c | 7 ++++--- fs/select.c | 18 ++++++------------ include/linux/signal.h | 2 +- kernel/signal.c | 5 +++-- 6 files changed, 36 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/fs/aio.c b/fs/aio.c index 3490d1fa0e16..c1e581dd32f5 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2095,6 +2095,7 @@ SYSCALL_DEFINE6(io_pgetevents, struct __aio_sigset ksig = { NULL, }; sigset_t ksigmask, sigsaved; struct timespec64 ts; + bool interrupted; int ret; if (timeout && unlikely(get_timespec64(&ts, timeout))) @@ -2108,8 +2109,10 @@ SYSCALL_DEFINE6(io_pgetevents, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ?
&ts : NULL); - restore_user_sigmask(ksig.sigmask, &sigsaved); - if (signal_pending(current) && !ret) + + interrupted = signal_pending(current); + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted); + if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; @@ -2128,6 +2131,7 @@ SYSCALL_DEFINE6(io_pgetevents_time32, struct __aio_sigset ksig = { NULL, }; sigset_t ksigmask, sigsaved; struct timespec64 ts; + bool interrupted; int ret; if (timeout && unlikely(get_old_timespec32(&ts, timeout))) @@ -2142,8 +2146,10 @@ SYSCALL_DEFINE6(io_pgetevents_time32, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); - restore_user_sigmask(ksig.sigmask, &sigsaved); - if (signal_pending(current) && !ret) + + interrupted = signal_pending(current); + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted); + if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; @@ -2193,6 +2199,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, struct __compat_aio_sigset ksig = { NULL, }; sigset_t ksigmask, sigsaved; struct timespec64 t; + bool interrupted; int ret; if (timeout && get_old_timespec32(&t, timeout)) @@ -2206,8 +2213,10 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); - restore_user_sigmask(ksig.sigmask, &sigsaved); - if (signal_pending(current) && !ret) + + interrupted = signal_pending(current); + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted); + if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; @@ -2226,6 +2235,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, struct __compat_aio_sigset ksig = { NULL, }; sigset_t ksigmask, sigsaved; struct timespec64 t; + bool interrupted; int ret; if (timeout && get_timespec64(&t, timeout)) @@ -2239,8 +2249,10 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); - restore_user_sigmask(ksig.sigmask, &sigsaved); - if (signal_pending(current) && !ret) + + interrupted = signal_pending(current); + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted); + if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c6f513100cc9..4c74c768ae43 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2325,7 +2325,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, error = do_epoll_wait(epfd, events, maxevents, timeout); - restore_user_sigmask(sigmask, &sigsaved); + restore_user_sigmask(sigmask, &sigsaved, error == -EINTR); return error; } @@ -2350,7 +2350,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, err = do_epoll_wait(epfd, events, maxevents, timeout); - restore_user_sigmask(sigmask, &sigsaved); + restore_user_sigmask(sigmask, &sigsaved, err == -EINTR); return err; } diff --git a/fs/io_uring.c b/fs/io_uring.c index 86a2bd721900..e6981d3f4468 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2201,11 +2201,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, } ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events); - if (ret == -ERESTARTSYS) - ret = -EINTR; if (sig) - restore_user_sigmask(sig, &sigsaved); + restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS); + + if (ret == -ERESTARTSYS) + ret = -EINTR; return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? 
ret : 0; } diff --git a/fs/select.c b/fs/select.c index 6cbc9ff56ba0..a4d8f6e8b63c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -758,10 +758,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, return ret; ret = core_sys_select(n, inp, outp, exp, to); + restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND); ret = poll_select_copy_remaining(&end_time, tsp, type, ret); - restore_user_sigmask(sigmask, &sigsaved); - return ret; } @@ -1106,8 +1105,7 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, ret = do_sys_poll(ufds, nfds, to); - restore_user_sigmask(sigmask, &sigsaved); - + restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); /* We can restart this syscall, usually */ if (ret == -EINTR) ret = -ERESTARTNOHAND; @@ -1142,8 +1140,7 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, ret = do_sys_poll(ufds, nfds, to); - restore_user_sigmask(sigmask, &sigsaved); - + restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); /* We can restart this syscall, usually */ if (ret == -EINTR) ret = -ERESTARTNOHAND; @@ -1350,10 +1347,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); + restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND); ret = poll_select_copy_remaining(&end_time, tsp, type, ret); - restore_user_sigmask(sigmask, &sigsaved); - return ret; } @@ -1425,8 +1421,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, ret = do_sys_poll(ufds, nfds, to); - restore_user_sigmask(sigmask, &sigsaved); - + restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); /* We can restart this syscall, usually */ if (ret == -EINTR) ret = -ERESTARTNOHAND; @@ -1461,8 +1456,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, ret = do_sys_poll(ufds, nfds, to); - restore_user_sigmask(sigmask, &sigsaved); - + restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); /* We can restart this syscall, usually */ if (ret == -EINTR) ret = -ERESTARTNOHAND; diff --git a/include/linux/signal.h b/include/linux/signal.h index 9702016734b1..78c2bb376954 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -276,7 +276,7 @@ extern int sigprocmask(int, sigset_t *, sigset_t *); extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set, sigset_t *oldset, size_t sigsetsize); extern void restore_user_sigmask(const void __user *usigmask, - sigset_t *sigsaved); + sigset_t *sigsaved, bool interrupted); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; diff --git a/kernel/signal.c b/kernel/signal.c index d622eac9d169..edf8915ddd54 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2912,7 +2912,8 @@ EXPORT_SYMBOL(set_compat_user_sigmask); * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed in from userland for the syscalls. */ -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) +void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved, + bool interrupted) { if (!usigmask) @@ -2922,7 +2923,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) * Restoring sigmask here can lead to delivering signals that the above * syscalls are intended to block because of the sigmask passed in. 
*/ - if (signal_pending(current)) { + if (interrupted) { current->saved_sigmask = *sigsaved; set_restore_sigmask(); return; -- cgit v1.2.3 From 8f9fab480c7a87b10bb5440b5555f370272a5d59 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Fri, 28 Jun 2019 12:07:21 -0700 Subject: linux/kernel.h: fix overflow for DIV_ROUND_UP_ULL DIV_ROUND_UP_ULL adds the two arguments and then invokes DIV_ROUND_DOWN_ULL. But on a 32-bit system the addition of two 32-bit values can overflow. DIV_ROUND_DOWN_ULL does it correctly because it stashes its argument in an unsigned long long, so cast the first addend to unsigned long long here so the addition is carried out in 64 bits, avoiding the overflow. For example, if ll is a 32-bit value of 0xFFFFFFFF and d is 16, the uncast sum wraps to 14 and the macro returns 0 instead of the expected 0x10000000. [akpm@linux-foundation.org: DIV_ROUND_UP_ULL must be an rval] Link: http://lkml.kernel.org/r/20190625100518.30753-1-vkoul@kernel.org Signed-off-by: Vinod Koul Reviewed-by: Andrew Morton Cc: Bjorn Andersson Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 74b1ee9027f5..0c9bc231107f 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -93,7 +93,8 @@ #define DIV_ROUND_DOWN_ULL(ll, d) \ ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) -#define DIV_ROUND_UP_ULL(ll, d) DIV_ROUND_DOWN_ULL((ll) + (d) - 1, (d)) +#define DIV_ROUND_UP_ULL(ll, d) \ + DIV_ROUND_DOWN_ULL((unsigned long long)(ll) + (d) - 1, (d)) #if BITS_PER_LONG == 32 # define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP_ULL(ll, d) -- cgit v1.2.3 From eef778c99c0239ed0a0696ddf22ae3673f28a489 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Jul 2019 15:14:45 -0700 Subject: devres: allow const resource arguments devm_ioremap_resource() does not currently take 'const' arguments, which results in a warning from the first driver trying to do it anyway: drivers/gpio/gpio-amd-fch.c: In function 'amd_fch_gpio_probe': drivers/gpio/gpio-amd-fch.c:171:49: error: passing argument 2 of 'devm_ioremap_resource' discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] priv->base = devm_ioremap_resource(&pdev->dev, &amd_fch_gpio_iores); ^~~~~~~~~~~~~~~~~~~ Change the prototype to allow it, as there is no real reason not to. Link: http://lkml.kernel.org/r/20190628150049.1108048-1-arnd@arndb.de Fixes: 9bb2e0452508 ("gpio: amd: Make resource struct const") Signed-off-by: Arnd Bergmann Acked-by: Greg Kroah-Hartman Reviewed-by: Enrico Weigelt Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: Linus Walleij Cc: Arnd Bergmann Cc: "Rafael J.
Wysocki" Cc: Ulf Hansson Cc: Andy Shevchenko Cc: Heikki Krogerus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/device.h | 3 ++- lib/devres.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 848fc71c6ba6..4a295e324ac5 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -704,7 +704,8 @@ extern unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order); extern void devm_free_pages(struct device *dev, unsigned long addr); -void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res); +void __iomem *devm_ioremap_resource(struct device *dev, + const struct resource *res); void __iomem *devm_of_iomap(struct device *dev, struct device_node *node, int index, diff --git a/lib/devres.c b/lib/devres.c index 69bed2f38306..6a0e9bd6524a 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -131,7 +131,8 @@ EXPORT_SYMBOL(devm_iounmap); * if (IS_ERR(base)) * return PTR_ERR(base); */ -void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res) +void __iomem *devm_ioremap_resource(struct device *dev, + const struct resource *res) { resource_size_t size; void __iomem *dest_ptr; -- cgit v1.2.3 From 69bf4b6b54fb7f52b7ea9ce28d4a360cd5ec956d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Jul 2019 19:55:18 -0700 Subject: Revert "mm: page cache: store only head pages in i_pages" This reverts commit 5fd4ca2d84b249f0858ce28cf637cf25b61a398f. Mikhail Gavrilov reports that it causes the VM_BUG_ON_PAGE() in __delete_from_swap_cache() to trigger: page:ffffd6d34dff0000 refcount:1 mapcount:1 mapping:ffff97812323a689 index:0xfecec363 anon flags: 0x17fffe00080034(uptodate|lru|active|swapbacked) raw: 0017fffe00080034 ffffd6d34c67c508 ffffd6d3504b8d48 ffff97812323a689 raw: 00000000fecec363 0000000000000000 0000000100000000 ffff978433ace000 page dumped because: VM_BUG_ON_PAGE(entry != page) page->mem_cgroup:ffff978433ace000 ------------[ cut here ]------------ kernel BUG at mm/swap_state.c:170! invalid opcode: 0000 [#1] SMP NOPTI CPU: 1 PID: 221 Comm: kswapd0 Not tainted 5.2.0-0.rc2.git0.1.fc31.x86_64 #1 Hardware name: System manufacturer System Product Name/ROG STRIX X470-I GAMING, BIOS 2202 04/11/2019 RIP: 0010:__delete_from_swap_cache+0x20d/0x240 Code: 30 65 48 33 04 25 28 00 00 00 75 4a 48 83 c4 38 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 c7 c6 2f dc 0f 8a 48 89 c7 e8 93 1b fd ff <0f> 0b 48 c7 c6 a8 74 0f 8a e8 85 1b fd ff 0f 0b 48 c7 c6 a8 7d 0f RSP: 0018:ffffa982036e7980 EFLAGS: 00010046 RAX: 0000000000000021 RBX: 0000000000000040 RCX: 0000000000000006 RDX: 0000000000000000 RSI: 0000000000000086 RDI: ffff97843d657900 RBP: 0000000000000001 R08: ffffa982036e7835 R09: 0000000000000535 R10: ffff97845e21a46c R11: ffffa982036e7835 R12: ffff978426387120 R13: 0000000000000000 R14: ffffd6d34dff0040 R15: ffffd6d34dff0000 FS: 0000000000000000(0000) GS:ffff97843d640000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00002cba88ef5000 CR3: 000000078a97c000 CR4: 00000000003406e0 Call Trace: delete_from_swap_cache+0x46/0xa0 try_to_free_swap+0xbc/0x110 swap_writepage+0x13/0x70 pageout.isra.0+0x13c/0x350 shrink_page_list+0xc14/0xdf0 shrink_inactive_list+0x1e5/0x3c0 shrink_node_memcg+0x202/0x760 shrink_node+0xe0/0x470 balance_pgdat+0x2d1/0x510 kswapd+0x220/0x420 kthread+0xfb/0x130 ret_from_fork+0x22/0x40 and it's not immediately obvious why it happens. 
It's too late in the rc cycle to do anything but revert for now. Link: https://lore.kernel.org/lkml/CABXGCsN9mYmBD-4GaaeW_NrDu+FDXLzr_6x+XNxfmFV6QkYCDg@mail.gmail.com/ Reported-and-bisected-by: Mikhail Gavrilov Suggested-by: Jan Kara Cc: Michal Hocko Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Kirill Shutemov Cc: William Kucharski Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 13 ----- mm/filemap.c | 146 +++++++++++++++++++++++++++++------------------- mm/huge_memory.c | 3 - mm/khugepaged.c | 4 +- mm/memfd.c | 2 - mm/migrate.c | 2 +- mm/shmem.c | 2 +- mm/swap_state.c | 4 +- 8 files changed, 94 insertions(+), 82 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 9ec3544baee2..fe0b29bf2df7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -333,19 +333,6 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } -static inline struct page *find_subpage(struct page *page, pgoff_t offset) -{ - unsigned long mask; - - if (PageHuge(page)) - return page; - - VM_BUG_ON_PAGE(PageTail(page), page); - - mask = (1UL << compound_order(page)) - 1; - return page + (offset & mask); -} - struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, diff --git a/mm/filemap.c b/mm/filemap.c index df2006ba0cfa..6dd9a2274c80 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -281,11 +281,11 @@ EXPORT_SYMBOL(delete_from_page_cache); * @pvec: pagevec with pages to delete * * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index - * and is optimised for it to be dense. + * from the mapping. The function expects @pvec to be sorted by page index. * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec. + * @pvec and takes care to delete all corresponding tail pages from the + * mapping as well. * * The function expects the i_pages lock to be held. */ @@ -294,44 +294,40 @@ static void page_cache_delete_batch(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; - int i = 0; + int i = 0, tail_pages = 0; struct page *page; mapping_set_update(&xas, mapping); xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec)) + if (i >= pagevec_count(pvec) && !tail_pages) break; - - /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(page)) continue; - /* - * A page got inserted in our range? Skip it. We have our - * pages locked so they are protected from being removed. - * If we see a page whose index is higher than ours, it - * means our page has been removed, which shouldn't be - * possible because we're holding the PageLock. - */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, - page); - continue; - } - - WARN_ON_ONCE(!PageLocked(page)); - - if (page->index == xas.xa_index) + if (!tail_pages) { + /* + * Some page got inserted in our range? Skip it. We + * have our pages locked so they are protected from + * being removed. 
+ */ + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > + pvec->pages[i]->index, page); + continue; + } + WARN_ON_ONCE(!PageLocked(page)); + if (PageTransHuge(page) && !PageHuge(page)) + tail_pages = HPAGE_PMD_NR - 1; page->mapping = NULL; - /* Leave page->index set: truncation lookup relies on it */ - - /* - * Move to the next page in the vector if this is a regular - * page or the index is of the last sub-page of this compound - * page. - */ - if (page->index + (1UL << compound_order(page)) - 1 == - xas.xa_index) + /* + * Leave page->index set: truncation lookup relies + * upon it + */ i++; + } else { + VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages + != pvec->pages[i]->index, page); + tail_pages--; + } xas_store(&xas, NULL); total_pages++; } @@ -1498,7 +1494,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { XA_STATE(xas, &mapping->i_pages, offset); - struct page *page; + struct page *head, *page; rcu_read_lock(); repeat: @@ -1513,19 +1509,25 @@ repeat: if (!page || xa_is_value(page)) goto out; - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) + goto repeat; + + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); goto repeat; + } /* - * Has the page moved or been split? + * Has the page moved? * This is part of the lockless pagecache protocol. See * include/linux/pagemap.h for details. */ if (unlikely(page != xas_reload(&xas))) { - put_page(page); + put_page(head); goto repeat; } - page = find_subpage(page, offset); out: rcu_read_unlock(); @@ -1707,6 +1709,7 @@ unsigned find_get_entries(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, page, ULONG_MAX) { + struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1717,13 +1720,17 @@ unsigned find_get_entries(struct address_space *mapping, if (xa_is_value(page)) goto export; - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto retry; - /* Has the page moved or been split? */ + /* The page was split under us? */ + if (compound_head(page) != head) + goto put_page; + + /* Has the page moved? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - page = find_subpage(page, xas.xa_index); export: indices[ret] = xas.xa_index; @@ -1732,7 +1739,7 @@ export: break; continue; put_page: - put_page(page); + put_page(head); retry: xas_reset(&xas); } @@ -1774,27 +1781,33 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); xas_for_each(&xas, page, end) { + struct page *head; if (xas_retry(&xas, page)) continue; /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto retry; - /* Has the page moved or been split? */ + /* The page was split under us? */ + if (compound_head(page) != head) + goto put_page; + + /* Has the page moved? 
*/ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); + pages[ret] = page; if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } continue; put_page: - put_page(page); + put_page(head); retry: xas_reset(&xas); } @@ -1839,6 +1852,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { + struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1848,19 +1862,24 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, if (xa_is_value(page)) break; - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto retry; - /* Has the page moved or been split? */ + /* The page was split under us? */ + if (compound_head(page) != head) + goto put_page; + + /* Has the page moved? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); + pages[ret] = page; if (++ret == nr_pages) break; continue; put_page: - put_page(page); + put_page(head); retry: xas_reset(&xas); } @@ -1896,6 +1915,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, rcu_read_lock(); xas_for_each_marked(&xas, page, end, tag) { + struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1906,21 +1926,26 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto retry; - /* Has the page moved or been split? */ + /* The page was split under us? */ + if (compound_head(page) != head) + goto put_page; + + /* Has the page moved? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); + pages[ret] = page; if (++ret == nr_pages) { *index = xas.xa_index + 1; goto out; } continue; put_page: - put_page(page); + put_page(head); retry: xas_reset(&xas); } @@ -2603,7 +2628,7 @@ void filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *page; + struct page *head, *page; rcu_read_lock(); xas_for_each(&xas, page, end_pgoff) { @@ -2612,19 +2637,24 @@ void filemap_map_pages(struct vm_fault *vmf, if (xa_is_value(page)) goto next; + head = compound_head(page); + /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. */ - if (PageLocked(page)) + if (PageLocked(head)) goto next; - if (!page_cache_get_speculative(page)) + if (!page_cache_get_speculative(head)) goto next; - /* Has the page moved or been split? */ + /* The page was split under us? */ + if (compound_head(page) != head) + goto skip; + + /* Has the page moved? 
*/ if (unlikely(page != xas_reload(&xas))) goto skip; - page = find_subpage(page, xas.xa_index); if (!PageUptodate(page) || PageReadahead(page) || diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bb8b617e34ed..885642c82aaa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2496,9 +2496,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); - } else if (!PageAnon(page)) { - __xa_store(&head->mapping->i_pages, head[i].index, - head + i, 0); } } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 0f7419938008..eaaa21b23215 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1378,7 +1378,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page); + xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); nr_none++; continue; } @@ -1454,7 +1454,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page); + xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); continue; out_unlock: unlock_page(page); diff --git a/mm/memfd.c b/mm/memfd.c index 2647c898990c..650e65a46b9c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -39,7 +39,6 @@ static void memfd_tag_pins(struct xa_state *xas) xas_for_each(xas, page, ULONG_MAX) { if (xa_is_value(page)) continue; - page = find_subpage(page, xas->xa_index); if (page_count(page) - page_mapcount(page) > 1) xas_set_mark(xas, MEMFD_TAG_PINNED); @@ -89,7 +88,6 @@ static int memfd_wait_for_pins(struct address_space *mapping) bool clear = true; if (xa_is_value(page)) continue; - page = find_subpage(page, xas.xa_index); if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags diff --git a/mm/migrate.c b/mm/migrate.c index f2ecc2855a12..e9594bc0d406 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -463,7 +463,7 @@ int migrate_page_move_mapping(struct address_space *mapping, for (i = 1; i < HPAGE_PMD_NR; i++) { xas_next(&xas); - xas_store(&xas, newpage); + xas_store(&xas, newpage + i); } } diff --git a/mm/shmem.c b/mm/shmem.c index 1bb3b8dc8bb2..f4dce9c8670d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -614,7 +614,7 @@ static int shmem_add_to_page_cache(struct page *page, if (xas_error(&xas)) goto unlock; next: - xas_store(&xas, page); + xas_store(&xas, page + i); if (++i < nr) { xas_next(&xas); goto next; diff --git a/mm/swap_state.c b/mm/swap_state.c index eb714165afd2..85245fdec8d9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -132,7 +132,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) for (i = 0; i < nr; i++) { VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); set_page_private(page + i, entry.val + i); - xas_store(&xas, page); + xas_store(&xas, page + i); xas_next(&xas); } address_space->nrpages += nr; @@ -167,7 +167,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry) for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, NULL); - VM_BUG_ON_PAGE(entry != page, entry); + VM_BUG_ON_PAGE(entry != page + i, entry); set_page_private(page + i, 0); xas_next(&xas); } -- cgit v1.2.3