From 484deaedfa03fa0d10e0b1ad3facc910ccc951a8 Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Fri, 26 Jul 2019 15:57:50 -0500 Subject: drm/amdgpu: Extends amdgpu vm definitions (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add RW mtype introduced for arcturus. v2: * Don't add probe-invalidation bit from UAPI * Don't add unused AMDGPU_MTYPE_ definitions Signed-off-by: Oak Zeng Signed-off-by: Felix Kuehling Reviewed-by: Christian König Reviewed-by: Shaoyun Liu Signed-off-by: Alex Deucher --- include/uapi/drm/amdgpu_drm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index c99b4f2482c6..640e5356fd65 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -500,6 +500,8 @@ struct drm_amdgpu_gem_op { #define AMDGPU_VM_MTYPE_CC (3 << 5) /* Use UC MTYPE instead of default MTYPE */ #define AMDGPU_VM_MTYPE_UC (4 << 5) +/* Use RW MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_RW (5 << 5) struct drm_amdgpu_gem_va { /** GEM object handle */ -- cgit v1.2.3 From 050091ab6e832f7df0b700f1b9b596613643ada4 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 3 Sep 2019 17:55:30 -0400 Subject: drm/amdkfd: Query kfd device info by CHIP id instead of pci device id This optimizes out the pci device id usage in KFD and makes the code more maintainable. Signed-off-by: Yong Zhao Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 175 +++++------------------------ include/drm/amd_asic_type.h | 2 + 5 files changed, 37 insertions(+), 151 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 07eb29885372..9f061fe55d09 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -64,6 +64,7 @@ void amdgpu_amdkfd_fini(void) void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) { const struct kfd2kgd_calls *kfd2kgd; + bool vf = amdgpu_sriov_vf(adev); switch (adev->asic_type) { #ifdef CONFIG_DRM_AMDGPU_CIK @@ -101,7 +102,7 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) } adev->kfd.dev = kgd2kfd_probe((struct kgd_dev *)adev, - adev->pdev, kfd2kgd); + adev->pdev, kfd2kgd, adev->asic_type, vf); if (adev->kfd.dev) amdgpu_amdkfd_total_mem_size += adev->gmc.real_vram_size; @@ -735,7 +736,8 @@ struct kfd2kgd_calls *amdgpu_amdkfd_gfx_10_0_get_functions(void) } struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev, - const struct kfd2kgd_calls *f2g) + const struct kfd2kgd_calls *f2g, + unsigned int asic_type, bool vf) { return NULL; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index c003d9275837..e39c106ac634 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -248,7 +248,8 @@ void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo); int kgd2kfd_init(void); void kgd2kfd_exit(void); struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev, - const struct kfd2kgd_calls *f2g); + const struct kfd2kgd_calls *f2g, + unsigned int asic_type, bool vf); bool kgd2kfd_device_init(struct kfd_dev *kfd, const struct kgd2kfd_shared_resources *gpu_resources); void kgd2kfd_device_exit(struct kfd_dev *kfd); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e89aa2dc5c11..3ad034aa0e3c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -80,7 +80,7 @@ MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); #define AMDGPU_RESUME_MS 2000 -static const char *amdgpu_asic_name[] = { +const char *amdgpu_asic_name[] = { "TAHITI", "PITCAIRN", "VERDE", diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0dc1084b5e82..267eb2e01bec 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -369,133 +369,23 @@ static const struct kfd_device_info navi10_device_info = { .num_sdma_queues_per_engine = 8, }; -struct kfd_deviceid { - unsigned short did; - const struct kfd_device_info *device_info; -}; - -static const struct kfd_deviceid supported_devices[] = { -#ifdef KFD_SUPPORT_IOMMU_V2 - { 0x1304, &kaveri_device_info }, /* Kaveri */ - { 0x1305, &kaveri_device_info }, /* Kaveri */ - { 0x1306, &kaveri_device_info }, /* Kaveri */ - { 0x1307, &kaveri_device_info }, /* Kaveri */ - { 0x1309, &kaveri_device_info }, /* Kaveri */ - { 0x130A, &kaveri_device_info }, /* Kaveri */ - { 0x130B, &kaveri_device_info }, /* Kaveri */ - { 0x130C, &kaveri_device_info }, /* Kaveri */ - { 0x130D, &kaveri_device_info }, /* Kaveri */ - { 0x130E, &kaveri_device_info }, /* Kaveri */ - { 0x130F, &kaveri_device_info }, /* Kaveri */ - { 0x1310, &kaveri_device_info }, /* Kaveri */ - { 0x1311, &kaveri_device_info }, /* Kaveri */ - { 0x1312, &kaveri_device_info }, /* Kaveri */ - { 0x1313, &kaveri_device_info }, /* Kaveri */ - { 0x1315, &kaveri_device_info }, /* Kaveri */ - { 0x1316, &kaveri_device_info }, /* Kaveri */ - { 0x1317, &kaveri_device_info }, /* Kaveri */ - { 0x1318, &kaveri_device_info }, /* Kaveri */ - { 0x131B, &kaveri_device_info }, /* Kaveri */ - { 0x131C, &kaveri_device_info }, /* Kaveri */ - { 0x131D, &kaveri_device_info }, /* Kaveri */ - { 0x9870, &carrizo_device_info }, /* Carrizo */ - { 0x9874, &carrizo_device_info }, /* Carrizo */ - { 0x9875, &carrizo_device_info }, /* Carrizo */ - { 0x9876, &carrizo_device_info }, /* Carrizo */ - { 0x9877, &carrizo_device_info }, /* Carrizo */ - { 0x15DD, &raven_device_info }, /* Raven */ - { 0x15D8, &raven_device_info }, /* Raven */ -#endif - { 0x67A0, &hawaii_device_info }, /* Hawaii */ - { 0x67A1, &hawaii_device_info }, /* Hawaii */ - { 0x67A2, &hawaii_device_info }, /* Hawaii */ - { 0x67A8, &hawaii_device_info }, /* Hawaii */ - { 0x67A9, &hawaii_device_info }, /* Hawaii */ - { 0x67AA, &hawaii_device_info }, /* Hawaii */ - { 0x67B0, &hawaii_device_info }, /* Hawaii */ - { 0x67B1, &hawaii_device_info }, /* Hawaii */ - { 0x67B8, &hawaii_device_info }, /* Hawaii */ - { 0x67B9, &hawaii_device_info }, /* Hawaii */ - { 0x67BA, &hawaii_device_info }, /* Hawaii */ - { 0x67BE, &hawaii_device_info }, /* Hawaii */ - { 0x6920, &tonga_device_info }, /* Tonga */ - { 0x6921, &tonga_device_info }, /* Tonga */ - { 0x6928, &tonga_device_info }, /* Tonga */ - { 0x6929, &tonga_device_info }, /* Tonga */ - { 0x692B, &tonga_device_info }, /* Tonga */ - { 0x6938, &tonga_device_info }, /* Tonga */ - { 0x6939, &tonga_device_info }, /* Tonga */ - { 0x7300, &fiji_device_info }, /* Fiji */ - { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ - { 0x67C0, &polaris10_device_info }, /* Polaris10 */ - { 0x67C1, &polaris10_device_info }, /* Polaris10 */ - { 0x67C2, &polaris10_device_info }, /* Polaris10 */ - { 0x67C4, &polaris10_device_info }, /* Polaris10 */ - { 0x67C7, &polaris10_device_info }, /* Polaris10 */ - { 0x67C8, &polaris10_device_info }, /* Polaris10 */ - { 0x67C9, &polaris10_device_info }, /* Polaris10 */ - { 0x67CA, &polaris10_device_info }, /* Polaris10 */ - { 0x67CC, &polaris10_device_info }, /* Polaris10 */ - { 0x67CF, &polaris10_device_info }, /* Polaris10 */ - { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ - { 0x67DF, &polaris10_device_info }, /* Polaris10 */ - { 0x6FDF, &polaris10_device_info }, /* Polaris10 */ - { 0x67E0, &polaris11_device_info }, /* Polaris11 */ - { 0x67E1, &polaris11_device_info }, /* Polaris11 */ - { 0x67E3, &polaris11_device_info }, /* Polaris11 */ - { 0x67E7, &polaris11_device_info }, /* Polaris11 */ - { 0x67E8, &polaris11_device_info }, /* Polaris11 */ - { 0x67E9, &polaris11_device_info }, /* Polaris11 */ - { 0x67EB, &polaris11_device_info }, /* Polaris11 */ - { 0x67EF, &polaris11_device_info }, /* Polaris11 */ - { 0x67FF, &polaris11_device_info }, /* Polaris11 */ - { 0x6980, &polaris12_device_info }, /* Polaris12 */ - { 0x6981, &polaris12_device_info }, /* Polaris12 */ - { 0x6985, &polaris12_device_info }, /* Polaris12 */ - { 0x6986, &polaris12_device_info }, /* Polaris12 */ - { 0x6987, &polaris12_device_info }, /* Polaris12 */ - { 0x6995, &polaris12_device_info }, /* Polaris12 */ - { 0x6997, &polaris12_device_info }, /* Polaris12 */ - { 0x699F, &polaris12_device_info }, /* Polaris12 */ - { 0x694C, &vegam_device_info }, /* VegaM */ - { 0x694E, &vegam_device_info }, /* VegaM */ - { 0x694F, &vegam_device_info }, /* VegaM */ - { 0x6860, &vega10_device_info }, /* Vega10 */ - { 0x6861, &vega10_device_info }, /* Vega10 */ - { 0x6862, &vega10_device_info }, /* Vega10 */ - { 0x6863, &vega10_device_info }, /* Vega10 */ - { 0x6864, &vega10_device_info }, /* Vega10 */ - { 0x6867, &vega10_device_info }, /* Vega10 */ - { 0x6868, &vega10_device_info }, /* Vega10 */ - { 0x6869, &vega10_device_info }, /* Vega10 */ - { 0x686A, &vega10_device_info }, /* Vega10 */ - { 0x686B, &vega10_device_info }, /* Vega10 */ - { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ - { 0x686D, &vega10_device_info }, /* Vega10 */ - { 0x686E, &vega10_device_info }, /* Vega10 */ - { 0x686F, &vega10_device_info }, /* Vega10 */ - { 0x687F, &vega10_device_info }, /* Vega10 */ - { 0x69A0, &vega12_device_info }, /* Vega12 */ - { 0x69A1, &vega12_device_info }, /* Vega12 */ - { 0x69A2, &vega12_device_info }, /* Vega12 */ - { 0x69A3, &vega12_device_info }, /* Vega12 */ - { 0x69AF, &vega12_device_info }, /* Vega12 */ - { 0x66a0, &vega20_device_info }, /* Vega20 */ - { 0x66a1, &vega20_device_info }, /* Vega20 */ - { 0x66a2, &vega20_device_info }, /* Vega20 */ - { 0x66a3, &vega20_device_info }, /* Vega20 */ - { 0x66a4, &vega20_device_info }, /* Vega20 */ - { 0x66a7, &vega20_device_info }, /* Vega20 */ - { 0x66af, &vega20_device_info }, /* Vega20 */ - { 0x738C, &arcturus_device_info }, /* Arcturus */ - { 0x7388, &arcturus_device_info }, /* Arcturus */ - { 0x738E, &arcturus_device_info }, /* Arcturus */ - { 0x7390, &arcturus_device_info }, /* Arcturus vf */ - { 0x7310, &navi10_device_info }, /* Navi10 */ - { 0x7312, &navi10_device_info }, /* Navi10 */ - { 0x7318, &navi10_device_info }, /* Navi10 */ - { 0x731a, &navi10_device_info }, /* Navi10 */ - { 0x731f, &navi10_device_info }, /* Navi10 */ +/* For each entry, [0] is regular and [1] is virtualisation device. */ +static const struct kfd_device_info *kfd_supported_devices[][2] = { + [CHIP_KAVERI] = {&kaveri_device_info, NULL}, + [CHIP_HAWAII] = {&hawaii_device_info, NULL}, + [CHIP_TONGA] = {&tonga_device_info, NULL}, + [CHIP_FIJI] = {&fiji_device_info, &fiji_vf_device_info}, + [CHIP_CARRIZO] = {&carrizo_device_info, NULL}, + [CHIP_POLARIS10] = {&polaris10_device_info, &polaris10_vf_device_info}, + [CHIP_POLARIS11] = {&polaris11_device_info, NULL}, + [CHIP_POLARIS12] = {&polaris12_device_info, NULL}, + [CHIP_VEGAM] = {&vegam_device_info, NULL}, + [CHIP_VEGA10] = {&vega10_device_info, &vega10_vf_device_info}, + [CHIP_VEGA12] = {&vega12_device_info, NULL}, + [CHIP_VEGA20] = {&vega20_device_info, NULL}, + [CHIP_RAVEN] = {&raven_device_info, NULL}, + [CHIP_ARCTURUS] = {&arcturus_device_info, &arcturus_device_info}, + [CHIP_NAVI10] = {&navi10_device_info, NULL}, }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, @@ -504,32 +394,23 @@ static void kfd_gtt_sa_fini(struct kfd_dev *kfd); static int kfd_resume(struct kfd_dev *kfd); -static const struct kfd_device_info *lookup_device_info(unsigned short did) +struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + struct pci_dev *pdev, const struct kfd2kgd_calls *f2g, + unsigned int asic_type, bool vf) { - size_t i; + struct kfd_dev *kfd; + const struct kfd_device_info *device_info; - for (i = 0; i < ARRAY_SIZE(supported_devices); i++) { - if (supported_devices[i].did == did) { - WARN_ON(!supported_devices[i].device_info); - return supported_devices[i].device_info; - } + if (asic_type >= sizeof(kfd_supported_devices) / (sizeof(void *) * 2)) { + dev_err(kfd_device, "asic_type %d out of range\n", asic_type); + return NULL; /* asic_type out of range */ } - dev_warn(kfd_device, "DID %04x is missing in supported_devices\n", - did); - - return NULL; -} - -struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, - struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) -{ - struct kfd_dev *kfd; - const struct kfd_device_info *device_info = - lookup_device_info(pdev->device); + device_info = kfd_supported_devices[asic_type][vf]; if (!device_info) { - dev_err(kfd_device, "kgd2kfd_probe failed\n"); + dev_err(kfd_device, "%s %s not supported in kfd\n", + amdgpu_asic_name[asic_type], vf ? "VF" : ""); return NULL; } diff --git a/include/drm/amd_asic_type.h b/include/drm/amd_asic_type.h index 296aab724677..8d79940cb602 100644 --- a/include/drm/amd_asic_type.h +++ b/include/drm/amd_asic_type.h @@ -57,4 +57,6 @@ enum amd_asic_type { CHIP_LAST, }; +extern const char *amdgpu_asic_name[]; + #endif /*__AMD_ASIC_TYPE_H__ */ -- cgit v1.2.3 From 4e66d7d2156243bf6866fc83cda0c4b115627e6d Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Fri, 30 Aug 2019 18:09:10 -0400 Subject: drm/amdgpu: Add a kernel parameter for specifying the asic type As more and more new asics start to reuse the old device IDs before launch, there is a need to quickly override the existing asic type corresponding to the reused device ID through a kernel parameter. With this, engineers no longer need to rely on local hack patches, facilitating cooperation across teams. Signed-off-by: Yong Zhao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 ++++++ include/drm/amd_asic_type.h | 54 +++++++++++++++--------------- 4 files changed, 45 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index dd9e18f155f0..7723f4ed0130 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -168,6 +168,7 @@ extern int amdgpu_mcbp; extern int amdgpu_discovery; extern int amdgpu_mes; extern int amdgpu_noretry; +extern int amdgpu_force_asic_type; #ifdef CONFIG_DRM_AMDGPU_SI extern int amdgpu_si_support; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3268291babf8..b09237b6531c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2585,7 +2585,12 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->ddev = ddev; adev->pdev = pdev; adev->flags = flags; - adev->asic_type = flags & AMD_ASIC_MASK; + + if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) + adev->asic_type = amdgpu_force_asic_type; + else + adev->asic_type = flags & AMD_ASIC_MASK; + adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; if (amdgpu_emu_mode == 1) adev->usec_timeout *= 2; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 62fe102ed39e..0f0d6fb6de32 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -146,6 +146,7 @@ int amdgpu_mcbp = 0; int amdgpu_discovery = -1; int amdgpu_mes = 0; int amdgpu_noretry = 1; +int amdgpu_force_asic_type = -1; struct amdgpu_mgpu_info mgpu_info = { .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), @@ -616,6 +617,16 @@ MODULE_PARM_DESC(noretry, "Disable retry faults (0 = retry enabled, 1 = retry disabled (default))"); module_param_named(noretry, amdgpu_noretry, int, 0644); +/** + * DOC: force_asic_type (int) + * A non negative value used to specify the asic type for all supported GPUs. + */ +MODULE_PARM_DESC(force_asic_type, + "A non negative value used to specify the asic type for all supported GPUs"); +module_param_named(force_asic_type, amdgpu_force_asic_type, int, 0444); + + + #ifdef CONFIG_HSA_AMD /** * DOC: sched_policy (int) diff --git a/include/drm/amd_asic_type.h b/include/drm/amd_asic_type.h index 8d79940cb602..b1230e33d506 100644 --- a/include/drm/amd_asic_type.h +++ b/include/drm/amd_asic_type.h @@ -27,33 +27,33 @@ */ enum amd_asic_type { CHIP_TAHITI = 0, - CHIP_PITCAIRN, - CHIP_VERDE, - CHIP_OLAND, - CHIP_HAINAN, - CHIP_BONAIRE, - CHIP_KAVERI, - CHIP_KABINI, - CHIP_HAWAII, - CHIP_MULLINS, - CHIP_TOPAZ, - CHIP_TONGA, - CHIP_FIJI, - CHIP_CARRIZO, - CHIP_STONEY, - CHIP_POLARIS10, - CHIP_POLARIS11, - CHIP_POLARIS12, - CHIP_VEGAM, - CHIP_VEGA10, - CHIP_VEGA12, - CHIP_VEGA20, - CHIP_RAVEN, - CHIP_ARCTURUS, - CHIP_RENOIR, - CHIP_NAVI10, - CHIP_NAVI14, - CHIP_NAVI12, + CHIP_PITCAIRN, /* 1 */ + CHIP_VERDE, /* 2 */ + CHIP_OLAND, /* 3 */ + CHIP_HAINAN, /* 4 */ + CHIP_BONAIRE, /* 5 */ + CHIP_KAVERI, /* 6 */ + CHIP_KABINI, /* 7 */ + CHIP_HAWAII, /* 8 */ + CHIP_MULLINS, /* 9 */ + CHIP_TOPAZ, /* 10 */ + CHIP_TONGA, /* 11 */ + CHIP_FIJI, /* 12 */ + CHIP_CARRIZO, /* 13 */ + CHIP_STONEY, /* 14 */ + CHIP_POLARIS10, /* 15 */ + CHIP_POLARIS11, /* 16 */ + CHIP_POLARIS12, /* 17 */ + CHIP_VEGAM, /* 18 */ + CHIP_VEGA10, /* 19 */ + CHIP_VEGA12, /* 20 */ + CHIP_VEGA20, /* 21 */ + CHIP_RAVEN, /* 22 */ + CHIP_ARCTURUS, /* 23 */ + CHIP_RENOIR, /* 24 */ + CHIP_NAVI10, /* 25 */ + CHIP_NAVI14, /* 26 */ + CHIP_NAVI12, /* 27 */ CHIP_LAST, }; -- cgit v1.2.3 From 82af5b6609673f1cc7d3453d0be3d292dfffc813 Mon Sep 17 00:00:00 2001 From: Nayna Jain Date: Tue, 1 Oct 2019 19:37:18 -0400 Subject: sysfs: Fixes __BIN_ATTR_WO() macro This patch fixes the size and write parameter for the macro __BIN_ATTR_WO(). Fixes: 7f905761e15a8 ("sysfs: add BIN_ATTR_WO() macro") Signed-off-by: Nayna Jain Link: https://lore.kernel.org/r/1569973038-2710-1-git-send-email-nayna@linux.ibm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 5420817ed317..fa7ee503fb76 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -196,9 +196,9 @@ struct bin_attribute { .size = _size, \ } -#define __BIN_ATTR_WO(_name) { \ +#define __BIN_ATTR_WO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ - .store = _name##_store, \ + .write = _name##_write, \ .size = _size, \ } -- cgit v1.2.3 From 30945d31e5761436d9eba6b8cff468a5f7c9c266 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Tue, 24 Sep 2019 14:49:43 +0200 Subject: hwmon: Fix HWMON_P_MIN_ALARM mask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both HWMON_P_MIN_ALARM and HWMON_P_MAX_ALARM were using BIT(hwmon_power_max_alarm). Fixes: aa7f29b07c870 ("hwmon: Add support for power min, lcrit, min_alarm and lcrit_alarm") CC: Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20190924124945.491326-2-nuno.sa@analog.com Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 04c36b7a61dd..72579168189d 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -235,7 +235,7 @@ enum hwmon_power_attributes { #define HWMON_P_LABEL BIT(hwmon_power_label) #define HWMON_P_ALARM BIT(hwmon_power_alarm) #define HWMON_P_CAP_ALARM BIT(hwmon_power_cap_alarm) -#define HWMON_P_MIN_ALARM BIT(hwmon_power_max_alarm) +#define HWMON_P_MIN_ALARM BIT(hwmon_power_min_alarm) #define HWMON_P_MAX_ALARM BIT(hwmon_power_max_alarm) #define HWMON_P_LCRIT_ALARM BIT(hwmon_power_lcrit_alarm) #define HWMON_P_CRIT_ALARM BIT(hwmon_power_crit_alarm) -- cgit v1.2.3 From cf21e76a6005016474623a375ae2e90364a02168 Mon Sep 17 00:00:00 2001 From: Marek Olšák Date: Tue, 24 Sep 2019 17:53:25 -0400 Subject: drm/amdgpu: return tcc_disabled_mask to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UMDs need this for correct programming of harvested chips. Signed-off-by: Marek Olšák Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 ++ drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 12 ++++++++++++ include/uapi/drm/amdgpu_drm.h | 2 ++ 5 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 9f2f54e5e06d..2ef72309e5ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -82,9 +82,10 @@ * - 3.32.0 - Add syncobj timeline support to AMDGPU_CS. * - 3.33.0 - Fixes for GDS ENOMEM failures in AMDGPU_CS. * - 3.34.0 - Non-DC can flip correctly between buffers with different pitches + * - 3.35.0 - Add drm_amdgpu_info_device::tcc_disabled_mask */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 34 +#define KMS_DRIVER_MINOR 35 #define KMS_DRIVER_PATCHLEVEL 0 #define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index c5179a807a04..35eff9e6ce16 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -165,6 +165,7 @@ struct amdgpu_gfx_config { uint32_t num_sc_per_sh; uint32_t num_packer_per_sc; uint32_t pa_sc_tile_steering_override; + uint64_t tcc_disabled_mask; }; struct amdgpu_cu_info { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 7f4e000a400f..6746967f5e80 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -782,6 +782,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file dev_info.pa_sc_tile_steering_override = adev->gfx.config.pa_sc_tile_steering_override; + dev_info.tcc_disabled_mask = adev->gfx.config.tcc_disabled_mask; + return copy_to_user(out, &dev_info, min((size_t)size, sizeof(dev_info))) ? -EFAULT : 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index cfc0952f6175..0fcea94f0a4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -1691,6 +1691,17 @@ static void gfx_v10_0_tcp_harvest(struct amdgpu_device *adev) } } +static void gfx_v10_0_get_tcc_info(struct amdgpu_device *adev) +{ + /* TCCs are global (not instanced). */ + uint32_t tcc_disable = RREG32_SOC15(GC, 0, mmCGTS_TCC_DISABLE) | + RREG32_SOC15(GC, 0, mmCGTS_USER_TCC_DISABLE); + + adev->gfx.config.tcc_disabled_mask = + REG_GET_FIELD(tcc_disable, CGTS_TCC_DISABLE, TCC_DISABLE) | + (REG_GET_FIELD(tcc_disable, CGTS_TCC_DISABLE, HI_TCC_DISABLE) << 16); +} + static void gfx_v10_0_constants_init(struct amdgpu_device *adev) { u32 tmp; @@ -1702,6 +1713,7 @@ static void gfx_v10_0_constants_init(struct amdgpu_device *adev) gfx_v10_0_setup_rb(adev); gfx_v10_0_get_cu_info(adev, &adev->gfx.cu_info); + gfx_v10_0_get_tcc_info(adev); adev->gfx.config.pa_sc_tile_steering_override = gfx_v10_0_init_pa_sc_tile_steering_override(adev); diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 640e5356fd65..bbdad866e3fe 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -1005,6 +1005,8 @@ struct drm_amdgpu_info_device { __u64 high_va_max; /* gfx10 pa_sc_tile_steering_override */ __u32 pa_sc_tile_steering_override; + /* disabled TCCs */ + __u64 tcc_disabled_mask; }; struct drm_amdgpu_info_hw_ip { -- cgit v1.2.3 From 8f8fed0cdbbd6cdbf28d9ebe662f45765d2f7d39 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 1 Oct 2019 16:48:39 +0900 Subject: scsi: core: save/restore command resid for error handling When a non-passthrough command is terminated with CHECK CONDITION, request sense is executed by hijacking the command descriptor. Since scsi_eh_prep_cmnd() and scsi_eh_restore_cmnd() do not save/restore the original command resid, the value returned on failure of the original command is lost and replaced with the value set by the execution of the request sense command. This value may in many instances be unaligned to the device sector size, causing sd_done() to print a warning message about the incorrect unaligned resid before the command is retried. Fix this problem by saving the original command residual in struct scsi_eh_save using scsi_eh_prep_cmnd() and restoring it in scsi_eh_restore_cmnd(). In addition, to make sure that the request sense command is executed with a correctly initialized command structure, also reset the residual to 0 in scsi_eh_prep_cmnd() after saving the original command value in struct scsi_eh_save. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20191001074839.1994-1-damien.lemoal@wdc.com Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_error.c | 3 +++ include/scsi/scsi_eh.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 1c470e31ae81..ae2fa170f6ad 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -967,6 +967,7 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses, ses->data_direction = scmd->sc_data_direction; ses->sdb = scmd->sdb; ses->result = scmd->result; + ses->resid_len = scmd->req.resid_len; ses->underflow = scmd->underflow; ses->prot_op = scmd->prot_op; ses->eh_eflags = scmd->eh_eflags; @@ -977,6 +978,7 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses, memset(scmd->cmnd, 0, BLK_MAX_CDB); memset(&scmd->sdb, 0, sizeof(scmd->sdb)); scmd->result = 0; + scmd->req.resid_len = 0; if (sense_bytes) { scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE, @@ -1029,6 +1031,7 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses) scmd->sc_data_direction = ses->data_direction; scmd->sdb = ses->sdb; scmd->result = ses->result; + scmd->req.resid_len = ses->resid_len; scmd->underflow = ses->underflow; scmd->prot_op = ses->prot_op; scmd->eh_eflags = ses->eh_eflags; diff --git a/include/scsi/scsi_eh.h b/include/scsi/scsi_eh.h index 3810b340551c..6bd5ed695a5e 100644 --- a/include/scsi/scsi_eh.h +++ b/include/scsi/scsi_eh.h @@ -32,6 +32,7 @@ extern int scsi_ioctl_reset(struct scsi_device *, int __user *); struct scsi_eh_save { /* saved state */ int result; + unsigned int resid_len; int eh_eflags; enum dma_data_direction data_direction; unsigned underflow; -- cgit v1.2.3 From 47934ef7f1883209120781b59d78eaf8b83e2fb7 Mon Sep 17 00:00:00 2001 From: Stefan-gabriel Mirea Date: Fri, 4 Oct 2019 13:51:13 +0000 Subject: tty: serial: Fix PORT_LINFLEXUART definition The port type macros should have different values for different devices. Currently, PORT_LINFLEXUART conflicts with PORT_SUNIX. Fixes: 09864c1cdf5c ("tty: serial: Add linflexuart driver for S32V234") Signed-off-by: Stefan-Gabriel Mirea Link: https://lore.kernel.org/r/20191004135058.18007-1-stefan-gabriel.mirea@nxp.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 0f4f87a6fd54..e7fe550b6038 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -291,6 +291,6 @@ #define PORT_SUNIX 121 /* Freescale Linflex UART */ -#define PORT_LINFLEXUART 121 +#define PORT_LINFLEXUART 122 #endif /* _UAPILINUX_SERIAL_CORE_H */ -- cgit v1.2.3 From 130bce3afbbbbe585cba8604f2124c28e8d86fb0 Mon Sep 17 00:00:00 2001 From: Rander Wang Date: Mon, 30 Sep 2019 09:29:45 -0500 Subject: ALSA: hdac: clear link output stream mapping Fix potential DMA hang upon starting playback on devices in HDA mode on Intel platforms (Gemini Lake/Whiskey Lake/Comet Lake/Ice Lake). It doesn't affect platforms before Gemini Lake or any Intel device in non-HDA mode. The reset value for the LOSDIV register is all output streams valid. Clear this register to invalidate non-existent streams when the bus is powered up. Signed-off-by: Rander Wang Signed-off-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20190930142945.7805-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Takashi Iwai --- include/sound/hda_register.h | 3 +++ sound/hda/ext/hdac_ext_controller.c | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/sound/hda_register.h b/include/sound/hda_register.h index 0fd39295b426..057d2a2d0bd0 100644 --- a/include/sound/hda_register.h +++ b/include/sound/hda_register.h @@ -264,6 +264,9 @@ enum { SDI0, SDI1, SDI2, SDI3, SDO0, SDO1, SDO2, SDO3 }; #define AZX_REG_ML_LOUTPAY 0x20 #define AZX_REG_ML_LINPAY 0x30 +/* bit0 is reserved, with BIT(1) mapping to stream1 */ +#define ML_LOSIDV_STREAM_MASK 0xFFFE + #define ML_LCTL_SCF_MASK 0xF #define AZX_MLCTL_SPA (0x1 << 16) #define AZX_MLCTL_CPA (0x1 << 23) diff --git a/sound/hda/ext/hdac_ext_controller.c b/sound/hda/ext/hdac_ext_controller.c index 211ca85acd8c..cfab60d88c92 100644 --- a/sound/hda/ext/hdac_ext_controller.c +++ b/sound/hda/ext/hdac_ext_controller.c @@ -270,6 +270,11 @@ int snd_hdac_ext_bus_link_get(struct hdac_bus *bus, ret = snd_hdac_ext_bus_link_power_up(link); + /* + * clear the register to invalidate all the output streams + */ + snd_hdac_updatew(link->ml_addr, AZX_REG_ML_LOSIDV, + ML_LOSIDV_STREAM_MASK, 0); /* * wait for 521usec for codec to report status * HDA spec section 4.3 - Codec Discovery -- cgit v1.2.3 From 55f6c98e3674ce16038a1949c3f9ca5a9a99f289 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:29 +0100 Subject: rxrpc: Fix trace-after-put looking at the put peer record rxrpc_put_peer() calls trace_rxrpc_peer() after it has done the decrement of the refcount - which looks at the debug_id in the peer record. But unless the refcount was reduced to zero, we no longer have the right to look in the record and, indeed, it may be deleted by some other thread. Fix this by getting the debug_id out before decrementing the refcount and then passing that into the tracepoint. This can cause the following symptoms: BUG: KASAN: use-after-free in __rxrpc_put_peer net/rxrpc/peer_object.c:411 [inline] BUG: KASAN: use-after-free in rxrpc_put_peer+0x685/0x6a0 net/rxrpc/peer_object.c:435 Read of size 8 at addr ffff888097ec0058 by task syz-executor823/24216 Fixes: 1159d4b496f5 ("rxrpc: Add a tracepoint to track rxrpc_peer refcounting") Reported-by: syzbot+b9be979c55f2bea8ed30@syzkaller.appspotmail.com Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 6 +++--- net/rxrpc/peer_object.c | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index edc5c887a44c..45556fe771c3 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -519,10 +519,10 @@ TRACE_EVENT(rxrpc_local, ); TRACE_EVENT(rxrpc_peer, - TP_PROTO(struct rxrpc_peer *peer, enum rxrpc_peer_trace op, + TP_PROTO(unsigned int peer_debug_id, enum rxrpc_peer_trace op, int usage, const void *where), - TP_ARGS(peer, op, usage, where), + TP_ARGS(peer_debug_id, op, usage, where), TP_STRUCT__entry( __field(unsigned int, peer ) @@ -532,7 +532,7 @@ TRACE_EVENT(rxrpc_peer, ), TP_fast_assign( - __entry->peer = peer->debug_id; + __entry->peer = peer_debug_id; __entry->op = op; __entry->usage = usage; __entry->where = where; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 9c3ac96f71cb..b700b7ecaa3d 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -382,7 +382,7 @@ struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) int n; n = atomic_inc_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_got, n, here); + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n, here); return peer; } @@ -396,7 +396,7 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) if (peer) { int n = atomic_fetch_add_unless(&peer->usage, 1, 0); if (n > 0) - trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here); + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n + 1, here); else peer = NULL; } @@ -426,11 +426,13 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) void rxrpc_put_peer(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); + unsigned int debug_id; int n; if (peer) { + debug_id = peer->debug_id; n = atomic_dec_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_put, n, here); + trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here); if (n == 0) __rxrpc_put_peer(peer); } @@ -443,10 +445,11 @@ void rxrpc_put_peer(struct rxrpc_peer *peer) void rxrpc_put_peer_locked(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); + unsigned int debug_id = peer->debug_id; int n; n = atomic_dec_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_put, n, here); + trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here); if (n == 0) { hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); -- cgit v1.2.3 From 4c1295dccc0afe0905b6ca4c62ade7f2406f2cfb Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:29 +0100 Subject: rxrpc: Fix trace-after-put looking at the put connection record rxrpc_put_*conn() calls trace_rxrpc_conn() after they have done the decrement of the refcount - which looks at the debug_id in the connection record. But unless the refcount was reduced to zero, we no longer have the right to look in the record and, indeed, it may be deleted by some other thread. Fix this by getting the debug_id out before decrementing the refcount and then passing that into the tracepoint. Fixes: 363deeab6d0f ("rxrpc: Add connection tracepoint and client conn state tracepoint") Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 6 +++--- net/rxrpc/call_accept.c | 2 +- net/rxrpc/conn_client.c | 6 ++++-- net/rxrpc/conn_object.c | 13 +++++++------ net/rxrpc/conn_service.c | 2 +- 5 files changed, 16 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 45556fe771c3..38a97e890cb6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -546,10 +546,10 @@ TRACE_EVENT(rxrpc_peer, ); TRACE_EVENT(rxrpc_conn, - TP_PROTO(struct rxrpc_connection *conn, enum rxrpc_conn_trace op, + TP_PROTO(unsigned int conn_debug_id, enum rxrpc_conn_trace op, int usage, const void *where), - TP_ARGS(conn, op, usage, where), + TP_ARGS(conn_debug_id, op, usage, where), TP_STRUCT__entry( __field(unsigned int, conn ) @@ -559,7 +559,7 @@ TRACE_EVENT(rxrpc_conn, ), TP_fast_assign( - __entry->conn = conn->debug_id; + __entry->conn = conn_debug_id; __entry->op = op; __entry->usage = usage; __entry->where = where; diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 00c095d74145..c1b1b7dd2924 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -84,7 +84,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, smp_store_release(&b->conn_backlog_head, (head + 1) & (size - 1)); - trace_rxrpc_conn(conn, rxrpc_conn_new_service, + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service, atomic_read(&conn->usage), here); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 3f1da1b49f69..700eb77173fc 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -212,7 +212,8 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) rxrpc_get_local(conn->params.local); key_get(conn->params.key); - trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage), + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client, + atomic_read(&conn->usage), __builtin_return_address(0)); trace_rxrpc_client(conn, -1, rxrpc_client_alloc); _leave(" = %p", conn); @@ -985,11 +986,12 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) void rxrpc_put_client_conn(struct rxrpc_connection *conn) { const void *here = __builtin_return_address(0); + unsigned int debug_id = conn->debug_id; int n; do { n = atomic_dec_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here); + trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, n, here); if (n > 0) return; ASSERTCMP(n, >=, 0); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index ed05b6922132..38d718e90dc6 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -269,7 +269,7 @@ bool rxrpc_queue_conn(struct rxrpc_connection *conn) if (n == 0) return false; if (rxrpc_queue_work(&conn->processor)) - trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_queued, n + 1, here); else rxrpc_put_connection(conn); return true; @@ -284,7 +284,7 @@ void rxrpc_see_connection(struct rxrpc_connection *conn) if (conn) { int n = atomic_read(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_seen, n, here); } } @@ -296,7 +296,7 @@ void rxrpc_get_connection(struct rxrpc_connection *conn) const void *here = __builtin_return_address(0); int n = atomic_inc_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_got, n, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n, here); } /* @@ -310,7 +310,7 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn) if (conn) { int n = atomic_fetch_add_unless(&conn->usage, 1, 0); if (n > 0) - trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n + 1, here); else conn = NULL; } @@ -333,10 +333,11 @@ static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, void rxrpc_put_service_conn(struct rxrpc_connection *conn) { const void *here = __builtin_return_address(0); + unsigned int debug_id = conn->debug_id; int n; n = atomic_dec_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); + trace_rxrpc_conn(debug_id, rxrpc_conn_put_service, n, here); ASSERTCMP(n, >=, 0); if (n == 1) rxrpc_set_service_reap_timer(conn->params.local->rxnet, @@ -420,7 +421,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) */ if (atomic_cmpxchg(&conn->usage, 1, 0) != 1) continue; - trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, NULL); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_reap_service, 0, NULL); if (rxrpc_conn_is_client(conn)) BUG(); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index b30e13f6d95f..123d6ceab15c 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -134,7 +134,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); - trace_rxrpc_conn(conn, rxrpc_conn_new_service, + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service, atomic_read(&conn->usage), __builtin_return_address(0)); } -- cgit v1.2.3 From 48c9e0ec7cbbb7370448f859ccc8e3b7eb69e755 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:29 +0100 Subject: rxrpc: Fix trace-after-put looking at the put call record rxrpc_put_call() calls trace_rxrpc_call() after it has done the decrement of the refcount - which looks at the debug_id in the call record. But unless the refcount was reduced to zero, we no longer have the right to look in the record and, indeed, it may be deleted by some other thread. Fix this by getting the debug_id out before decrementing the refcount and then passing that into the tracepoint. Fixes: e34d4234b0b7 ("rxrpc: Trace rxrpc_call usage") Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 6 +++--- net/rxrpc/call_accept.c | 2 +- net/rxrpc/call_object.c | 28 +++++++++++++++++----------- 3 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 38a97e890cb6..191fe447f990 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -606,10 +606,10 @@ TRACE_EVENT(rxrpc_client, ); TRACE_EVENT(rxrpc_call, - TP_PROTO(struct rxrpc_call *call, enum rxrpc_call_trace op, + TP_PROTO(unsigned int call_debug_id, enum rxrpc_call_trace op, int usage, const void *where, const void *aux), - TP_ARGS(call, op, usage, where, aux), + TP_ARGS(call_debug_id, op, usage, where, aux), TP_STRUCT__entry( __field(unsigned int, call ) @@ -620,7 +620,7 @@ TRACE_EVENT(rxrpc_call, ), TP_fast_assign( - __entry->call = call->debug_id; + __entry->call = call_debug_id; __entry->op = op; __entry->usage = usage; __entry->where = where; diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index c1b1b7dd2924..1f778102ed8d 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -97,7 +97,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->flags |= (1 << RXRPC_CALL_IS_SERVICE); call->state = RXRPC_CALL_SERVER_PREALLOC; - trace_rxrpc_call(call, rxrpc_call_new_service, + trace_rxrpc_call(call->debug_id, rxrpc_call_new_service, atomic_read(&call->usage), here, (const void *)user_call_ID); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 32d8dc677142..6dace078971a 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -240,7 +240,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (p->intr) __set_bit(RXRPC_CALL_IS_INTR, &call->flags); call->tx_total_len = p->tx_total_len; - trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), + trace_rxrpc_call(call->debug_id, rxrpc_call_new_client, + atomic_read(&call->usage), here, (const void *)p->user_call_ID); /* We need to protect a partially set up call against the user as we @@ -290,8 +291,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (ret < 0) goto error; - trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), - here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_connected, + atomic_read(&call->usage), here, NULL); rxrpc_start_call_timer(call); @@ -313,8 +314,8 @@ error_dup_user_ID: error: __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_CALL_DEAD, ret); - trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), - here, ERR_PTR(ret)); + trace_rxrpc_call(call->debug_id, rxrpc_call_error, + atomic_read(&call->usage), here, ERR_PTR(ret)); rxrpc_release_call(rx, call); mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); @@ -376,7 +377,8 @@ bool rxrpc_queue_call(struct rxrpc_call *call) if (n == 0) return false; if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_queued, n + 1, + here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -391,7 +393,8 @@ bool __rxrpc_queue_call(struct rxrpc_call *call) int n = atomic_read(&call->usage); ASSERTCMP(n, >=, 1); if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued_ref, n, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_queued_ref, n, + here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -406,7 +409,8 @@ void rxrpc_see_call(struct rxrpc_call *call) if (call) { int n = atomic_read(&call->usage); - trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_seen, n, + here, NULL); } } @@ -418,7 +422,7 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); - trace_rxrpc_call(call, op, n, here, NULL); + trace_rxrpc_call(call->debug_id, op, n, here, NULL); } /* @@ -445,7 +449,8 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); - trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage), + trace_rxrpc_call(call->debug_id, rxrpc_call_release, + atomic_read(&call->usage), here, (const void *)call->flags); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); @@ -534,12 +539,13 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { struct rxrpc_net *rxnet = call->rxnet; const void *here = __builtin_return_address(0); + unsigned int debug_id = call->debug_id; int n; ASSERT(call != NULL); n = atomic_dec_return(&call->usage); - trace_rxrpc_call(call, op, n, here, NULL); + trace_rxrpc_call(debug_id, op, n, here, NULL); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); -- cgit v1.2.3 From f1da567f1dc1b55d178b8f2d0cfe8353858aac19 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 5 Oct 2019 23:04:47 +0200 Subject: driver core: platform: Add platform_get_irq_byname_optional() Some drivers (e.g dwc3) first try to get an IRQ byname and then fall back to the one at index 0. In this case we do not want the error(s) printed by platform_get_irq_byname(). This commit adds a new platform_get_irq_byname_optional(), which does not print errors, for this. While at it also improve the kdoc text for platform_get_irq_byname() a bit. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=205037 Signed-off-by: Hans de Goede Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20191005210449.3926-2-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 46 ++++++++++++++++++++++++++++++++++------- include/linux/platform_device.h | 2 ++ 2 files changed, 41 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index b6c6c7d97d5b..b230beb6ccb4 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -241,12 +241,8 @@ struct resource *platform_get_resource_byname(struct platform_device *dev, } EXPORT_SYMBOL_GPL(platform_get_resource_byname); -/** - * platform_get_irq_byname - get an IRQ for a device by name - * @dev: platform device - * @name: IRQ name - */ -int platform_get_irq_byname(struct platform_device *dev, const char *name) +static int __platform_get_irq_byname(struct platform_device *dev, + const char *name) { struct resource *r; @@ -262,11 +258,47 @@ int platform_get_irq_byname(struct platform_device *dev, const char *name) if (r) return r->start; - dev_err(&dev->dev, "IRQ %s not found\n", name); return -ENXIO; } + +/** + * platform_get_irq_byname - get an IRQ for a device by name + * @dev: platform device + * @name: IRQ name + * + * Get an IRQ like platform_get_irq(), but then by name rather then by index. + * + * Return: IRQ number on success, negative error number on failure. + */ +int platform_get_irq_byname(struct platform_device *dev, const char *name) +{ + int ret; + + ret = __platform_get_irq_byname(dev, name); + if (ret < 0 && ret != -EPROBE_DEFER) + dev_err(&dev->dev, "IRQ %s not found\n", name); + + return ret; +} EXPORT_SYMBOL_GPL(platform_get_irq_byname); +/** + * platform_get_irq_byname_optional - get an optional IRQ for a device by name + * @dev: platform device + * @name: IRQ name + * + * Get an optional IRQ by name like platform_get_irq_byname(). Except that it + * does not print an error message if an IRQ can not be obtained. + * + * Return: IRQ number on success, negative error number on failure. + */ +int platform_get_irq_byname_optional(struct platform_device *dev, + const char *name) +{ + return __platform_get_irq_byname(dev, name); +} +EXPORT_SYMBOL_GPL(platform_get_irq_byname_optional); + /** * platform_add_devices - add a numbers of platform devices * @devs: array of platform devices to add diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 1b5cec067533..f2688404d1cd 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -64,6 +64,8 @@ extern struct resource *platform_get_resource_byname(struct platform_device *, unsigned int, const char *); extern int platform_get_irq_byname(struct platform_device *, const char *); +extern int platform_get_irq_byname_optional(struct platform_device *dev, + const char *name); extern int platform_add_devices(struct platform_device **, int); struct platform_device_info { -- cgit v1.2.3 From 047d50aee341d940350897c85799e56ae57c3849 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Wed, 2 Oct 2019 18:59:00 +0200 Subject: efi/tpm: Don't access event->count when it isn't mapped Some machines generate a lot of event log entries. When we're iterating over them, the code removes the old mapping and adds a new one, so once we cross the page boundary we're unmapping the page with the count on it. Hilarity ensues. This patch keeps the info from the header in local variables so we don't need to access that page again or keep track of if it's mapped. Tested-by: Lyude Paul Signed-off-by: Peter Jones Signed-off-by: Jarkko Sakkinen Signed-off-by: Ard Biesheuvel Reviewed-by: Jarkko Sakkinen Acked-by: Matthew Garrett Acked-by: Ard Biesheuvel Cc: Ben Dooks Cc: Dave Young Cc: Jerry Snitselaar Cc: Linus Torvalds Cc: Lukas Wunner Cc: Octavian Purdila Cc: Peter Zijlstra Cc: Scott Talbert Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux-integrity@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 44038bc514a2 ("tpm: Abstract crypto agile event size calculations") Link: https://lkml.kernel.org/r/20191002165904.8819-4-ard.biesheuvel@linaro.org [ Minor edits. ] Signed-off-by: Ingo Molnar --- include/linux/tpm_eventlog.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index 63238c84dc0b..b50cc3adca18 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -170,6 +170,7 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, u16 halg; int i; int j; + u32 count, event_type; marker = event; marker_start = marker; @@ -190,16 +191,22 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, } event = (struct tcg_pcr_event2_head *)mapping; + /* + * The loop below will unmap these fields if the log is larger than + * one page, so save them here for reference: + */ + count = READ_ONCE(event->count); + event_type = READ_ONCE(event->event_type); efispecid = (struct tcg_efi_specid_event_head *)event_header->event; /* Check if event is malformed. */ - if (event->count > efispecid->num_algs) { + if (count > efispecid->num_algs) { size = 0; goto out; } - for (i = 0; i < event->count; i++) { + for (i = 0; i < count; i++) { halg_size = sizeof(event->digests[i].alg_id); /* Map the digest's algorithm identifier */ @@ -256,8 +263,9 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, + event_field->event_size; size = marker - marker_start; - if ((event->event_type == 0) && (event_field->event_size == 0)) + if (event_type == 0 && event_field->event_size == 0) size = 0; + out: if (do_mapping) TPM_MEMUNMAP(mapping, mapping_size); -- cgit v1.2.3 From e658c82be5561412c5e83b5e74e9da4830593f3e Mon Sep 17 00:00:00 2001 From: Jerry Snitselaar Date: Wed, 2 Oct 2019 18:59:02 +0200 Subject: efi/tpm: Only set 'efi_tpm_final_log_size' after successful event log parsing If __calc_tpm2_event_size() fails to parse an event it will return 0, resulting tpm2_calc_event_log_size() returning -1. Currently there is no check of this return value, and 'efi_tpm_final_log_size' can end up being set to this negative value resulting in a crash like this one: BUG: unable to handle page fault for address: ffffbc8fc00866ad #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page RIP: 0010:memcpy_erms+0x6/0x10 Call Trace: tpm_read_log_efi() tpm_bios_log_setup() tpm_chip_register() tpm_tis_core_init.cold.9+0x28c/0x466 tpm_tis_plat_probe() platform_drv_probe() ... Also __calc_tpm2_event_size() returns a size of 0 when it fails to parse an event, so update function documentation to reflect this. The root cause of the issue that caused the failure of event parsing in this case is resolved by Peter Jone's patchset dealing with large event logs where crossing over a page boundary causes the page with the event count to be unmapped. Signed-off-by: Jerry Snitselaar Signed-off-by: Ard Biesheuvel Cc: Ben Dooks Cc: Dave Young Cc: Jarkko Sakkinen Cc: Linus Torvalds Cc: Lukas Wunner Cc: Lyude Paul Cc: Matthew Garrett Cc: Octavian Purdila Cc: Peter Jones Cc: Peter Zijlstra Cc: Scott Talbert Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux-integrity@vger.kernel.org Cc: stable@vger.kernel.org Fixes: c46f3405692de ("tpm: Reserve the TPM final events table") Link: https://lkml.kernel.org/r/20191002165904.8819-6-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- drivers/firmware/efi/tpm.c | 9 ++++++++- include/linux/tpm_eventlog.h | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/firmware/efi/tpm.c b/drivers/firmware/efi/tpm.c index b9ae5c6f9b9c..703469c1ab8e 100644 --- a/drivers/firmware/efi/tpm.c +++ b/drivers/firmware/efi/tpm.c @@ -85,11 +85,18 @@ int __init efi_tpm_eventlog_init(void) final_tbl->nr_events, log_tbl->log); } + + if (tbl_size < 0) { + pr_err(FW_BUG "Failed to parse event in TPM Final Events Log\n"); + goto out_calc; + } + memblock_reserve((unsigned long)final_tbl, tbl_size + sizeof(*final_tbl)); - early_memunmap(final_tbl, sizeof(*final_tbl)); efi_tpm_final_log_size = tbl_size; +out_calc: + early_memunmap(final_tbl, sizeof(*final_tbl)); out: early_memunmap(log_tbl, sizeof(*log_tbl)); return ret; diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index b50cc3adca18..131ea1bad458 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -152,7 +152,7 @@ struct tcg_algorithm_info { * total. Once we've done this we know the offset of the data length field, * and can calculate the total size of the event. * - * Return: size of the event on success, <0 on failure + * Return: size of the event on success, 0 on failure */ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, -- cgit v1.2.3 From bf70b0503abd19194dba25fe383d143d0229dc6a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Oct 2019 16:58:21 +0900 Subject: module: swap the order of symbol.namespace Currently, EXPORT_SYMBOL_NS(_GPL) constructs the kernel symbol as follows: __ksymtab_SYMBOL.NAMESPACE The sym_extract_namespace() in modpost allocates memory for the part SYMBOL.NAMESPACE when '.' is contained. One problem is that the pointer returned by strdup() is lost because the symbol name will be copied to malloc'ed memory by alloc_symbol(). No one will keep track of the pointer of strdup'ed memory. sym->namespace still points to the NAMESPACE part. So, you can free it with complicated code like this: free(sym->namespace - strlen(sym->name) - 1); It complicates memory free. To fix it elegantly, I swapped the order of the symbol and the namespace as follows: __ksymtab_NAMESPACE.SYMBOL then, simplified sym_extract_namespace() so that it allocates memory only for the NAMESPACE part. I prefer this order because it is intuitive and also matches to major languages. For example, NAMESPACE::NAME in C++, MODULE.NAME in Python. Reviewed-by: Matthias Maennich Signed-off-by: Masahiro Yamada Signed-off-by: Jessica Yu --- include/linux/export.h | 4 ++-- scripts/mod/modpost.c | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/export.h b/include/linux/export.h index 95f55b7f83a0..0695d4e847d9 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -52,7 +52,7 @@ extern struct module __this_module; __ADDRESSABLE(sym) \ asm(" .section \"___ksymtab" sec "+" #sym "\", \"a\" \n" \ " .balign 4 \n" \ - "__ksymtab_" #sym NS_SEPARATOR #ns ": \n" \ + "__ksymtab_" #ns NS_SEPARATOR #sym ": \n" \ " .long " #sym "- . \n" \ " .long __kstrtab_" #sym "- . \n" \ " .long __kstrtab_ns_" #sym "- . \n" \ @@ -76,7 +76,7 @@ struct kernel_symbol { #else #define __KSYMTAB_ENTRY_NS(sym, sec, ns) \ static const struct kernel_symbol __ksymtab_##sym##__##ns \ - asm("__ksymtab_" #sym NS_SEPARATOR #ns) \ + asm("__ksymtab_" #ns NS_SEPARATOR #sym) \ __attribute__((section("___ksymtab" sec "+" #sym), used)) \ __aligned(sizeof(void *)) \ = { (unsigned long)&sym, __kstrtab_##sym, __kstrtab_ns_##sym } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 3961941e8e7a..c4b536ac1327 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -350,18 +350,16 @@ static enum export export_from_sec(struct elf_info *elf, unsigned int sec) static const char *sym_extract_namespace(const char **symname) { - size_t n; - char *dupsymname; + char *namespace = NULL; + char *ns_separator; - n = strcspn(*symname, "."); - if (n < strlen(*symname) - 1) { - dupsymname = NOFAIL(strdup(*symname)); - dupsymname[n] = '\0'; - *symname = dupsymname; - return dupsymname + n + 1; + ns_separator = strchr(*symname, '.'); + if (ns_separator) { + namespace = NOFAIL(strndup(*symname, ns_separator - *symname)); + *symname = ns_separator + 1; } - return NULL; + return namespace; } /** -- cgit v1.2.3 From fa6643cdc5cd726b10d30eec45ff8dca267de735 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Oct 2019 16:58:23 +0900 Subject: module: rename __kstrtab_ns_* to __kstrtabns_* to avoid symbol conflict The module namespace produces __strtab_ns_ symbols to store namespace strings, but it does not guarantee the name uniqueness. This is a potential problem because we have exported symbols starting with "ns_". For example, kernel/capability.c exports the following symbols: EXPORT_SYMBOL(ns_capable); EXPORT_SYMBOL(capable); Assume a situation where those are converted as follows: EXPORT_SYMBOL_NS(ns_capable, some_namespace); EXPORT_SYMBOL_NS(capable, some_namespace); The former expands to "__kstrtab_ns_capable" and "__kstrtab_ns_ns_capable", and the latter to "__kstrtab_capable" and "__kstrtab_ns_capable". Then, we have the duplicated "__kstrtab_ns_capable". To ensure the uniqueness, rename "__kstrtab_ns_*" to "__kstrtabns_*". Reviewed-by: Matthias Maennich Signed-off-by: Masahiro Yamada Signed-off-by: Jessica Yu --- include/linux/export.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/export.h b/include/linux/export.h index 0695d4e847d9..621158ecd2e2 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -55,7 +55,7 @@ extern struct module __this_module; "__ksymtab_" #ns NS_SEPARATOR #sym ": \n" \ " .long " #sym "- . \n" \ " .long __kstrtab_" #sym "- . \n" \ - " .long __kstrtab_ns_" #sym "- . \n" \ + " .long __kstrtabns_" #sym "- . \n" \ " .previous \n") #define __KSYMTAB_ENTRY(sym, sec) \ @@ -79,7 +79,7 @@ struct kernel_symbol { asm("__ksymtab_" #ns NS_SEPARATOR #sym) \ __attribute__((section("___ksymtab" sec "+" #sym), used)) \ __aligned(sizeof(void *)) \ - = { (unsigned long)&sym, __kstrtab_##sym, __kstrtab_ns_##sym } + = { (unsigned long)&sym, __kstrtab_##sym, __kstrtabns_##sym } #define __KSYMTAB_ENTRY(sym, sec) \ static const struct kernel_symbol __ksymtab_##sym \ @@ -112,7 +112,7 @@ struct kernel_symbol { /* For every exported symbol, place a struct in the __ksymtab section */ #define ___EXPORT_SYMBOL_NS(sym, sec, ns) \ ___export_symbol_common(sym, sec); \ - static const char __kstrtab_ns_##sym[] \ + static const char __kstrtabns_##sym[] \ __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ = #ns; \ __KSYMTAB_ENTRY_NS(sym, sec, ns) -- cgit v1.2.3 From c512c69187197fe08026cb5bbe7b9709f4f89b73 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 7 Oct 2019 12:56:48 -0700 Subject: uaccess: implement a proper unsafe_copy_to_user() and switch filldir over to it In commit 9f79b78ef744 ("Convert filldir[64]() from __put_user() to unsafe_put_user()") I made filldir() use unsafe_put_user(), which improves code generation on x86 enormously. But because we didn't have a "unsafe_copy_to_user()", the dirent name copy was also done by hand with unsafe_put_user() in a loop, and it turns out that a lot of other architectures didn't like that, because unlike x86, they have various alignment issues. Most non-x86 architectures trap and fix it up, and some (like xtensa) will just fail unaligned put_user() accesses unconditionally. Which makes that "copy using put_user() in a loop" not work for them at all. I could make that code do explicit alignment etc, but the architectures that don't like unaligned accesses also don't really use the fancy "user_access_begin/end()" model, so they might just use the regular old __copy_to_user() interface. So this commit takes that looping implementation, turns it into the x86 version of "unsafe_copy_to_user()", and makes other architectures implement the unsafe copy version as __copy_to_user() (the same way they do for the other unsafe_xyz() accessor functions). Note that it only does this for the copying _to_ user space, and we still don't have a unsafe version of copy_from_user(). That's partly because we have no current users of it, but also partly because the copy_from_user() case is slightly different and cannot efficiently be implemented in terms of a unsafe_get_user() loop (because gcc can't do asm goto with outputs). It would be trivial to do this using "rep movsb", which would work really nicely on newer x86 cores, but really badly on some older ones. Al Viro is looking at cleaning up all our user copy routines to make this all a non-issue, but for now we have this simple-but-stupid version for x86 that works fine for the dirent name copy case because those names are short strings and we simply don't need anything fancier. Fixes: 9f79b78ef744 ("Convert filldir[64]() from __put_user() to unsafe_put_user()") Reported-by: Guenter Roeck Reported-and-tested-by: Tony Luck Cc: Al Viro Cc: Max Filippov Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 23 ++++++++++++++++++++++ fs/readdir.c | 44 ++---------------------------------------- include/linux/uaccess.h | 6 ++++-- 3 files changed, 29 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35c225ede0e4..61d93f062a36 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -734,5 +734,28 @@ do { \ if (unlikely(__gu_err)) goto err_label; \ } while (0) +/* + * We want the unsafe accessors to always be inlined and use + * the error labels - thus the macro games. + */ +#define unsafe_copy_loop(dst, src, len, type, label) \ + while (len >= sizeof(type)) { \ + unsafe_put_user(*(type *)src,(type __user *)dst,label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +#define unsafe_copy_to_user(_dst,_src,_len,label) \ +do { \ + char __user *__ucu_dst = (_dst); \ + const char *__ucu_src = (_src); \ + size_t __ucu_len = (_len); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ +} while (0) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/fs/readdir.c b/fs/readdir.c index 19bea591c3f1..6e2623e57b2e 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -27,53 +27,13 @@ /* * Note the "unsafe_put_user() semantics: we goto a * label for errors. - * - * Also note how we use a "while()" loop here, even though - * only the biggest size needs to loop. The compiler (well, - * at least gcc) is smart enough to turn the smaller sizes - * into just if-statements, and this way we don't need to - * care whether 'u64' or 'u32' is the biggest size. - */ -#define unsafe_copy_loop(dst, src, len, type, label) \ - while (len >= sizeof(type)) { \ - unsafe_put_user(get_unaligned((type *)src), \ - (type __user *)dst, label); \ - dst += sizeof(type); \ - src += sizeof(type); \ - len -= sizeof(type); \ - } - -/* - * We avoid doing 64-bit copies on 32-bit architectures. They - * might be better, but the component names are mostly small, - * and the 64-bit cases can end up being much more complex and - * put much more register pressure on the code, so it's likely - * not worth the pain of unaligned accesses etc. - * - * So limit the copies to "unsigned long" size. I did verify - * that at least the x86-32 case is ok without this limiting, - * but I worry about random other legacy 32-bit cases that - * might not do as well. - */ -#define unsafe_copy_type(dst, src, len, type, label) do { \ - if (sizeof(type) <= sizeof(unsigned long)) \ - unsafe_copy_loop(dst, src, len, type, label); \ -} while (0) - -/* - * Copy the dirent name to user space, and NUL-terminate - * it. This should not be a function call, since we're doing - * the copy inside a "user_access_begin/end()" section. */ #define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ char __user *dst = (_dst); \ const char *src = (_src); \ size_t len = (_len); \ - unsafe_copy_type(dst, src, len, u64, label); \ - unsafe_copy_type(dst, src, len, u32, label); \ - unsafe_copy_type(dst, src, len, u16, label); \ - unsafe_copy_type(dst, src, len, u8, label); \ - unsafe_put_user(0, dst, label); \ + unsafe_put_user(0, dst+len, label); \ + unsafe_copy_to_user(dst, src, len, label); \ } while (0) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index e47d0522a1f4..d4ee6e942562 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -355,8 +355,10 @@ extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) -#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) +#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) +#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) +#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) +#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif -- cgit v1.2.3 From dc0c18ed229cdcca283dd78fefa38273ec37a42c Mon Sep 17 00:00:00 2001 From: Aaron Komisar Date: Wed, 2 Oct 2019 13:59:07 +0000 Subject: mac80211: fix scan when operating on DFS channels in ETSI domains In non-ETSI regulatory domains scan is blocked when operating channel is a DFS channel. For ETSI, however, once DFS channel is marked as available after the CAC, this channel will remain available (for some time) even after leaving this channel. Therefore a scan can be done without any impact on the availability of the DFS channel as no new CAC is required after the scan. Enable scan in mac80211 in these cases. Signed-off-by: Aaron Komisar Link: https://lore.kernel.org/r/1570024728-17284-1-git-send-email-aaron.komisar@tandemg.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 ++++++++ net/mac80211/scan.c | 30 ++++++++++++++++++++++++++++-- net/wireless/reg.c | 1 + net/wireless/reg.h | 8 -------- 4 files changed, 37 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ff45c3e1abff..4ab2c49423dc 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5549,6 +5549,14 @@ const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, */ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); +/** + * regulatory_pre_cac_allowed - check if pre-CAC allowed in the current regdom + * @wiphy: wiphy for which pre-CAC capability is checked. + * + * Pre-CAC is allowed only in some regdomains (notable ETSI). + */ +bool regulatory_pre_cac_allowed(struct wiphy *wiphy); + /** * DOC: Internal regulatory db functions * diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index adf94ba1ed77..4d31d9688dc2 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -520,10 +520,33 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local, return 0; } +static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *sdata_iter; + + if (!ieee80211_is_radar_required(local)) + return true; + + if (!regulatory_pre_cac_allowed(local->hw.wiphy)) + return false; + + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata_iter, &local->interfaces, list) { + if (sdata_iter->wdev.cac_started) { + mutex_unlock(&local->iflist_mtx); + return false; + } + } + mutex_unlock(&local->iflist_mtx); + + return true; +} + static bool ieee80211_can_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { - if (ieee80211_is_radar_required(local)) + if (!__ieee80211_can_leave_ch(sdata)) return false; if (!list_empty(&local->roc_list)) @@ -630,7 +653,10 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, lockdep_assert_held(&local->mtx); - if (local->scan_req || ieee80211_is_radar_required(local)) + if (local->scan_req) + return -EBUSY; + + if (!__ieee80211_can_leave_ch(sdata)) return -EBUSY; if (!ieee80211_can_scan(local, sdata)) { diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 420c4207ab59..446c76d44e65 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3883,6 +3883,7 @@ bool regulatory_pre_cac_allowed(struct wiphy *wiphy) return pre_cac_allowed; } +EXPORT_SYMBOL(regulatory_pre_cac_allowed); void regulatory_propagate_dfs_state(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 504133d76de4..dc8f689bd469 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -155,14 +155,6 @@ bool regulatory_indoor_allowed(void); */ #define REG_PRE_CAC_EXPIRY_GRACE_MS 2000 -/** - * regulatory_pre_cac_allowed - if pre-CAC allowed in the current dfs domain - * @wiphy: wiphy for which pre-CAC capability is checked. - - * Pre-CAC is allowed only in ETSI domain. - */ -bool regulatory_pre_cac_allowed(struct wiphy *wiphy); - /** * regulatory_propagate_dfs_state - Propagate DFS channel state to other wiphys * @wiphy - wiphy on which radar is detected and the event will be propagated -- cgit v1.2.3 From 4b7d4d453fc46769394e31d1cb19088f49897b59 Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Thu, 16 May 2019 11:37:16 -0400 Subject: device_cgroup: Export devcgroup_check_permission For AMD compute (amdkfd) driver. All AMD compute devices are exported via single device node /dev/kfd. As a result devices cannot be controlled individually using device cgroup. AMD compute devices will rely on its graphics counterpart that exposes /dev/dri/renderN node for each device. For each task (based on its cgroup), KFD driver will check if /dev/dri/renderN node is accessible before exposing it. Signed-off-by: Harish Kasiviswanathan Acked-by: Tejun Heo Acked-by: Felix Kuehling Reviewed-by: Roman Gushchin Signed-off-by: Alex Deucher --- include/linux/device_cgroup.h | 19 ++++--------------- security/device_cgroup.c | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 8557efe096dc..fa35b52e0002 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -12,26 +12,15 @@ #define DEVCG_DEV_ALL 4 /* this represents all devices */ #ifdef CONFIG_CGROUP_DEVICE -extern int __devcgroup_check_permission(short type, u32 major, u32 minor, - short access); +int devcgroup_check_permission(short type, u32 major, u32 minor, + short access); #else -static inline int __devcgroup_check_permission(short type, u32 major, u32 minor, - short access) +static inline int devcgroup_check_permission(short type, u32 major, u32 minor, + short access) { return 0; } #endif #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) -static inline int devcgroup_check_permission(short type, u32 major, u32 minor, - short access) -{ - int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); - - if (rc) - return -EPERM; - - return __devcgroup_check_permission(type, major, minor, access); -} - static inline int devcgroup_inode_permission(struct inode *inode, int mask) { short type, access = 0; diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 725674f3276d..7d0f8f7431ff 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -801,8 +801,8 @@ struct cgroup_subsys devices_cgrp_subsys = { * * returns 0 on success, -EPERM case the operation is not permitted */ -int __devcgroup_check_permission(short type, u32 major, u32 minor, - short access) +static int __devcgroup_check_permission(short type, u32 major, u32 minor, + short access) { struct dev_cgroup *dev_cgroup; bool rc; @@ -824,3 +824,14 @@ int __devcgroup_check_permission(short type, u32 major, u32 minor, return 0; } + +int devcgroup_check_permission(short type, u32 major, u32 minor, short access) +{ + int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); + + if (rc) + return -EPERM; + + return __devcgroup_check_permission(type, major, minor, access); +} +EXPORT_SYMBOL(devcgroup_check_permission); -- cgit v1.2.3 From 08d1d0e6d0a00c6e687201774f3bf61177741e80 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 6 Oct 2019 17:58:15 -0700 Subject: memcg: only record foreign writebacks with dirty pages when memcg is not disabled In kdump kernel, memcg usually is disabled with 'cgroup_disable=memory' for saving memory. Now kdump kernel will always panic when dump vmcore to local disk: BUG: kernel NULL pointer dereference, address: 0000000000000ab8 Oops: 0000 [#1] SMP NOPTI CPU: 0 PID: 598 Comm: makedumpfile Not tainted 5.3.0+ #26 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 10/02/2018 RIP: 0010:mem_cgroup_track_foreign_dirty_slowpath+0x38/0x140 Call Trace: __set_page_dirty+0x52/0xc0 iomap_set_page_dirty+0x50/0x90 iomap_write_end+0x6e/0x270 iomap_write_actor+0xce/0x170 iomap_apply+0xba/0x11e iomap_file_buffered_write+0x62/0x90 xfs_file_buffered_aio_write+0xca/0x320 [xfs] new_sync_write+0x12d/0x1d0 vfs_write+0xa5/0x1a0 ksys_write+0x59/0xd0 do_syscall_64+0x59/0x1e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 And this will corrupt the 1st kernel too with 'cgroup_disable=memory'. Via the trace and with debugging, it is pointing to commit 97b27821b485 ("writeback, memcg: Implement foreign dirty flushing") which introduced this regression. Disabling memcg causes the null pointer dereference at uninitialized data in function mem_cgroup_track_foreign_dirty_slowpath(). Fix it by returning directly if memcg is disabled, but not trying to record the foreign writebacks with dirty pages. Link: http://lkml.kernel.org/r/20190924141928.GD31919@MiWiFi-R3L-srv Fixes: 97b27821b485 ("writeback, memcg: Implement foreign dirty flushing") Signed-off-by: Baoquan He Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jan Kara Cc: Tejun Heo Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9b60863429cc..98380779f6d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1264,6 +1264,9 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, static inline void mem_cgroup_track_foreign_dirty(struct page *page, struct bdi_writeback *wb) { + if (mem_cgroup_disabled()) + return; + if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } -- cgit v1.2.3 From 9783aa9917f8ae24759e67bf882f1aba32fe4ea1 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:32 -0700 Subject: mm, memcg: proportional memory.{low,min} reclaim cgroup v2 introduces two memory protection thresholds: memory.low (best-effort) and memory.min (hard protection). While they generally do what they say on the tin, there is a limitation in their implementation that makes them difficult to use effectively: that cliff behaviour often manifests when they become eligible for reclaim. This patch implements more intuitive and usable behaviour, where we gradually mount more reclaim pressure as cgroups further and further exceed their protection thresholds. This cliff edge behaviour happens because we only choose whether or not to reclaim based on whether the memcg is within its protection limits (see the use of mem_cgroup_protected in shrink_node), but we don't vary our reclaim behaviour based on this information. Imagine the following timeline, with the numbers the lruvec size in this zone: 1. memory.low=1000000, memory.current=999999. 0 pages may be scanned. 2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned. 3. memory.low=1000000, memory.current=1000001. 1000001* pages may be scanned. (?!) * Of course, we won't usually scan all available pages in the zone even without this patch because of scan control priority, over-reclaim protection, etc. However, as shown by the tests at the end, these techniques don't sufficiently throttle such an extreme change in input, so cliff-like behaviour isn't really averted by their existence alone. Here's an example of how this plays out in practice. At Facebook, we are trying to protect various workloads from "system" software, like configuration management tools, metric collectors, etc (see this[0] case study). In order to find a suitable memory.low value, we start by determining the expected memory range within which the workload will be comfortable operating. This isn't an exact science -- memory usage deemed "comfortable" will vary over time due to user behaviour, differences in composition of work, etc, etc. As such we need to ballpark memory.low, but doing this is currently problematic: 1. If we end up setting it too low for the workload, it won't have *any* effect (see discussion above). The group will receive the full weight of reclaim and won't have any priority while competing with the less important system software, as if we had no memory.low configured at all. 2. Because of this behaviour, we end up erring on the side of setting it too high, such that the comfort range is reliably covered. However, protected memory is completely unavailable to the rest of the system, so we might cause undue memory and IO pressure there when we *know* we have some elasticity in the workload. 3. Even if we get the value totally right, smack in the middle of the comfort zone, we get extreme jumps between no pressure and full pressure that cause unpredictable pressure spikes in the workload due to the current binary reclaim behaviour. With this patch, we can set it to our ballpark estimation without too much worry. Any undesirable behaviour, such as too much or too little reclaim pressure on the workload or system will be proportional to how far our estimation is off. This means we can set memory.low much more conservatively and thus waste less resources *without* the risk of the workload falling off a cliff if we overshoot. As a more abstract technical description, this unintuitive behaviour results in having to give high-priority workloads a large protection buffer on top of their expected usage to function reliably, as otherwise we have abrupt periods of dramatically increased memory pressure which hamper performance. Having to set these thresholds so high wastes resources and generally works against the principle of work conservation. In addition, having proportional memory reclaim behaviour has other benefits. Most notably, before this patch it's basically mandatory to set memory.low to a higher than desirable value because otherwise as soon as you exceed memory.low, all protection is lost, and all pages are eligible to scan again. By contrast, having a gradual ramp in reclaim pressure means that you now still get some protection when thresholds are exceeded, which means that one can now be more comfortable setting memory.low to lower values without worrying that all protection will be lost. This is important because workingset size is really hard to know exactly, especially with variable workloads, so at least getting *some* protection if your workingset size grows larger than you expect increases user confidence in setting memory.low without a huge buffer on top being needed. Thanks a lot to Johannes Weiner and Tejun Heo for their advice and assistance in thinking about how to make this work better. In testing these changes, I intended to verify that: 1. Changes in page scanning become gradual and proportional instead of binary. To test this, I experimented stepping further and further down memory.low protection on a workload that floats around 19G workingset when under memory.low protection, watching page scan rates for the workload cgroup: +------------+-----------------+--------------------+--------------+ | memory.low | test (pgscan/s) | control (pgscan/s) | % of control | +------------+-----------------+--------------------+--------------+ | 21G | 0 | 0 | N/A | | 17G | 867 | 3799 | 23% | | 12G | 1203 | 3543 | 34% | | 8G | 2534 | 3979 | 64% | | 4G | 3980 | 4147 | 96% | | 0 | 3799 | 3980 | 95% | +------------+-----------------+--------------------+--------------+ As you can see, the test kernel (with a kernel containing this patch) ramps up page scanning significantly more gradually than the control kernel (without this patch). 2. More gradual ramp up in reclaim aggression doesn't result in premature OOMs. To test this, I wrote a script that slowly increments the number of pages held by stress(1)'s --vm-keep mode until a production system entered severe overall memory contention. This script runs in a highly protected slice taking up the majority of available system memory. Watching vmstat revealed that page scanning continued essentially nominally between test and control, without causing forward reclaim progress to become arrested. [0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project [akpm@linux-foundation.org: reflow block comments to fit in 80 cols] [chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection] Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 20 +++++--- include/linux/memcontrol.h | 20 ++++++++ mm/memcontrol.c | 5 ++ mm/vmscan.c | 82 ++++++++++++++++++++++++++++++--- 4 files changed, 115 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0fa8c0e615c2..5361ebec3361 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -615,8 +615,8 @@ on an IO device and is an example of this type. Protections ----------- -A cgroup is protected to be allocated upto the configured amount of -the resource if the usages of all its ancestors are under their +A cgroup is protected upto the configured amount of the resource +as long as the usages of all its ancestors are under their protected levels. Protections can be hard guarantees or best effort soft boundaries. Protections can also be over-committed in which case only upto the amount available to the parent is protected among @@ -1096,7 +1096,10 @@ PAGE_SIZE multiple when read back. is within its effective min boundary, the cgroup's memory won't be reclaimed under any conditions. If there is no unprotected reclaimable memory available, OOM killer - is invoked. + is invoked. Above the effective min boundary (or + effective low boundary if it is higher), pages are reclaimed + proportionally to the overage, reducing reclaim pressure for + smaller overages. Effective min boundary is limited by memory.min values of all ancestor cgroups. If there is memory.min overcommitment @@ -1118,7 +1121,10 @@ PAGE_SIZE multiple when read back. Best-effort memory protection. If the memory usage of a cgroup is within its effective low boundary, the cgroup's memory won't be reclaimed unless memory can be reclaimed - from unprotected cgroups. + from unprotected cgroups. Above the effective low boundary (or + effective min boundary if it is higher), pages are reclaimed + proportionally to the overage, reducing reclaim pressure for + smaller overages. Effective low boundary is limited by memory.low values of all ancestor cgroups. If there is memory.low overcommitment @@ -2482,8 +2488,10 @@ system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated -reserve. A cgroup enjoys reclaim protection when it's within its low, -which makes delegation of subtrees possible. +reserve. A cgroup enjoys reclaim protection when it's within its +effective low, which makes delegation of subtrees possible. It also +enjoys having reclaim pressure proportional to its overage when +above its effective low. The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 98380779f6d5..fa9ba2edf7e0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,6 +356,14 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return 0; + + return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow)); +} + enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg); @@ -537,6 +545,8 @@ void mem_cgroup_handle_over_high(void); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); +unsigned long mem_cgroup_size(struct mem_cgroup *memcg); + void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); @@ -829,6 +839,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +{ + return 0; +} + static inline enum mem_cgroup_protection mem_cgroup_protected( struct mem_cgroup *root, struct mem_cgroup *memcg) { @@ -968,6 +983,11 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return 0; } +static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return 0; +} + static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c313c49074ca..bdac56009a38 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1567,6 +1567,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return max; } +unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return page_counter_read(&memcg->memory); +} + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { diff --git a/mm/vmscan.c b/mm/vmscan.c index e5d52d6a24af..dfefa1d99d1b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2459,17 +2459,80 @@ out: *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); - unsigned long size; + unsigned long lruvec_size; unsigned long scan; + unsigned long protection; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + protection = mem_cgroup_protection(memcg); + + if (protection > 0) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long baseline = 0; + + /* + * During the reclaim first pass, we only consider + * cgroups in excess of their protection setting, but if + * that doesn't produce free pages, we come back for a + * second pass where we reclaim from all groups. + * + * To maintain fairness in both cases, the first pass + * targets groups in proportion to their overage, and + * the second pass targets groups in proportion to their + * protection utilization. + * + * So on the first pass, a group whose size is 130% of + * its protection will be targeted at 30% of its size. + * On the second pass, a group whose size is at 40% of + * its protection will be + * targeted at 40% of its size. + */ + if (!sc->memcg_low_reclaim) + baseline = lruvec_size; + scan = lruvec_size * cgroup_size / protection - baseline; + + /* + * Don't allow the scan target to exceed the lruvec + * size, which otherwise could happen if we have >200% + * overage in the normal case, or >100% overage when + * sc->memcg_low_reclaim is set. + * + * This is important because other cgroups without + * memory.low have their scan target initially set to + * their lruvec size, so allowing values >100% of the + * lruvec size here could result in penalising cgroups + * with memory.low set even *more* than their peers in + * some cases in the case of large overages. + * + * Also, minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards. + */ + scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size); + } else { + scan = lruvec_size; + } + + scan >>= sc->priority; - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) - scan = min(size, SWAP_CLUSTER_MAX); + scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: @@ -2489,7 +2552,7 @@ out: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) { - size = 0; + lruvec_size = 0; scan = 0; } break; @@ -2498,7 +2561,7 @@ out: BUG(); } - *lru_pages += size; + *lru_pages += lruvec_size; nr[lru] = scan; } } @@ -2742,6 +2805,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); break; case MEMCG_PROT_NONE: + /* + * All protection thresholds breached. We may + * still choose to vary the scan pressure + * applied based on by how much the cgroup in + * question has exceeded its protection + * thresholds (see get_scan_count). + */ break; } -- cgit v1.2.3 From 9de7ca46ad2688bd51e80f7119fefa301ad7f3fa Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:35 -0700 Subject: mm, memcg: make memory.emin the baseline for utilisation determination Roman points out that when when we do the low reclaim pass, we scale the reclaim pressure relative to position between 0 and the maximum protection threshold. However, if the maximum protection is based on memory.elow, and memory.emin is above zero, this means we still may get binary behaviour on second-pass low reclaim. This is because we scale starting at 0, not starting at memory.emin, and since we don't scan at all below emin, we end up with cliff behaviour. This should be a fairly uncommon case since usually we don't go into the second pass, but it makes sense to scale our low reclaim pressure starting at emin. You can test this by catting two large sparse files, one in a cgroup with emin set to some moderate size compared to physical RAM, and another cgroup without any emin. In both cgroups, set an elow larger than 50% of physical RAM. The one with emin will have less page scanning, as reclaim pressure is lower. Rebase on top of and apply the same idea as what was applied to handle cgroup_memory=disable properly for the original proportional patch http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name ("mm, memcg: Handle cgroup_disable=memory when getting memcg protection"). Link: http://lkml.kernel.org/r/20190201051810.GA18895@chrisdown.name Signed-off-by: Chris Down Suggested-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++++++----- mm/vmscan.c | 55 +++++++++++++++++++++++++++------------------- 2 files changed, 46 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fa9ba2edf7e0..1cbad1248e5a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,12 +356,17 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +static inline void mem_cgroup_protection(struct mem_cgroup *memcg, + unsigned long *min, unsigned long *low) { - if (mem_cgroup_disabled()) - return 0; + if (mem_cgroup_disabled()) { + *min = 0; + *low = 0; + return; + } - return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow)); + *min = READ_ONCE(memcg->memory.emin); + *low = READ_ONCE(memcg->memory.elow); } enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, @@ -839,9 +844,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +static inline void mem_cgroup_protection(struct mem_cgroup *memcg, + unsigned long *min, unsigned long *low) { - return 0; + *min = 0; + *low = 0; } static inline enum mem_cgroup_protection mem_cgroup_protected( diff --git a/mm/vmscan.c b/mm/vmscan.c index dfefa1d99d1b..70347d626fb3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2461,12 +2461,12 @@ out: int file = is_file_lru(lru); unsigned long lruvec_size; unsigned long scan; - unsigned long protection; + unsigned long min, low; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - protection = mem_cgroup_protection(memcg); + mem_cgroup_protection(memcg, &min, &low); - if (protection > 0) { + if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min @@ -2481,28 +2481,38 @@ out: * set it too low, which is not ideal. */ unsigned long cgroup_size = mem_cgroup_size(memcg); - unsigned long baseline = 0; /* - * During the reclaim first pass, we only consider - * cgroups in excess of their protection setting, but if - * that doesn't produce free pages, we come back for a - * second pass where we reclaim from all groups. + * If there is any protection in place, we adjust scan + * pressure in proportion to how much a group's current + * usage exceeds that, in percent. * - * To maintain fairness in both cases, the first pass - * targets groups in proportion to their overage, and - * the second pass targets groups in proportion to their - * protection utilization. - * - * So on the first pass, a group whose size is 130% of - * its protection will be targeted at 30% of its size. - * On the second pass, a group whose size is at 40% of - * its protection will be - * targeted at 40% of its size. + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * of their best-effort protection they are using. Usage + * below memory.min is excluded from consideration when + * calculating utilisation, as it isn't ever + * reclaimable, so it might as well not exist for our + * purposes. */ - if (!sc->memcg_low_reclaim) - baseline = lruvec_size; - scan = lruvec_size * cgroup_size / protection - baseline; + if (sc->memcg_low_reclaim && low > min) { + /* + * Reclaim according to utilisation between min + * and low + */ + scan = lruvec_size * (cgroup_size - min) / + (low - min); + } else { + /* Reclaim according to protection overage */ + scan = lruvec_size * cgroup_size / + max(min, low) - lruvec_size; + } /* * Don't allow the scan target to exceed the lruvec @@ -2518,7 +2528,8 @@ out: * some cases in the case of large overages. * * Also, minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards. + * reclaim moving forwards, avoiding decremeting + * sc->priority further than desirable. */ scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size); } else { -- cgit v1.2.3 From 1bc63fb1272be0773e925f78c0fbd06c89701d55 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:38 -0700 Subject: mm, memcg: make scan aggression always exclude protection This patch is an incremental improvement on the existing memory.{low,min} relative reclaim work to base its scan pressure calculations on how much protection is available compared to the current usage, rather than how much the current usage is over some protection threshold. This change doesn't change the experience for the user in the normal case too much. One benefit is that it replaces the (somewhat arbitrary) 100% cutoff with an indefinite slope, which makes it easier to ballpark a memory.low value. As well as this, the old methodology doesn't quite apply generically to machines with varying amounts of physical memory. Let's say we have a top level cgroup, workload.slice, and another top level cgroup, system-management.slice. We want to roughly give 12G to system-management.slice, so on a 32GB machine we set memory.low to 20GB in workload.slice, and on a 64GB machine we set memory.low to 52GB. However, because these are relative amounts to the total machine size, while the amount of memory we want to generally be willing to yield to system.slice is absolute (12G), we end up putting more pressure on system.slice just because we have a larger machine and a larger workload to fill it, which seems fairly unintuitive. With this new behaviour, we don't end up with this unintended side effect. Previously the way that memory.low protection works is that if you are 50% over a certain baseline, you get 50% of your normal scan pressure. This is certainly better than the previous cliff-edge behaviour, but it can be improved even further by always considering memory under the currently enforced protection threshold to be out of bounds. This means that we can set relatively low memory.low thresholds for variable or bursty workloads while still getting a reasonable level of protection, whereas with the previous version we may still trivially hit the 100% clamp. The previous 100% clamp is also somewhat arbitrary, whereas this one is more concretely based on the currently enforced protection threshold, which is likely easier to reason about. There is also a subtle issue with the way that proportional reclaim worked previously -- it promotes having no memory.low, since it makes pressure higher during low reclaim. This happens because we base our scan pressure modulation on how far memory.current is between memory.min and memory.low, but if memory.low is unset, we only use the overage method. In most cromulent configurations, this then means that we end up with *more* pressure than with no memory.low at all when we're in low reclaim, which is not really very usable or expected. With this patch, memory.low and memory.min affect reclaim pressure in a more understandable and composable way. For example, from a user standpoint, "protected" memory now remains untouchable from a reclaim aggression standpoint, and users can also have more confidence that bursty workloads will still receive some amount of guaranteed protection. Link: http://lkml.kernel.org/r/20190322160307.GA3316@chrisdown.name Signed-off-by: Chris Down Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 25 +++++++++---------- mm/vmscan.c | 61 +++++++++++++++------------------------------- 2 files changed, 32 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1cbad1248e5a..ae703ea3ef48 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,17 +356,17 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline void mem_cgroup_protection(struct mem_cgroup *memcg, - unsigned long *min, unsigned long *low) +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, + bool in_low_reclaim) { - if (mem_cgroup_disabled()) { - *min = 0; - *low = 0; - return; - } + if (mem_cgroup_disabled()) + return 0; + + if (in_low_reclaim) + return READ_ONCE(memcg->memory.emin); - *min = READ_ONCE(memcg->memory.emin); - *low = READ_ONCE(memcg->memory.elow); + return max(READ_ONCE(memcg->memory.emin), + READ_ONCE(memcg->memory.elow)); } enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, @@ -844,11 +844,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } -static inline void mem_cgroup_protection(struct mem_cgroup *memcg, - unsigned long *min, unsigned long *low) +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, + bool in_low_reclaim) { - *min = 0; - *low = 0; + return 0; } static inline enum mem_cgroup_protection mem_cgroup_protected( diff --git a/mm/vmscan.c b/mm/vmscan.c index 70347d626fb3..c6659bb758a4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2461,12 +2461,13 @@ out: int file = is_file_lru(lru); unsigned long lruvec_size; unsigned long scan; - unsigned long min, low; + unsigned long protection; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - mem_cgroup_protection(memcg, &min, &low); + protection = mem_cgroup_protection(memcg, + sc->memcg_low_reclaim); - if (min || low) { + if (protection) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min @@ -2479,13 +2480,10 @@ out: * setting extremely liberal protection thresholds. It * also means we simply get no protection at all if we * set it too low, which is not ideal. - */ - unsigned long cgroup_size = mem_cgroup_size(memcg); - - /* - * If there is any protection in place, we adjust scan - * pressure in proportion to how much a group's current - * usage exceeds that, in percent. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. * * There is one special case: in the first reclaim pass, * we skip over all groups that are within their low @@ -2495,43 +2493,24 @@ out: * ideally want to honor how well-behaved groups are in * that case instead of simply punishing them all * equally. As such, we reclaim them based on how much - * of their best-effort protection they are using. Usage - * below memory.min is excluded from consideration when - * calculating utilisation, as it isn't ever - * reclaimable, so it might as well not exist for our - * purposes. + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. */ - if (sc->memcg_low_reclaim && low > min) { - /* - * Reclaim according to utilisation between min - * and low - */ - scan = lruvec_size * (cgroup_size - min) / - (low - min); - } else { - /* Reclaim according to protection overage */ - scan = lruvec_size * cgroup_size / - max(min, low) - lruvec_size; - } + unsigned long cgroup_size = mem_cgroup_size(memcg); + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan = lruvec_size - lruvec_size * protection / + cgroup_size; /* - * Don't allow the scan target to exceed the lruvec - * size, which otherwise could happen if we have >200% - * overage in the normal case, or >100% overage when - * sc->memcg_low_reclaim is set. - * - * This is important because other cgroups without - * memory.low have their scan target initially set to - * their lruvec size, so allowing values >100% of the - * lruvec size here could result in penalising cgroups - * with memory.low set even *more* than their peers in - * some cases in the case of large overages. - * - * Also, minimally target SWAP_CLUSTER_MAX pages to keep + * Minimally target SWAP_CLUSTER_MAX pages to keep * reclaim moving forwards, avoiding decremeting * sc->priority further than desirable. */ - scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size); + scan = max(scan, SWAP_CLUSTER_MAX); } else { scan = lruvec_size; } -- cgit v1.2.3 From 59bb47985c1db229ccff8c5deebecd54fc77d2a9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 6 Oct 2019 17:58:45 -0700 Subject: mm, sl[aou]b: guarantee natural alignment for kmalloc(power-of-two) In most configurations, kmalloc() happens to return naturally aligned (i.e. aligned to the block size itself) blocks for power of two sizes. That means some kmalloc() users might unknowingly rely on that alignment, until stuff breaks when the kernel is built with e.g. CONFIG_SLUB_DEBUG or CONFIG_SLOB, and blocks stop being aligned. Then developers have to devise workaround such as own kmem caches with specified alignment [1], which is not always practical, as recently evidenced in [2]. The topic has been discussed at LSF/MM 2019 [3]. Adding a 'kmalloc_aligned()' variant would not help with code unknowingly relying on the implicit alignment. For slab implementations it would either require creating more kmalloc caches, or allocate a larger size and only give back part of it. That would be wasteful, especially with a generic alignment parameter (in contrast with a fixed alignment to size). Ideally we should provide to mm users what they need without difficult workarounds or own reimplementations, so let's make the kmalloc() alignment to size explicitly guaranteed for power-of-two sizes under all configurations. What this means for the three available allocators? * SLAB object layout happens to be mostly unchanged by the patch. The implicitly provided alignment could be compromised with CONFIG_DEBUG_SLAB due to redzoning, however SLAB disables redzoning for caches with alignment larger than unsigned long long. Practically on at least x86 this includes kmalloc caches as they use cache line alignment, which is larger than that. Still, this patch ensures alignment on all arches and cache sizes. * SLUB layout is also unchanged unless redzoning is enabled through CONFIG_SLUB_DEBUG and boot parameter for the particular kmalloc cache. With this patch, explicit alignment is guaranteed with redzoning as well. This will result in more memory being wasted, but that should be acceptable in a debugging scenario. * SLOB has no implicit alignment so this patch adds it explicitly for kmalloc(). The potential downside is increased fragmentation. While pathological allocation scenarios are certainly possible, in my testing, after booting a x86_64 kernel+userspace with virtme, around 16MB memory was consumed by slab pages both before and after the patch, with difference in the noise. [1] https://lore.kernel.org/linux-btrfs/c3157c8e8e0e7588312b40c853f65c02fe6c957a.1566399731.git.christophe.leroy@c-s.fr/ [2] https://lore.kernel.org/linux-fsdevel/20190225040904.5557-1-ming.lei@redhat.com/ [3] https://lwn.net/Articles/787740/ [akpm@linux-foundation.org: documentation fixlet, per Matthew] Link: http://lkml.kernel.org/r/20190826111627.7505-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Michal Hocko Acked-by: Kirill A. Shutemov Acked-by: Christoph Hellwig Cc: David Sterba Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Ming Lei Cc: Dave Chinner Cc: "Darrick J . Wong" Cc: Christoph Hellwig Cc: James Bottomley Cc: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/memory-allocation.rst | 4 +++ include/linux/slab.h | 4 +++ mm/slab_common.c | 11 +++++++- mm/slob.c | 42 ++++++++++++++++++++-------- 4 files changed, 49 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/Documentation/core-api/memory-allocation.rst b/Documentation/core-api/memory-allocation.rst index 7744aa3bf2e0..939e3dfc86e9 100644 --- a/Documentation/core-api/memory-allocation.rst +++ b/Documentation/core-api/memory-allocation.rst @@ -98,6 +98,10 @@ limited. The actual limit depends on the hardware and the kernel configuration, but it is a good practice to use `kmalloc` for objects smaller than page size. +The address of a chunk allocated with `kmalloc` is aligned to at least +ARCH_KMALLOC_MINALIGN bytes. For sizes which are a power of two, the +alignment is also guaranteed to be at least the respective size. + For large allocations you can use :c:func:`vmalloc` and :c:func:`vzalloc`, or directly request pages from the page allocator. The memory allocated by `vmalloc` and related functions is diff --git a/include/linux/slab.h b/include/linux/slab.h index ab2b98ad76e1..4d2a2fa55ed5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -493,6 +493,10 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * kmalloc is the normal method of allocating memory * for objects smaller than page size in the kernel. * + * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN + * bytes. For @size of power of two bytes, the alignment is also guaranteed + * to be at least to the size. + * * The @flags argument may be one of the GFP flags defined at * include/linux/gfp.h and described at * :ref:`Documentation/core-api/mm-api.rst ` diff --git a/mm/slab_common.c b/mm/slab_common.c index 0a94cf858aa4..c29f03adca91 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1030,10 +1030,19 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, unsigned int useroffset, unsigned int usersize) { int err; + unsigned int align = ARCH_KMALLOC_MINALIGN; s->name = name; s->size = s->object_size = size; - s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + + /* + * For power of two sizes, guarantee natural alignment for kmalloc + * caches, regardless of SL*B debugging options. + */ + if (is_power_of_2(size)) + align = max(align, size); + s->align = calculate_alignment(flags, align, size); + s->useroffset = useroffset; s->usersize = usersize; diff --git a/mm/slob.c b/mm/slob.c index 835088d55645..fa53e9f73893 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -224,6 +224,7 @@ static void slob_free_pages(void *b, int order) * @sp: Page to look in. * @size: Size of the allocation. * @align: Allocation alignment. + * @align_offset: Offset in the allocated block that will be aligned. * @page_removed_from_list: Return parameter. * * Tries to find a chunk of memory at least @size bytes big within @page. @@ -234,7 +235,7 @@ static void slob_free_pages(void *b, int order) * true (set to false otherwise). */ static void *slob_page_alloc(struct page *sp, size_t size, int align, - bool *page_removed_from_list) + int align_offset, bool *page_removed_from_list) { slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); @@ -243,8 +244,17 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align, for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { slobidx_t avail = slob_units(cur); + /* + * 'aligned' will hold the address of the slob block so that the + * address 'aligned'+'align_offset' is aligned according to the + * 'align' parameter. This is for kmalloc() which prepends the + * allocated block with its size, so that the block itself is + * aligned when needed. + */ if (align) { - aligned = (slob_t *)ALIGN((unsigned long)cur, align); + aligned = (slob_t *) + (ALIGN((unsigned long)cur + align_offset, align) + - align_offset); delta = aligned - cur; } if (avail >= units + delta) { /* room enough? */ @@ -288,7 +298,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align, /* * slob_alloc: entry point into the slob allocator. */ -static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) +static void *slob_alloc(size_t size, gfp_t gfp, int align, int node, + int align_offset) { struct page *sp; struct list_head *slob_list; @@ -319,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) if (sp->units < SLOB_UNITS(size)) continue; - b = slob_page_alloc(sp, size, align, &page_removed_from_list); + b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list); if (!b) continue; @@ -356,7 +367,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) INIT_LIST_HEAD(&sp->slab_list); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); - b = slob_page_alloc(sp, size, align, &_unused); + b = slob_page_alloc(sp, size, align, align_offset, &_unused); BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } @@ -458,7 +469,7 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) { unsigned int *m; - int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; gfp &= gfp_allowed_mask; @@ -466,19 +477,28 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) fs_reclaim_acquire(gfp); fs_reclaim_release(gfp); - if (size < PAGE_SIZE - align) { + if (size < PAGE_SIZE - minalign) { + int align = minalign; + + /* + * For power of two sizes, guarantee natural alignment for + * kmalloc()'d objects. + */ + if (is_power_of_2(size)) + align = max(minalign, (int) size); + if (!size) return ZERO_SIZE_PTR; - m = slob_alloc(size + align, gfp, align, node); + m = slob_alloc(size + minalign, gfp, align, node, minalign); if (!m) return NULL; *m = size; - ret = (void *)m + align; + ret = (void *)m + minalign; trace_kmalloc_node(caller, ret, - size, size + align, gfp, node); + size, size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -579,7 +599,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) fs_reclaim_release(flags); if (c->size < PAGE_SIZE) { - b = slob_alloc(c->size, flags, c->align, node); + b = slob_alloc(c->size, flags, c->align, node, 0); trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, SLOB_UNITS(c->size) * SLOB_UNIT, flags, node); -- cgit v1.2.3 From bec500777089b3c96c53681fc0aa6fee59711d4a Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Mon, 7 Oct 2019 18:00:02 -0400 Subject: lib/string: Make memzero_explicit() inline instead of external With the use of the barrier implied by barrier_data(), there is no need for memzero_explicit() to be extern. Making it inline saves the overhead of a function call, and allows the code to be reused in arch/*/purgatory without having to duplicate the implementation. Tested-by: Hans de Goede Signed-off-by: Arvind Sankar Reviewed-by: Hans de Goede Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: H . Peter Anvin Cc: Herbert Xu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephan Mueller Cc: Thomas Gleixner Cc: linux-crypto@vger.kernel.org Cc: linux-s390@vger.kernel.org Fixes: 906a4bb97f5d ("crypto: sha256 - Use get/put_unaligned_be32 to get input, memzero_explicit") Link: https://lkml.kernel.org/r/20191007220000.GA408752@rani.riverdale.lan Signed-off-by: Ingo Molnar --- include/linux/string.h | 21 ++++++++++++++++++++- lib/string.c | 21 --------------------- 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index b2f9df7f0761..b6ccdc2c7f02 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -227,7 +227,26 @@ static inline bool strstarts(const char *str, const char *prefix) } size_t memweight(const void *ptr, size_t bytes); -void memzero_explicit(void *s, size_t count); + +/** + * memzero_explicit - Fill a region of memory (e.g. sensitive + * keying data) with 0s. + * @s: Pointer to the start of the area. + * @count: The size of the area. + * + * Note: usually using memset() is just fine (!), but in cases + * where clearing out _local_ data at the end of a scope is + * necessary, memzero_explicit() should be used instead in + * order to prevent the compiler from optimising away zeroing. + * + * memzero_explicit() doesn't need an arch-specific version as + * it just invokes the one of memset() implicitly. + */ +static inline void memzero_explicit(void *s, size_t count) +{ + memset(s, 0, count); + barrier_data(s); +} /** * kbasename - return the last part of a pathname. diff --git a/lib/string.c b/lib/string.c index cd7a10c19210..08ec58cc673b 100644 --- a/lib/string.c +++ b/lib/string.c @@ -748,27 +748,6 @@ void *memset(void *s, int c, size_t count) EXPORT_SYMBOL(memset); #endif -/** - * memzero_explicit - Fill a region of memory (e.g. sensitive - * keying data) with 0s. - * @s: Pointer to the start of the area. - * @count: The size of the area. - * - * Note: usually using memset() is just fine (!), but in cases - * where clearing out _local_ data at the end of a scope is - * necessary, memzero_explicit() should be used instead in - * order to prevent the compiler from optimising away zeroing. - * - * memzero_explicit() doesn't need an arch-specific version as - * it just invokes the one of memset() implicitly. - */ -void memzero_explicit(void *s, size_t count) -{ - memset(s, 0, count); - barrier_data(s); -} -EXPORT_SYMBOL(memzero_explicit); - #ifndef __HAVE_ARCH_MEMSET16 /** * memset16() - Fill a memory area with a uint16_t -- cgit v1.2.3 From 4e98f871bcffa322850c73d22c66bbd7af2a0374 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Mon, 7 Oct 2019 19:12:24 +0200 Subject: drm: delete drmP.h + drm_os_linux.h There is finally no more users left in the kernel of drmP.h and drm_os_linux.h (drmP.h was the only user left). Delete the header files and delete the corresponding todo entry. When we started this quest there was more than 700 users of drmP.h. And drmP.h was a huge cover-it-all header file. Daniel Vetter is the one that followed the work from start to the end and in between many people have contributed to the removal process - thanks to everyone! Signed-off-by: Sam Ravnborg Reviewed-by: Sean Paul Reviewed-by: Lyude Paul Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Sean Paul Cc: David Airlie Cc: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20191007171224.1581-3-sam@ravnborg.org --- Documentation/gpu/todo.rst | 12 ------ include/drm/drmP.h | 103 --------------------------------------------- include/drm/drm_os_linux.h | 55 ------------------------ 3 files changed, 170 deletions(-) delete mode 100644 include/drm/drmP.h delete mode 100644 include/drm/drm_os_linux.h (limited to 'include') diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst index 8dc147c93c9c..79785559d711 100644 --- a/Documentation/gpu/todo.rst +++ b/Documentation/gpu/todo.rst @@ -299,18 +299,6 @@ connector register/unregister fixes Core refactorings ================= -Clean up the DRM header mess ----------------------------- - -The DRM subsystem originally had only one huge global header, ``drmP.h``. This -is now split up, but many source files still include it. The remaining part of -the cleanup work here is to replace any ``#include `` by only the -headers needed (and fixing up any missing pre-declarations in the headers). - -In the end no .c file should need to include ``drmP.h`` anymore. - -Contact: Daniel Vetter - Make panic handling work ------------------------ diff --git a/include/drm/drmP.h b/include/drm/drmP.h deleted file mode 100644 index 037b1f7a87a5..000000000000 --- a/include/drm/drmP.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Internal Header for the Direct Rendering Manager - * - * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. - * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. - * Copyright (c) 2009-2010, Code Aurora Forum. - * All rights reserved. - * - * Author: Rickard E. (Rik) Faith - * Author: Gareth Hughes - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef _DRM_P_H_ -#define _DRM_P_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct module; - -struct device_node; -struct videomode; -struct dma_resv; -struct dma_buf_attachment; - -struct pci_dev; -struct pci_controller; - -/* - * NOTE: drmP.h is obsolete - do NOT add anything to this file - * - * Do not include drmP.h in new files. - * Work is ongoing to remove drmP.h includes from existing files - */ - -#endif diff --git a/include/drm/drm_os_linux.h b/include/drm/drm_os_linux.h deleted file mode 100644 index ee8d61b64f29..000000000000 --- a/include/drm/drm_os_linux.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/** - * \file drm_os_linux.h - * OS abstraction macros. - */ - -#include /* For task queue support */ -#include -#include -#include - -/** Current process ID */ -#define DRM_CURRENTPID task_pid_nr(current) -#define DRM_UDELAY(d) udelay(d) -/** Read a byte from a MMIO region */ -#define DRM_READ8(map, offset) readb(((void __iomem *)(map)->handle) + (offset)) -/** Read a word from a MMIO region */ -#define DRM_READ16(map, offset) readw(((void __iomem *)(map)->handle) + (offset)) -/** Read a dword from a MMIO region */ -#define DRM_READ32(map, offset) readl(((void __iomem *)(map)->handle) + (offset)) -/** Write a byte into a MMIO region */ -#define DRM_WRITE8(map, offset, val) writeb(val, ((void __iomem *)(map)->handle) + (offset)) -/** Write a word into a MMIO region */ -#define DRM_WRITE16(map, offset, val) writew(val, ((void __iomem *)(map)->handle) + (offset)) -/** Write a dword into a MMIO region */ -#define DRM_WRITE32(map, offset, val) writel(val, ((void __iomem *)(map)->handle) + (offset)) - -/** Read a qword from a MMIO region - be careful using these unless you really understand them */ -#define DRM_READ64(map, offset) readq(((void __iomem *)(map)->handle) + (offset)) -/** Write a qword into a MMIO region */ -#define DRM_WRITE64(map, offset, val) writeq(val, ((void __iomem *)(map)->handle) + (offset)) - -#define DRM_WAIT_ON( ret, queue, timeout, condition ) \ -do { \ - DECLARE_WAITQUEUE(entry, current); \ - unsigned long end = jiffies + (timeout); \ - add_wait_queue(&(queue), &entry); \ - \ - for (;;) { \ - __set_current_state(TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (time_after_eq(jiffies, end)) { \ - ret = -EBUSY; \ - break; \ - } \ - schedule_timeout((HZ/100 > 1) ? HZ/100 : 1); \ - if (signal_pending(current)) { \ - ret = -EINTR; \ - break; \ - } \ - } \ - __set_current_state(TASK_RUNNING); \ - remove_wait_queue(&(queue), &entry); \ -} while (0) -- cgit v1.2.3 From e3f1271474182682638654021123b94e6ec1626b Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 2 Oct 2019 07:40:42 -0500 Subject: leds: core: Fix leds.h structure documentation Update the leds.h structure documentation to define the correct arguments. Signed-off-by: Dan Murphy Signed-off-by: Jacek Anaszewski --- include/linux/leds.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/leds.h b/include/linux/leds.h index b8df71193329..efb309dba914 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -247,7 +247,7 @@ extern void led_set_brightness(struct led_classdev *led_cdev, /** * led_set_brightness_sync - set LED brightness synchronously * @led_cdev: the LED to set - * @brightness: the brightness to set it to + * @value: the brightness to set it to * * Set an LED's brightness immediately. This function will block * the caller for the time required for accessing device registers, @@ -301,8 +301,7 @@ extern void led_sysfs_enable(struct led_classdev *led_cdev); /** * led_compose_name - compose LED class device name * @dev: LED controller device object - * @child: child fwnode_handle describing a LED or a group of synchronized LEDs; - * it must be provided only for fwnode based LEDs + * @init_data: the LED class device initialization data * @led_classdev_name: composed LED class device name * * Create LED class device name basing on the provided init_data argument. -- cgit v1.2.3 From b74555de21acd791f12c4a1aeaf653dd7ac21133 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Oct 2019 14:24:25 -0700 Subject: llc: fix sk_buff leak in llc_conn_service() syzbot reported: BUG: memory leak unreferenced object 0xffff88811eb3de00 (size 224): comm "syz-executor559", pid 7315, jiffies 4294943019 (age 10.300s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 a0 38 24 81 88 ff ff 00 c0 f2 15 81 88 ff ff ..8$............ backtrace: [<000000008d1c66a1>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<000000008d1c66a1>] slab_post_alloc_hook mm/slab.h:439 [inline] [<000000008d1c66a1>] slab_alloc_node mm/slab.c:3269 [inline] [<000000008d1c66a1>] kmem_cache_alloc_node+0x153/0x2a0 mm/slab.c:3579 [<00000000447d9496>] __alloc_skb+0x6e/0x210 net/core/skbuff.c:198 [<000000000cdbf82f>] alloc_skb include/linux/skbuff.h:1058 [inline] [<000000000cdbf82f>] llc_alloc_frame+0x66/0x110 net/llc/llc_sap.c:54 [<000000002418b52e>] llc_conn_ac_send_sabme_cmd_p_set_x+0x2f/0x140 net/llc/llc_c_ac.c:777 [<000000001372ae17>] llc_exec_conn_trans_actions net/llc/llc_conn.c:475 [inline] [<000000001372ae17>] llc_conn_service net/llc/llc_conn.c:400 [inline] [<000000001372ae17>] llc_conn_state_process+0x1ac/0x640 net/llc/llc_conn.c:75 [<00000000f27e53c1>] llc_establish_connection+0x110/0x170 net/llc/llc_if.c:109 [<00000000291b2ca0>] llc_ui_connect+0x10e/0x370 net/llc/af_llc.c:477 [<000000000f9c740b>] __sys_connect+0x11d/0x170 net/socket.c:1840 [...] The bug is that most callers of llc_conn_send_pdu() assume it consumes a reference to the skb, when actually due to commit b85ab56c3f81 ("llc: properly handle dev_queue_xmit() return value") it doesn't. Revert most of that commit, and instead make the few places that need llc_conn_send_pdu() to *not* consume a reference call skb_get() before. Fixes: b85ab56c3f81 ("llc: properly handle dev_queue_xmit() return value") Reported-by: syzbot+6b825a6494a04cc0e3f7@syzkaller.appspotmail.com Signed-off-by: Eric Biggers Signed-off-by: Jakub Kicinski --- include/net/llc_conn.h | 2 +- net/llc/llc_c_ac.c | 8 ++++++-- net/llc/llc_conn.c | 32 +++++++++----------------------- 3 files changed, 16 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h index df528a623548..ea985aa7a6c5 100644 --- a/include/net/llc_conn.h +++ b/include/net/llc_conn.h @@ -104,7 +104,7 @@ void llc_sk_reset(struct sock *sk); /* Access to a connection */ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb); -int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb); +void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb); void llc_conn_rtn_pdu(struct sock *sk, struct sk_buff *skb); void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit); void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit); diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index 4d78375f9872..647c0554d04c 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -372,6 +372,7 @@ int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { + skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } @@ -389,7 +390,8 @@ static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - rc = llc_conn_send_pdu(sk, skb); + skb_get(skb); + llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; @@ -406,6 +408,7 @@ int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { + skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } @@ -916,7 +919,8 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk, llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - rc = llc_conn_send_pdu(sk, skb); + skb_get(skb); + llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 4ff89cb7c86f..ed2aca12460c 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -30,7 +30,7 @@ #endif static int llc_find_offset(int state, int ev_type); -static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *skb); +static void llc_conn_send_pdus(struct sock *sk); static int llc_conn_service(struct sock *sk, struct sk_buff *skb); static int llc_exec_conn_trans_actions(struct sock *sk, struct llc_conn_state_trans *trans, @@ -193,11 +193,11 @@ out_skb_put: return rc; } -int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) +void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) { /* queue PDU to send to MAC layer */ skb_queue_tail(&sk->sk_write_queue, skb); - return llc_conn_send_pdus(sk, skb); + llc_conn_send_pdus(sk); } /** @@ -255,7 +255,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk, NULL); + llc_conn_send_pdus(sk); out:; } @@ -296,7 +296,7 @@ void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk, NULL); + llc_conn_send_pdus(sk); out:; } @@ -340,16 +340,12 @@ out: /** * llc_conn_send_pdus - Sends queued PDUs * @sk: active connection - * @hold_skb: the skb held by caller, or NULL if does not care * - * Sends queued pdus to MAC layer for transmission. When @hold_skb is - * NULL, always return 0. Otherwise, return 0 if @hold_skb is sent - * successfully, or 1 for failure. + * Sends queued pdus to MAC layer for transmission. */ -static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb) +static void llc_conn_send_pdus(struct sock *sk) { struct sk_buff *skb; - int ret = 0; while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); @@ -361,20 +357,10 @@ static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb) skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb); if (!skb2) break; - dev_queue_xmit(skb2); - } else { - bool is_target = skb == hold_skb; - int rc; - - if (is_target) - skb_get(skb); - rc = dev_queue_xmit(skb); - if (is_target) - ret = rc; + skb = skb2; } + dev_queue_xmit(skb); } - - return ret; } /** -- cgit v1.2.3 From 55fd0e206eaf5056885a9055b207143ab7c8fcd1 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 9 Sep 2019 21:21:47 +0000 Subject: drm: Add link training repeaters addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DP 1.3 specification introduces the Link Training-tunable PHY Repeater, and DP 1.4* supplemented it with new features. In the 1.4a spec, it was introduced some innovations to make handy to add support for systems with Thunderbolt or other repeater devices. It is important to highlight that DP specification had some updates from 1.3 through 1.4a. In particular, DP 1.4 defines Repeater_FEC_CAPABILITY at the address 0xf0004, and DP 1.4a redefined the address 0xf0004 to DP_MAX_LANE_COUNT_PHY_REPEATER. Changes since V4: - Update commit message - Fix misleading comments related to the spec version Changes since V3: - Replace spaces by tabs Changes since V2: - Drop the kernel-doc comment - Reorder LTTPR according to register offset Changes since V1: - Adjusts registers names to be aligned with spec and the rest of the file - Update spec comment from 1.4 to 1.4a Cc: Abdoulaye Berthe Cc: Harry Wentland Cc: Leo Li Cc: Jani Nikula Cc: Manasi Navare Cc: Ville Syrjälä Signed-off-by: Rodrigo Siqueira Signed-off-by: Abdoulaye Berthe Reviewed-by: Harry Wentland Signed-off-by: Rodrigo Siqueira Link: https://patchwork.freedesktop.org/patch/msgid/20190909212144.deeomlsqihwg4l3y@outlook.office365.com --- include/drm/drm_dp_helper.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h index ed1a985745ba..bf62b43aaf2b 100644 --- a/include/drm/drm_dp_helper.h +++ b/include/drm/drm_dp_helper.h @@ -1008,6 +1008,32 @@ #define DP_HDCP_2_2_REG_STREAM_TYPE_OFFSET 0x69494 #define DP_HDCP_2_2_REG_DBG_OFFSET 0x69518 +/* Link Training (LT)-tunable PHY Repeaters */ +#define DP_LT_TUNABLE_PHY_REPEATER_FIELD_DATA_STRUCTURE_REV 0xf0000 /* 1.3 */ +#define DP_MAX_LINK_RATE_PHY_REPEATER 0xf0001 /* 1.4a */ +#define DP_PHY_REPEATER_CNT 0xf0002 /* 1.3 */ +#define DP_PHY_REPEATER_MODE 0xf0003 /* 1.3 */ +#define DP_MAX_LANE_COUNT_PHY_REPEATER 0xf0004 /* 1.4a */ +#define DP_Repeater_FEC_CAPABILITY 0xf0004 /* 1.4 */ +#define DP_PHY_REPEATER_EXTENDED_WAIT_TIMEOUT 0xf0005 /* 1.4a */ +#define DP_TRAINING_PATTERN_SET_PHY_REPEATER1 0xf0010 /* 1.3 */ +#define DP_TRAINING_LANE0_SET_PHY_REPEATER1 0xf0011 /* 1.3 */ +#define DP_TRAINING_LANE1_SET_PHY_REPEATER1 0xf0012 /* 1.3 */ +#define DP_TRAINING_LANE2_SET_PHY_REPEATER1 0xf0013 /* 1.3 */ +#define DP_TRAINING_LANE3_SET_PHY_REPEATER1 0xf0014 /* 1.3 */ +#define DP_TRAINING_AUX_RD_INTERVAL_PHY_REPEATER1 0xf0020 /* 1.4a */ +#define DP_TRANSMITTER_CAPABILITY_PHY_REPEATER1 0xf0021 /* 1.4a */ +#define DP_LANE0_1_STATUS_PHY_REPEATER1 0xf0030 /* 1.3 */ +#define DP_LANE2_3_STATUS_PHY_REPEATER1 0xf0031 /* 1.3 */ +#define DP_LANE_ALIGN_STATUS_UPDATED_PHY_REPEATER1 0xf0032 /* 1.3 */ +#define DP_ADJUST_REQUEST_LANE0_1_PHY_REPEATER1 0xf0033 /* 1.3 */ +#define DP_ADJUST_REQUEST_LANE2_3_PHY_REPEATER1 0xf0034 /* 1.3 */ +#define DP_SYMBOL_ERROR_COUNT_LANE0_PHY_REPEATER1 0xf0035 /* 1.3 */ +#define DP_SYMBOL_ERROR_COUNT_LANE1_PHY_REPEATER1 0xf0037 /* 1.3 */ +#define DP_SYMBOL_ERROR_COUNT_LANE2_PHY_REPEATER1 0xf0039 /* 1.3 */ +#define DP_SYMBOL_ERROR_COUNT_LANE3_PHY_REPEATER1 0xf003b /* 1.3 */ +#define DP_FEC_STATUS_PHY_REPEATER1 0xf0290 /* 1.4 */ + /* DP HDCP message start offsets in DPCD address space */ #define DP_HDCP_2_2_AKE_INIT_OFFSET DP_HDCP_2_2_REG_RTX_OFFSET #define DP_HDCP_2_2_AKE_SEND_CERT_OFFSET DP_HDCP_2_2_REG_CERT_RX_OFFSET -- cgit v1.2.3 From af6219590b541418d3192e9bfa03989834ca0e78 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Wed, 9 Oct 2019 18:36:15 +0300 Subject: ASoC: simple_card_utils.h: Fix potential multiple redefinition error asoc_simple_debug_info and asoc_simple_debug_dai must be static otherwise we might a compilation error if the compiler decides not to inline the given function. Fixes: 0580dde59438686d ("ASoC: simple-card-utils: add asoc_simple_debug_info()") Signed-off-by: Daniel Baluta Link: https://lore.kernel.org/r/20191009153615.32105-3-daniel.baluta@nxp.com Signed-off-by: Mark Brown --- include/sound/simple_card_utils.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/sound/simple_card_utils.h b/include/sound/simple_card_utils.h index 985a5f583de4..31f76b6abf71 100644 --- a/include/sound/simple_card_utils.h +++ b/include/sound/simple_card_utils.h @@ -135,9 +135,9 @@ int asoc_simple_init_priv(struct asoc_simple_priv *priv, struct link_info *li); #ifdef DEBUG -inline void asoc_simple_debug_dai(struct asoc_simple_priv *priv, - char *name, - struct asoc_simple_dai *dai) +static inline void asoc_simple_debug_dai(struct asoc_simple_priv *priv, + char *name, + struct asoc_simple_dai *dai) { struct device *dev = simple_priv_to_dev(priv); @@ -167,7 +167,7 @@ inline void asoc_simple_debug_dai(struct asoc_simple_priv *priv, dev_dbg(dev, "%s clk %luHz\n", name, clk_get_rate(dai->clk)); } -inline void asoc_simple_debug_info(struct asoc_simple_priv *priv) +static inline void asoc_simple_debug_info(struct asoc_simple_priv *priv) { struct snd_soc_card *card = simple_priv_to_card(priv); struct device *dev = simple_priv_to_dev(priv); -- cgit v1.2.3 From 8f6ea27b2fefdff58b3a038b5f1b4ed34aed3fd3 Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Wed, 9 Oct 2019 15:10:49 +0000 Subject: drm: two planes with the same zpos have undefined ordering Currently the property docs don't specify whether it's okay for two planes to have the same zpos value and what user-space should expect in this case. The unspoken, legacy rule used in the past was to make user-space figure out the zpos from object IDs. However some drivers break this rule, that's why the ordering is documented as unspecified in case the zpos property is missing. User-space should rely on the zpos property only. There are some cases in which user-space might read identical zpos values for different planes. For instance, in case the property is mutable, user-space might set two planes' zpos to the same value. This is necessary to support user-space using the legacy DRM API where atomic commits are not possible: user-space needs to update the planes' zpos one by one. Because of this, user-space should handle multiple planes with the same zpos. While at it, remove the assumption that zpos is only for overlay planes. Additionally, update the drm_plane_state.zpos docs to clarify that zpos disambiguation via plane object IDs is a recommendation for drivers, not something user-space can rely on. In other words, when user-space sets the same zpos on two planes, drivers should rely on the plane object ID. v2: clarify drm_plane_state.zpos docs (Daniel) v3: zpos is for all planes (Marius, Daniel) v4: completely reword the drm_plane_state.zpos docs to make it clear the recommendation to use plane IDs is for drivers in case user-space uses duplicate zpos values (Pekka) v5: reword commit message (Pekka, James) v6: remove mention of Arm GPUs having planes which can't overlap, because this isn't uAPI yet (Daniel) Signed-off-by: Simon Ser Reviewed-by: Pekka Paalanen Cc: Marius Vlad Cc: Daniel Vetter Cc: James Qian Wang Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/T5nHrvXH0GKOp6ONaFHk-j2cwEb4_4C_sBz9rNw8mmPACuut-DQqC74HMAFKZH3_Q15E8a3YnmKCxap-djKA71VVZv_T-tFxaB0he13O7yA=@emersion.fr --- drivers/gpu/drm/drm_blend.c | 8 ++++---- include/drm/drm_plane.h | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_blend.c b/drivers/gpu/drm/drm_blend.c index d02709dd2d4a..121481f6aa71 100644 --- a/drivers/gpu/drm/drm_blend.c +++ b/drivers/gpu/drm/drm_blend.c @@ -132,10 +132,10 @@ * planes. Without this property the primary plane is always below the cursor * plane, and ordering between all other planes is undefined. The positive * Z axis points towards the user, i.e. planes with lower Z position values - * are underneath planes with higher Z position values. Note that the Z - * position value can also be immutable, to inform userspace about the - * hard-coded stacking of overlay planes, see - * drm_plane_create_zpos_immutable_property(). + * are underneath planes with higher Z position values. Two planes with the + * same Z position value have undefined ordering. Note that the Z position + * value can also be immutable, to inform userspace about the hard-coded + * stacking of planes, see drm_plane_create_zpos_immutable_property(). * * pixel blend mode: * Pixel blend mode is set up with drm_plane_create_blend_mode_property(). diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h index cd5903ad33f7..328773690851 100644 --- a/include/drm/drm_plane.h +++ b/include/drm/drm_plane.h @@ -140,10 +140,11 @@ struct drm_plane_state { * @zpos: * Priority of the given plane on crtc (optional). * - * Note that multiple active planes on the same crtc can have an - * identical zpos value. The rule to solving the conflict is to compare - * the plane object IDs; the plane with a higher ID must be stacked on - * top of a plane with a lower ID. + * User-space may set mutable zpos properties so that multiple active + * planes on the same CRTC have identical zpos values. This is a + * user-space bug, but drivers can solve the conflict by comparing the + * plane object IDs; the plane with a higher ID is stacked on top of a + * plane with a lower ID. * * See drm_plane_create_zpos_property() and * drm_plane_create_zpos_immutable_property() for more details. -- cgit v1.2.3 From 819be8108fded0b9e710bbbf81193e52f7bab2f7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Oct 2019 19:09:23 +0800 Subject: sctp: add chunks to sk_backlog when the newsk sk_socket is not set This patch is to fix a NULL-ptr deref in selinux_socket_connect_helper: [...] kasan: GPF could be caused by NULL-ptr deref or user memory access [...] RIP: 0010:selinux_socket_connect_helper+0x94/0x460 [...] Call Trace: [...] selinux_sctp_bind_connect+0x16a/0x1d0 [...] security_sctp_bind_connect+0x58/0x90 [...] sctp_process_asconf+0xa52/0xfd0 [sctp] [...] sctp_sf_do_asconf+0x785/0x980 [sctp] [...] sctp_do_sm+0x175/0x5a0 [sctp] [...] sctp_assoc_bh_rcv+0x285/0x5b0 [sctp] [...] sctp_backlog_rcv+0x482/0x910 [sctp] [...] __release_sock+0x11e/0x310 [...] release_sock+0x4f/0x180 [...] sctp_accept+0x3f9/0x5a0 [sctp] [...] inet_accept+0xe7/0x720 It was caused by that the 'newsk' sk_socket was not set before going to security sctp hook when processing asconf chunk with SCTP_PARAM_ADD_IP or SCTP_PARAM_SET_PRIMARY: inet_accept()-> sctp_accept(): lock_sock(): lock listening 'sk' do_softirq(): sctp_rcv(): <-- [1] asconf chunk arrives and enqueued in 'sk' backlog sctp_sock_migrate(): set asoc's sk to 'newsk' release_sock(): sctp_backlog_rcv(): lock 'newsk' sctp_process_asconf() <-- [2] unlock 'newsk' sock_graft(): set sk_socket <-- [3] As it shows, at [1] the asconf chunk would be put into the listening 'sk' backlog, as accept() was holding its sock lock. Then at [2] asconf would get processed with 'newsk' as asoc's sk had been set to 'newsk'. However, 'newsk' sk_socket is not set until [3], while selinux_sctp_bind_connect() would deref it, then kernel crashed. Here to fix it by adding the chunk to sk_backlog until newsk sk_socket is set when .accept() is done. Note that sk->sk_socket can be NULL when the sock is closed, so SOCK_DEAD flag is also needed to check in sctp_newsk_ready(). Thanks to Ondrej for reviewing the code. Fixes: d452930fd3b9 ("selinux: Add SCTP support") Reported-by: Ying Xu Suggested-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: Jakub Kicinski --- include/net/sctp/sctp.h | 5 +++++ net/sctp/input.c | 12 +++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 5d60f13d2347..3ab5c6bbb90b 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -610,4 +610,9 @@ static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize) return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize); } +static inline bool sctp_newsk_ready(const struct sock *sk) +{ + return sock_flag(sk, SOCK_DEAD) || sk->sk_socket; +} + #endif /* __net_sctp_h__ */ diff --git a/net/sctp/input.c b/net/sctp/input.c index 5a070fb5b278..f2771375bfc0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -243,7 +243,7 @@ int sctp_rcv(struct sk_buff *skb) bh_lock_sock(sk); } - if (sock_owned_by_user(sk)) { + if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sctp_add_backlog(sk, skb)) { bh_unlock_sock(sk); sctp_chunk_free(chunk); @@ -321,7 +321,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) local_bh_disable(); bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { + if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) sctp_chunk_free(chunk); else @@ -336,7 +336,13 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) if (backloged) return 0; } else { - sctp_inq_push(inqueue, chunk); + if (!sctp_newsk_ready(sk)) { + if (!sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + return 0; + sctp_chunk_free(chunk); + } else { + sctp_inq_push(inqueue, chunk); + } } done: -- cgit v1.2.3 From 60b173ca3d1cd1782bd0096dc17298ec242f6fb1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 14:51:20 -0700 Subject: net: add {READ|WRITE}_ONCE() annotations on ->rskq_accept_head reqsk_queue_empty() is called from inet_csk_listen_poll() while other cpus might write ->rskq_accept_head value. Use {READ|WRITE}_ONCE() to avoid compiler tricks and potential KCSAN splats. Fixes: fff1f3001cc5 ("tcp: add a spinlock to protect struct request_sock_queue") Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- drivers/xen/pvcalls-back.c | 2 +- include/net/request_sock.h | 4 ++-- net/ipv4/inet_connection_sock.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c index 69a626b0e594..c57c71b7d53d 100644 --- a/drivers/xen/pvcalls-back.c +++ b/drivers/xen/pvcalls-back.c @@ -775,7 +775,7 @@ static int pvcalls_back_poll(struct xenbus_device *dev, mappass->reqcopy = *req; icsk = inet_csk(mappass->sock->sk); queue = &icsk->icsk_accept_queue; - data = queue->rskq_accept_head != NULL; + data = READ_ONCE(queue->rskq_accept_head) != NULL; if (data) { mappass->reqcopy.cmd = 0; ret = 0; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index fd178d58fa84..cf8b33213bbc 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -185,7 +185,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, static inline bool reqsk_queue_empty(const struct request_sock_queue *queue) { - return queue->rskq_accept_head == NULL; + return READ_ONCE(queue->rskq_accept_head) == NULL; } static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue, @@ -197,7 +197,7 @@ static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue req = queue->rskq_accept_head; if (req) { sk_acceptq_removed(parent); - queue->rskq_accept_head = req->dl_next; + WRITE_ONCE(queue->rskq_accept_head, req->dl_next); if (queue->rskq_accept_head == NULL) queue->rskq_accept_tail = NULL; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a9183543ca30..dbcf34ec8dd2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -934,7 +934,7 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, req->sk = child; req->dl_next = NULL; if (queue->rskq_accept_head == NULL) - queue->rskq_accept_head = req; + WRITE_ONCE(queue->rskq_accept_head, req); else queue->rskq_accept_tail->dl_next = req; queue->rskq_accept_tail = req; -- cgit v1.2.3 From 1f142c17d19a5618d5a633195a46f2c8be9bf232 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:10:15 -0700 Subject: tcp: annotate lockless access to tcp_memory_pressure tcp_memory_pressure is read without holding any lock, and its value could be changed on other cpus. Use READ_ONCE() to annotate these lockless reads. The write side is already using atomic ops. Fixes: b8da51ebb1aa ("tcp: introduce tcp_under_memory_pressure()") Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index c9a3f9688223..88e63d64c698 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -258,7 +258,7 @@ static inline bool tcp_under_memory_pressure(const struct sock *sk) mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; - return tcp_memory_pressure; + return READ_ONCE(tcp_memory_pressure); } /* * The next routines deal with comparing 32 bit unsigned ints diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f98a1882e537..888c92b63f5a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -326,7 +326,7 @@ void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; - if (tcp_memory_pressure) + if (READ_ONCE(tcp_memory_pressure)) return; val = jiffies; @@ -341,7 +341,7 @@ void tcp_leave_memory_pressure(struct sock *sk) { unsigned long val; - if (!tcp_memory_pressure) + if (!READ_ONCE(tcp_memory_pressure)) return; val = xchg(&tcp_memory_pressure, 0); if (val) -- cgit v1.2.3 From eac66402d1c342f07ff38f8d631ff95eb7ad3220 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:32:35 -0700 Subject: net: annotate sk->sk_rcvlowat lockless reads sock_rcvlowat() or int_sk_rcvlowat() might be called without the socket lock for example from tcp_poll(). Use READ_ONCE() to document the fact that other cpus might change sk->sk_rcvlowat under us and avoid KCSAN splats. Use WRITE_ONCE() on write sides too. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 +++- net/core/filter.c | 2 +- net/core/sock.c | 2 +- net/ipv4/tcp.c | 2 +- net/sched/em_meta.c | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 2c53f1a1d905..79f54e1f8827 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2271,7 +2271,9 @@ static inline long sock_sndtimeo(const struct sock *sk, bool noblock) static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { - return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1; + int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len); + + return v ?: 1; } /* Alas, with timeout socket operations are not restartable. diff --git a/net/core/filter.c b/net/core/filter.c index ed6563622ce3..a50c0b6846f2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4274,7 +4274,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_MARK: if (sk->sk_mark != val) { diff --git a/net/core/sock.c b/net/core/sock.c index 1cf06934da50..b7c5c6ea51ba 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -974,7 +974,7 @@ set_rcvbuf: if (sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_RCVTIMEO_OLD: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 888c92b63f5a..8781a92ea4b6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1699,7 +1699,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) else cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; val = min(val, cap); - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 82bd14e7ac93..4c9122fc35c9 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -554,7 +554,7 @@ META_COLLECTOR(int_sk_rcvlowat) *err = -1; return; } - dst->value = sk->sk_rcvlowat; + dst->value = READ_ONCE(sk->sk_rcvlowat); } META_COLLECTOR(int_sk_rcvtimeo) -- cgit v1.2.3 From 70c2655849a25431f31b505a07fe0c861e5e41fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:41:03 -0700 Subject: net: silence KCSAN warnings about sk->sk_backlog.len reads sk->sk_backlog.len can be written by BH handlers, and read from process contexts in a lockless way. Note the write side should also use WRITE_ONCE() or a variant. We need some agreement about the best way to do this. syzbot reported : BUG: KCSAN: data-race in tcp_add_backlog / tcp_grow_window.isra.0 write to 0xffff88812665f32c of 4 bytes by interrupt on cpu 1: sk_add_backlog include/net/sock.h:934 [inline] tcp_add_backlog+0x4a0/0xcc0 net/ipv4/tcp_ipv4.c:1737 tcp_v4_rcv+0x1aba/0x1bf0 net/ipv4/tcp_ipv4.c:1925 ip_protocol_deliver_rcu+0x51/0x470 net/ipv4/ip_input.c:204 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:442 [inline] ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5004 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5118 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5208 napi_skb_finish net/core/dev.c:5671 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5704 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 virtnet_receive drivers/net/virtio_net.c:1323 [inline] virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428 napi_poll net/core/dev.c:6352 [inline] net_rx_action+0x3ae/0xa50 net/core/dev.c:6418 read to 0xffff88812665f32c of 4 bytes by task 7292 on cpu 0: tcp_space include/net/tcp.h:1373 [inline] tcp_grow_window.isra.0+0x6b/0x480 net/ipv4/tcp_input.c:413 tcp_event_data_recv+0x68f/0x990 net/ipv4/tcp_input.c:717 tcp_rcv_established+0xbfe/0xf50 net/ipv4/tcp_input.c:5618 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1542 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x135/0x1e0 net/core/sock.c:2427 release_sock+0x61/0x160 net/core/sock.c:2943 tcp_recvmsg+0x63b/0x1a30 net/ipv4/tcp.c:2181 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 sock_read_iter+0x15f/0x1e0 net/socket.c:967 call_read_iter include/linux/fs.h:1864 [inline] new_sync_read+0x389/0x4f0 fs/read_write.c:414 __vfs_read+0xb1/0xc0 fs/read_write.c:427 vfs_read fs/read_write.c:461 [inline] vfs_read+0x143/0x2c0 fs/read_write.c:446 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 7292 Comm: syz-fuzzer Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 3 ++- net/core/sock.c | 2 +- net/sctp/diag.c | 2 +- net/tipc/socket.c | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 88e63d64c698..35f6f7e0fdc2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1380,7 +1380,8 @@ static inline int tcp_win_from_space(const struct sock *sk, int space) /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { - return tcp_win_from_space(sk, sk->sk_rcvbuf - sk->sk_backlog.len - + return tcp_win_from_space(sk, sk->sk_rcvbuf - + READ_ONCE(sk->sk_backlog.len) - atomic_read(&sk->sk_rmem_alloc)); } diff --git a/net/core/sock.c b/net/core/sock.c index b7c5c6ea51ba..2a053999df11 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3210,7 +3210,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } diff --git a/net/sctp/diag.c b/net/sctp/diag.c index fc9a4c6629ce..0851166b9175 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -175,7 +175,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7c736cfec57f..f8bbc4aab213 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3790,7 +3790,7 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) i += scnprintf(buf + i, sz - i, " %d", sk->sk_sndbuf); i += scnprintf(buf + i, sz - i, " | %d", sk_rmem_alloc_get(sk)); i += scnprintf(buf + i, sz - i, " %d", sk->sk_rcvbuf); - i += scnprintf(buf + i, sz - i, " | %d\n", sk->sk_backlog.len); + i += scnprintf(buf + i, sz - i, " | %d\n", READ_ONCE(sk->sk_backlog.len)); if (dqueues & TIPC_DUMP_SK_SNDQ) { i += scnprintf(buf + i, sz - i, "sk_write_queue: "); -- cgit v1.2.3 From 7cea855922cb6a84d56ee8f12783f5b5e14c6129 Mon Sep 17 00:00:00 2001 From: Jonas Karlman Date: Mon, 7 Oct 2019 19:21:48 +0000 Subject: drm/bridge: dw-hdmi: Add Dynamic Range and Mastering InfoFrame support Add support for configuring Dynamic Range and Mastering InfoFrame from the hdr_output_metadata connector property. This patch adds a use_drm_infoframe flag to dw_hdmi_plat_data that platform drivers use to signal when Dynamic Range and Mastering infoframes is supported. This flag is needed because Amlogic GXBB and GXL report same DW-HDMI version, and only GXL support DRM InfoFrame. These changes were based on work done by Zheng Yang to support DRM InfoFrame on the Rockchip 4.4 BSP kernel at [1] and [2] [1] https://github.com/rockchip-linux/kernel/tree/develop-4.4 [2] https://github.com/rockchip-linux/kernel/commit/d1943fde81ff41d7cca87f4a42f03992e90bddd5 Cc: Zheng Yang Signed-off-by: Jonas Karlman Reviewed-by: Neil Armstrong Reviewed-by: Andrzej Hajda Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/HE1PR06MB4011D7B916CBF8B740ACC45FAC9B0@HE1PR06MB4011.eurprd06.prod.outlook.com --- drivers/gpu/drm/bridge/synopsys/dw-hdmi.c | 81 +++++++++++++++++++++++++++++++ drivers/gpu/drm/bridge/synopsys/dw-hdmi.h | 37 ++++++++++++++ include/drm/bridge/dw_hdmi.h | 1 + 3 files changed, 119 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c index a15fbf71b9d7..fdc29869d75a 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c +++ b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -1743,6 +1744,41 @@ static void hdmi_config_vendor_specific_infoframe(struct dw_hdmi *hdmi, HDMI_FC_DATAUTO0_VSD_MASK); } +static void hdmi_config_drm_infoframe(struct dw_hdmi *hdmi) +{ + const struct drm_connector_state *conn_state = hdmi->connector.state; + struct hdmi_drm_infoframe frame; + u8 buffer[30]; + ssize_t err; + int i; + + if (!hdmi->plat_data->use_drm_infoframe) + return; + + hdmi_modb(hdmi, HDMI_FC_PACKET_TX_EN_DRM_DISABLE, + HDMI_FC_PACKET_TX_EN_DRM_MASK, HDMI_FC_PACKET_TX_EN); + + err = drm_hdmi_infoframe_set_hdr_metadata(&frame, conn_state); + if (err < 0) + return; + + err = hdmi_drm_infoframe_pack(&frame, buffer, sizeof(buffer)); + if (err < 0) { + dev_err(hdmi->dev, "Failed to pack drm infoframe: %zd\n", err); + return; + } + + hdmi_writeb(hdmi, frame.version, HDMI_FC_DRM_HB0); + hdmi_writeb(hdmi, frame.length, HDMI_FC_DRM_HB1); + + for (i = 0; i < frame.length; i++) + hdmi_writeb(hdmi, buffer[4 + i], HDMI_FC_DRM_PB0 + i); + + hdmi_writeb(hdmi, 1, HDMI_FC_DRM_UP); + hdmi_modb(hdmi, HDMI_FC_PACKET_TX_EN_DRM_ENABLE, + HDMI_FC_PACKET_TX_EN_DRM_MASK, HDMI_FC_PACKET_TX_EN); +} + static void hdmi_av_composer(struct dw_hdmi *hdmi, const struct drm_display_mode *mode) { @@ -2064,6 +2100,7 @@ static int dw_hdmi_setup(struct dw_hdmi *hdmi, struct drm_display_mode *mode) /* HDMI Initialization Step F - Configure AVI InfoFrame */ hdmi_config_AVI(hdmi, mode); hdmi_config_vendor_specific_infoframe(hdmi, mode); + hdmi_config_drm_infoframe(hdmi); } else { dev_dbg(hdmi->dev, "%s DVI mode\n", __func__); } @@ -2230,6 +2267,45 @@ static int dw_hdmi_connector_get_modes(struct drm_connector *connector) return ret; } +static bool hdr_metadata_equal(const struct drm_connector_state *old_state, + const struct drm_connector_state *new_state) +{ + struct drm_property_blob *old_blob = old_state->hdr_output_metadata; + struct drm_property_blob *new_blob = new_state->hdr_output_metadata; + + if (!old_blob || !new_blob) + return old_blob == new_blob; + + if (old_blob->length != new_blob->length) + return false; + + return !memcmp(old_blob->data, new_blob->data, old_blob->length); +} + +static int dw_hdmi_connector_atomic_check(struct drm_connector *connector, + struct drm_atomic_state *state) +{ + struct drm_connector_state *old_state = + drm_atomic_get_old_connector_state(state, connector); + struct drm_connector_state *new_state = + drm_atomic_get_new_connector_state(state, connector); + struct drm_crtc *crtc = new_state->crtc; + struct drm_crtc_state *crtc_state; + + if (!crtc) + return 0; + + if (!hdr_metadata_equal(old_state, new_state)) { + crtc_state = drm_atomic_get_crtc_state(state, crtc); + if (IS_ERR(crtc_state)) + return PTR_ERR(crtc_state); + + crtc_state->mode_changed = true; + } + + return 0; +} + static void dw_hdmi_connector_force(struct drm_connector *connector) { struct dw_hdmi *hdmi = container_of(connector, struct dw_hdmi, @@ -2254,6 +2330,7 @@ static const struct drm_connector_funcs dw_hdmi_connector_funcs = { static const struct drm_connector_helper_funcs dw_hdmi_connector_helper_funcs = { .get_modes = dw_hdmi_connector_get_modes, + .atomic_check = dw_hdmi_connector_atomic_check, }; static int dw_hdmi_bridge_attach(struct drm_bridge *bridge) @@ -2274,6 +2351,10 @@ static int dw_hdmi_bridge_attach(struct drm_bridge *bridge) DRM_MODE_CONNECTOR_HDMIA, hdmi->ddc); + if (hdmi->version >= 0x200a && hdmi->plat_data->use_drm_infoframe) + drm_object_attach_property(&connector->base, + connector->dev->mode_config.hdr_output_metadata_property, 0); + drm_connector_attach_encoder(connector, encoder); cec_fill_conn_info_from_drm(&conn_info, connector); diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.h b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.h index fcff5059db24..1999db05bc3b 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.h +++ b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.h @@ -254,6 +254,7 @@ #define HDMI_FC_POL2 0x10DB #define HDMI_FC_PRCONF 0x10E0 #define HDMI_FC_SCRAMBLER_CTRL 0x10E1 +#define HDMI_FC_PACKET_TX_EN 0x10E3 #define HDMI_FC_GMD_STAT 0x1100 #define HDMI_FC_GMD_EN 0x1101 @@ -289,6 +290,37 @@ #define HDMI_FC_GMD_PB26 0x111F #define HDMI_FC_GMD_PB27 0x1120 +#define HDMI_FC_DRM_UP 0x1167 +#define HDMI_FC_DRM_HB0 0x1168 +#define HDMI_FC_DRM_HB1 0x1169 +#define HDMI_FC_DRM_PB0 0x116A +#define HDMI_FC_DRM_PB1 0x116B +#define HDMI_FC_DRM_PB2 0x116C +#define HDMI_FC_DRM_PB3 0x116D +#define HDMI_FC_DRM_PB4 0x116E +#define HDMI_FC_DRM_PB5 0x116F +#define HDMI_FC_DRM_PB6 0x1170 +#define HDMI_FC_DRM_PB7 0x1171 +#define HDMI_FC_DRM_PB8 0x1172 +#define HDMI_FC_DRM_PB9 0x1173 +#define HDMI_FC_DRM_PB10 0x1174 +#define HDMI_FC_DRM_PB11 0x1175 +#define HDMI_FC_DRM_PB12 0x1176 +#define HDMI_FC_DRM_PB13 0x1177 +#define HDMI_FC_DRM_PB14 0x1178 +#define HDMI_FC_DRM_PB15 0x1179 +#define HDMI_FC_DRM_PB16 0x117A +#define HDMI_FC_DRM_PB17 0x117B +#define HDMI_FC_DRM_PB18 0x117C +#define HDMI_FC_DRM_PB19 0x117D +#define HDMI_FC_DRM_PB20 0x117E +#define HDMI_FC_DRM_PB21 0x117F +#define HDMI_FC_DRM_PB22 0x1180 +#define HDMI_FC_DRM_PB23 0x1181 +#define HDMI_FC_DRM_PB24 0x1182 +#define HDMI_FC_DRM_PB25 0x1183 +#define HDMI_FC_DRM_PB26 0x1184 + #define HDMI_FC_DBGFORCE 0x1200 #define HDMI_FC_DBGAUD0CH0 0x1201 #define HDMI_FC_DBGAUD1CH0 0x1202 @@ -744,6 +776,11 @@ enum { HDMI_FC_PRCONF_OUTPUT_PR_FACTOR_MASK = 0x0F, HDMI_FC_PRCONF_OUTPUT_PR_FACTOR_OFFSET = 0, +/* FC_PACKET_TX_EN field values */ + HDMI_FC_PACKET_TX_EN_DRM_MASK = 0x80, + HDMI_FC_PACKET_TX_EN_DRM_ENABLE = 0x80, + HDMI_FC_PACKET_TX_EN_DRM_DISABLE = 0x00, + /* FC_AVICONF0-FC_AVICONF3 field values */ HDMI_FC_AVICONF0_PIX_FMT_MASK = 0x03, HDMI_FC_AVICONF0_PIX_FMT_RGB = 0x00, diff --git a/include/drm/bridge/dw_hdmi.h b/include/drm/bridge/dw_hdmi.h index 4b3e863c4f8a..fbf3812c4326 100644 --- a/include/drm/bridge/dw_hdmi.h +++ b/include/drm/bridge/dw_hdmi.h @@ -126,6 +126,7 @@ struct dw_hdmi_plat_data { const struct drm_display_mode *mode); unsigned long input_bus_format; unsigned long input_bus_encoding; + bool use_drm_infoframe; /* Vendor PHY support */ const struct dw_hdmi_phy_ops *phy_ops; -- cgit v1.2.3 From fec748740c9ca362dff7a5831105b0edc333bf91 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 10 Oct 2019 13:19:13 +0200 Subject: drm/plane: Clarify our expectations for src/dst rectangles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rectangles are usually clipped, but it can be useful to have them unclipped, for example for cursor planes. Signed-off-by: Maarten Lankhorst [mlankhorst: Change cursor plane to hardware performing clipping. (Ville) Fix dst description that went missing.] Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20191010112918.15724-1-maarten.lankhorst@linux.intel.com --- include/drm/drm_plane.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h index 328773690851..3f396d94afe4 100644 --- a/include/drm/drm_plane.h +++ b/include/drm/drm_plane.h @@ -184,8 +184,26 @@ struct drm_plane_state { */ struct drm_property_blob *fb_damage_clips; - /** @src: clipped source coordinates of the plane (in 16.16) */ - /** @dst: clipped destination coordinates of the plane */ + /** + * @src: + * + * source coordinates of the plane (in 16.16). + * + * When using drm_atomic_helper_check_plane_state(), + * the coordinates are clipped, but the driver may choose + * to use unclipped coordinates instead when the hardware + * performs the clipping automatically. + */ + /** + * @dst: + * + * clipped destination coordinates of the plane. + * + * When using drm_atomic_helper_check_plane_state(), + * the coordinates are clipped, but the driver may choose + * to use unclipped coordinates instead when the hardware + * performs the clipping automatically. + */ struct drm_rect src, dst; /** -- cgit v1.2.3 From af84537dbd1b39505d1f3d8023029b4a59666513 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 2 Oct 2019 10:40:55 -0400 Subject: SUNRPC: fix race to sk_err after xs_error_report Since commit 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") there has been a race to the value of the sk_err if both XPRT_SOCK_WAKE_ERROR and XPRT_SOCK_WAKE_DISCONNECT are set. In that case, we may end up losing the sk_err value that existed when xs_error_report was called. Fix this by reverting to the previous behavior: instead of using SO_ERROR to retrieve the value at a later time (which might also return sk_err_soft), copy the sk_err value onto struct sock_xprt, and use that value to wake pending tasks. Signed-off-by: Benjamin Coddington Fixes: 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtsock.h | 1 + net/sunrpc/xprtsock.c | 17 ++++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 7638dbe7bc50..a940de03808d 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -61,6 +61,7 @@ struct sock_xprt { struct mutex recv_mutex; struct sockaddr_storage srcaddr; unsigned short srcport; + int xprt_err; /* * UDP socket buffer size parameters diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9ac88722fa83..70e52f567b2a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1249,19 +1249,21 @@ static void xs_error_report(struct sock *sk) { struct sock_xprt *transport; struct rpc_xprt *xprt; - int err; read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; transport = container_of(xprt, struct sock_xprt, xprt); - err = -sk->sk_err; - if (err == 0) + transport->xprt_err = -sk->sk_err; + if (transport->xprt_err == 0) goto out; dprintk("RPC: xs_error_report client %p, error=%d...\n", - xprt, -err); - trace_rpc_socket_error(xprt, sk->sk_socket, err); + xprt, -transport->xprt_err); + trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err); + + /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */ + smp_mb__before_atomic(); xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); out: read_unlock_bh(&sk->sk_callback_lock); @@ -2476,7 +2478,6 @@ static void xs_wake_write(struct sock_xprt *transport) static void xs_wake_error(struct sock_xprt *transport) { int sockerr; - int sockerr_len = sizeof(sockerr); if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) return; @@ -2485,9 +2486,7 @@ static void xs_wake_error(struct sock_xprt *transport) goto out; if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) goto out; - if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR, - (char *)&sockerr, &sockerr_len) != 0) - goto out; + sockerr = xchg(&transport->xprt_err, 0); if (sockerr < 0) xprt_wake_pending_tasks(&transport->xprt, sockerr); out: -- cgit v1.2.3 From 9b7117e245bcf37b6e9f87f61461168e09f25316 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Thu, 10 Oct 2019 13:59:56 +0200 Subject: drm/omap: cleanup OMAP_BO flags Reorder OMAP_BO flags and improve the comments. Signed-off-by: Tomi Valkeinen Reviewed-by: Jean-Jacques Hiblot Link: https://patchwork.freedesktop.org/patch/msgid/20191010120000.1421-5-jjhiblot@ti.com --- include/uapi/drm/omap_drm.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/drm/omap_drm.h b/include/uapi/drm/omap_drm.h index 1fccffef9e27..d8ee2f840697 100644 --- a/include/uapi/drm/omap_drm.h +++ b/include/uapi/drm/omap_drm.h @@ -38,19 +38,20 @@ struct drm_omap_param { __u64 value; /* in (set_param), out (get_param) */ }; -#define OMAP_BO_SCANOUT 0x00000001 /* scanout capable (phys contiguous) */ -#define OMAP_BO_CACHE_MASK 0x00000006 /* cache type mask, see cache modes */ -#define OMAP_BO_TILED_MASK 0x00000f00 /* tiled mapping mask, see tiled modes */ +/* Scanout buffer, consumable by DSS */ +#define OMAP_BO_SCANOUT 0x00000001 -/* cache modes */ -#define OMAP_BO_CACHED 0x00000000 /* default */ -#define OMAP_BO_WC 0x00000002 /* write-combine */ -#define OMAP_BO_UNCACHED 0x00000004 /* strongly-ordered (uncached) */ +/* Buffer CPU caching mode: cached, write-combining or uncached. */ +#define OMAP_BO_CACHED 0x00000000 +#define OMAP_BO_WC 0x00000002 +#define OMAP_BO_UNCACHED 0x00000004 +#define OMAP_BO_CACHE_MASK 0x00000006 -/* tiled modes */ +/* Use TILER for the buffer. The TILER container unit can be 8, 16 or 32 bits. */ #define OMAP_BO_TILED_8 0x00000100 #define OMAP_BO_TILED_16 0x00000200 #define OMAP_BO_TILED_32 0x00000300 +#define OMAP_BO_TILED_MASK 0x00000f00 #define OMAP_BO_TILED (OMAP_BO_TILED_8 | OMAP_BO_TILED_16 | OMAP_BO_TILED_32) union omap_gem_size { -- cgit v1.2.3 From 48b34ac041756c2fd3a898d6e96be97416858b45 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Thu, 10 Oct 2019 13:59:57 +0200 Subject: drm/omap: remove OMAP_BO_TILED define OMAP_BO_TILED does not make sense, as OMAP_BO_TILED_* values are not bitmasks but normal values. As we already have OMAP_BO_TILED_MASK for the mask, we can remove OMAP_BO_TILED and use OMAP_BO_TILED_MASK instead. Signed-off-by: Tomi Valkeinen Reviewed-by: Jean-Jacques Hiblot Link: https://patchwork.freedesktop.org/patch/msgid/20191010120000.1421-6-jjhiblot@ti.com --- drivers/gpu/drm/omapdrm/omap_dmm_tiler.h | 2 +- drivers/gpu/drm/omapdrm/omap_fb.c | 6 +++--- drivers/gpu/drm/omapdrm/omap_gem.c | 18 +++++++++--------- drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 2 +- include/uapi/drm/omap_drm.h | 1 - 5 files changed, 14 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.h b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.h index 835e6654fa82..43c1d096b021 100644 --- a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.h +++ b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.h @@ -113,7 +113,7 @@ extern struct platform_driver omap_dmm_driver; /* GEM bo flags -> tiler fmt */ static inline enum tiler_fmt gem2fmt(u32 flags) { - switch (flags & OMAP_BO_TILED) { + switch (flags & OMAP_BO_TILED_MASK) { case OMAP_BO_TILED_8: return TILFMT_8BIT; case OMAP_BO_TILED_16: diff --git a/drivers/gpu/drm/omapdrm/omap_fb.c b/drivers/gpu/drm/omapdrm/omap_fb.c index 1b8b5108caf8..7403316088b8 100644 --- a/drivers/gpu/drm/omapdrm/omap_fb.c +++ b/drivers/gpu/drm/omapdrm/omap_fb.c @@ -95,7 +95,7 @@ static u32 get_linear_addr(struct drm_framebuffer *fb, bool omap_framebuffer_supports_rotation(struct drm_framebuffer *fb) { - return omap_gem_flags(fb->obj[0]) & OMAP_BO_TILED; + return omap_gem_flags(fb->obj[0]) & OMAP_BO_TILED_MASK; } /* Note: DRM rotates counter-clockwise, TILER & DSS rotates clockwise */ @@ -154,7 +154,7 @@ void omap_framebuffer_update_scanout(struct drm_framebuffer *fb, x = state->src_x >> 16; y = state->src_y >> 16; - if (omap_gem_flags(fb->obj[0]) & OMAP_BO_TILED) { + if (omap_gem_flags(fb->obj[0]) & OMAP_BO_TILED_MASK) { u32 w = state->src_w >> 16; u32 h = state->src_h >> 16; @@ -212,7 +212,7 @@ void omap_framebuffer_update_scanout(struct drm_framebuffer *fb, plane = &omap_fb->planes[1]; if (info->rotation_type == OMAP_DSS_ROT_TILER) { - WARN_ON(!(omap_gem_flags(fb->obj[1]) & OMAP_BO_TILED)); + WARN_ON(!(omap_gem_flags(fb->obj[1]) & OMAP_BO_TILED_MASK)); omap_gem_rotated_dma_addr(fb->obj[1], orient, x/2, y/2, &info->p_uv_addr); } else { diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index a6562d23d314..4e8fcfdff3a0 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -196,7 +196,7 @@ static void omap_gem_evict(struct drm_gem_object *obj) struct omap_gem_object *omap_obj = to_omap_bo(obj); struct omap_drm_private *priv = obj->dev->dev_private; - if (omap_obj->flags & OMAP_BO_TILED) { + if (omap_obj->flags & OMAP_BO_TILED_MASK) { enum tiler_fmt fmt = gem2fmt(omap_obj->flags); int i; @@ -324,7 +324,7 @@ size_t omap_gem_mmap_size(struct drm_gem_object *obj) struct omap_gem_object *omap_obj = to_omap_bo(obj); size_t size = obj->size; - if (omap_obj->flags & OMAP_BO_TILED) { + if (omap_obj->flags & OMAP_BO_TILED_MASK) { /* for tiled buffers, the virtual size has stride rounded up * to 4kb.. (to hide the fact that row n+1 might start 16kb or * 32kb later!). But we don't back the entire buffer with @@ -513,7 +513,7 @@ vm_fault_t omap_gem_fault(struct vm_fault *vmf) * probably trigger put_pages()? */ - if (omap_obj->flags & OMAP_BO_TILED) + if (omap_obj->flags & OMAP_BO_TILED_MASK) ret = omap_gem_fault_2d(obj, vma, vmf); else ret = omap_gem_fault_1d(obj, vma, vmf); @@ -786,7 +786,7 @@ int omap_gem_pin(struct drm_gem_object *obj, dma_addr_t *dma_addr) if (ret) goto fail; - if (omap_obj->flags & OMAP_BO_TILED) { + if (omap_obj->flags & OMAP_BO_TILED_MASK) { block = tiler_reserve_2d(fmt, omap_obj->width, omap_obj->height, 0); @@ -892,7 +892,7 @@ int omap_gem_rotated_dma_addr(struct drm_gem_object *obj, u32 orient, mutex_lock(&omap_obj->lock); if ((refcount_read(&omap_obj->dma_addr_cnt) > 0) && omap_obj->block && - (omap_obj->flags & OMAP_BO_TILED)) { + (omap_obj->flags & OMAP_BO_TILED_MASK)) { *dma_addr = tiler_tsptr(omap_obj->block, orient, x, y); ret = 0; } @@ -907,7 +907,7 @@ int omap_gem_tiled_stride(struct drm_gem_object *obj, u32 orient) { struct omap_gem_object *omap_obj = to_omap_bo(obj); int ret = -EINVAL; - if (omap_obj->flags & OMAP_BO_TILED) + if (omap_obj->flags & OMAP_BO_TILED_MASK) ret = tiler_stride(gem2fmt(omap_obj->flags), orient); return ret; } @@ -1046,7 +1046,7 @@ void omap_gem_describe(struct drm_gem_object *obj, struct seq_file *m) refcount_read(&omap_obj->dma_addr_cnt), omap_obj->vaddr, omap_obj->roll); - if (omap_obj->flags & OMAP_BO_TILED) { + if (omap_obj->flags & OMAP_BO_TILED_MASK) { seq_printf(m, " %dx%d", omap_obj->width, omap_obj->height); if (omap_obj->block) { struct tcm_area *area = &omap_obj->block->area; @@ -1145,7 +1145,7 @@ struct drm_gem_object *omap_gem_new(struct drm_device *dev, int ret; /* Validate the flags and compute the memory and cache flags. */ - if (flags & OMAP_BO_TILED) { + if (flags & OMAP_BO_TILED_MASK) { if (!priv->usergart) { dev_err(dev->dev, "Tiled buffers require DMM\n"); return NULL; @@ -1187,7 +1187,7 @@ struct drm_gem_object *omap_gem_new(struct drm_device *dev, omap_obj->flags = flags; mutex_init(&omap_obj->lock); - if (flags & OMAP_BO_TILED) { + if (flags & OMAP_BO_TILED_MASK) { /* * For tiled buffers align dimensions to slot boundaries and * calculate size based on aligned dimensions. diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c index e8c3ae7ac77e..7344bb61936c 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c +++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c @@ -67,7 +67,7 @@ static int omap_gem_dmabuf_begin_cpu_access(struct dma_buf *buffer, { struct drm_gem_object *obj = buffer->priv; struct page **pages; - if (omap_gem_flags(obj) & OMAP_BO_TILED) { + if (omap_gem_flags(obj) & OMAP_BO_TILED_MASK) { /* TODO we would need to pin at least part of the buffer to * get de-tiled view. For now just reject it. */ diff --git a/include/uapi/drm/omap_drm.h b/include/uapi/drm/omap_drm.h index d8ee2f840697..5a142fad473c 100644 --- a/include/uapi/drm/omap_drm.h +++ b/include/uapi/drm/omap_drm.h @@ -52,7 +52,6 @@ struct drm_omap_param { #define OMAP_BO_TILED_16 0x00000200 #define OMAP_BO_TILED_32 0x00000300 #define OMAP_BO_TILED_MASK 0x00000f00 -#define OMAP_BO_TILED (OMAP_BO_TILED_8 | OMAP_BO_TILED_16 | OMAP_BO_TILED_32) union omap_gem_size { __u32 bytes; /* (for non-tiled formats) */ -- cgit v1.2.3 From 23b482252836ab3c5e6b3b20ed3038449cbc7679 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Thu, 10 Oct 2019 14:00:00 +0200 Subject: drm/omap: add OMAP_BO flags to affect buffer allocation On SoCs with DMM/TILER, we have two ways to allocate buffers: normal dma_alloc or via DMM (which basically functions as an IOMMU). DMM can map 128MB at a time, and we only map the DMM buffers when they are used (i.e. not at alloc time). If DMM is present, omapdrm always uses DMM. There are use cases that require lots of big buffers that are being used at the same time by different IPs. At the moment the userspace has a hard maximum of 128MB. This patch adds three new flags that can be used by the userspace to solve the situation: OMAP_BO_MEM_CONTIG: The driver will use dma_alloc to get the memory. This can be used to avoid DMM if the userspace knows it needs more than 128M of memory at the same time. OMAP_BO_MEM_DMM: The driver will use DMM to get the memory. There's not much use for this flag at the moment, as on platforms with DMM it is used by default, but it's here for completeness. OMAP_BO_MEM_PIN: The driver will pin the memory at alloc time, and keep it pinned. This can be used to 1) get an error at alloc time if DMM space is full, and 2) get rid of the constant pin/unpin operations which may have some effect on performance. If none of the flags are given, the behavior is the same as currently. Signed-off-by: Tomi Valkeinen Reviewed-by: Jean-Jacques Hiblot Link: https://patchwork.freedesktop.org/patch/msgid/20191010120000.1421-9-jjhiblot@ti.com --- drivers/gpu/drm/omapdrm/omap_gem.c | 54 ++++++++++++++++++++++++++++++++++++-- include/uapi/drm/omap_drm.h | 9 +++++++ 2 files changed, 61 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index e518d93ca6df..bf18dfe2b689 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -1097,6 +1097,9 @@ void omap_gem_free_object(struct drm_gem_object *obj) list_del(&omap_obj->mm_list); mutex_unlock(&priv->list_lock); + if (omap_obj->flags & OMAP_BO_MEM_PIN) + omap_gem_unpin_locked(obj); + /* * We own the sole reference to the object at this point, but to keep * lockdep happy, we must still take the omap_obj_lock to call @@ -1147,10 +1150,19 @@ static bool omap_gem_validate_flags(struct drm_device *dev, u32 flags) return false; } + if ((flags & OMAP_BO_MEM_CONTIG) && (flags & OMAP_BO_MEM_DMM)) + return false; + + if ((flags & OMAP_BO_MEM_DMM) && !priv->usergart) + return false; + if (flags & OMAP_BO_TILED_MASK) { if (!priv->usergart) return false; + if (flags & OMAP_BO_MEM_CONTIG) + return false; + switch (flags & OMAP_BO_TILED_MASK) { case OMAP_BO_TILED_8: case OMAP_BO_TILED_16: @@ -1165,7 +1177,34 @@ static bool omap_gem_validate_flags(struct drm_device *dev, u32 flags) return true; } -/* GEM buffer object constructor */ +/** + * omap_gem_new() - Create a new GEM buffer + * @dev: The DRM device + * @gsize: The requested size for the GEM buffer. If the buffer is tiled + * (2D buffer), the size is a pair of values: height and width + * expressed in pixels. If the buffers is not tiled, it is expressed + * in bytes. + * @flags: Flags give additionnal information about the allocation: + * OMAP_BO_TILED_x: use the TILER (2D buffers). The TILER container + * unit can be 8, 16 or 32 bits. Cache is always disabled for + * tiled buffers. + * OMAP_BO_SCANOUT: Scannout buffer, consummable by the DSS + * OMAP_BO_CACHED: Buffer CPU caching mode: cached + * OMAP_BO_WC: Buffer CPU caching mode: write-combined + * OMAP_BO_UNCACHED: Buffer CPU caching mode: uncached + * OMAP_BO_MEM_CONTIG: The driver will use dma_alloc to get the memory. + * This can be used to avoid DMM if the userspace knows it needs + * more than 128M of memory at the same time. + * OMAP_BO_MEM_DMM: The driver will use DMM to get the memory. There's + * not much use for this flag at the moment, as on platforms with + * DMM it is used by default, but it's here for completeness. + * OMAP_BO_MEM_PIN: The driver will pin the memory at alloc time, and + * keep it pinned. This can be used to 1) get an error at alloc + * time if DMM space is full, and 2) get rid of the constant + * pin/unpin operations which may have some effect on performance. + * + * Return: The GEM buffer or NULL if the allocation failed + */ struct drm_gem_object *omap_gem_new(struct drm_device *dev, union omap_gem_size gsize, u32 flags) { @@ -1193,7 +1232,8 @@ struct drm_gem_object *omap_gem_new(struct drm_device *dev, */ flags &= ~(OMAP_BO_CACHED|OMAP_BO_WC|OMAP_BO_UNCACHED); flags |= tiler_get_cpu_cache_flags(); - } else if ((flags & OMAP_BO_SCANOUT) && !priv->has_dmm) { + } else if ((flags & OMAP_BO_MEM_CONTIG) || + ((flags & OMAP_BO_SCANOUT) && !priv->has_dmm)) { /* * If we don't have DMM, we must allocate scanout buffers * from contiguous DMA memory. @@ -1253,12 +1293,22 @@ struct drm_gem_object *omap_gem_new(struct drm_device *dev, goto err_release; } + if (flags & OMAP_BO_MEM_PIN) { + ret = omap_gem_pin(obj, NULL); + if (ret) + goto err_free_dma; + } + mutex_lock(&priv->list_lock); list_add(&omap_obj->mm_list, &priv->obj_list); mutex_unlock(&priv->list_lock); return obj; +err_free_dma: + if (flags & OMAP_BO_MEM_DMA_API) + dma_free_wc(dev->dev, size, omap_obj->vaddr, + omap_obj->dma_addr); err_release: drm_gem_object_release(obj); err_free: diff --git a/include/uapi/drm/omap_drm.h b/include/uapi/drm/omap_drm.h index 5a142fad473c..842d3180a442 100644 --- a/include/uapi/drm/omap_drm.h +++ b/include/uapi/drm/omap_drm.h @@ -47,6 +47,15 @@ struct drm_omap_param { #define OMAP_BO_UNCACHED 0x00000004 #define OMAP_BO_CACHE_MASK 0x00000006 +/* Force allocation from contiguous DMA memory */ +#define OMAP_BO_MEM_CONTIG 0x00000008 + +/* Force allocation via DMM */ +#define OMAP_BO_MEM_DMM 0x00000010 + +/* Pin the buffer when allocating and keep pinned */ +#define OMAP_BO_MEM_PIN 0x00000020 + /* Use TILER for the buffer. The TILER container unit can be 8, 16 or 32 bits. */ #define OMAP_BO_TILED_8 0x00000100 #define OMAP_BO_TILED_16 0x00000200 -- cgit v1.2.3 From 294f69e662d1570703e9b56e95be37a9fd3afba5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 5 Oct 2019 09:46:42 -0700 Subject: compiler_attributes.h: Add 'fallthrough' pseudo keyword for switch/case use Reserve the pseudo keyword 'fallthrough' for the ability to convert the various case block /* fallthrough */ style comments to appear to be an actual reserved word with the same gcc case block missing fallthrough warning capability. All switch/case blocks now should end in one of: break; fallthrough; goto