From 2c57a0e233d72f8c2e2404560dcf0188ac3cf5d7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 16 Apr 2015 12:43:13 -0700 Subject: lib: find_*_bit reimplementation This patchset does rework to find_bit function family to achieve better performance, and decrease size of text. All rework is done in patch 1. Patches 2 and 3 are about code moving and renaming. It was boot-tested on x86_64 and MIPS (big-endian) machines. Performance tests were ran on userspace with code like this: /* addr[] is filled from /dev/urandom */ start = clock(); while (ret < nbits) ret = find_next_bit(addr, nbits, ret + 1); end = clock(); printf("%ld\t", (unsigned long) end - start); On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz measurements are: (for find_next_bit, nbits is 8M, for find_first_bit - 80K) find_next_bit: find_first_bit: new current new current 26932 43151 14777 14925 26947 43182 14521 15423 26507 43824 15053 14705 27329 43759 14473 14777 26895 43367 14847 15023 26990 43693 15103 15163 26775 43299 15067 15232 27282 42752 14544 15121 27504 43088 14644 14858 26761 43856 14699 15193 26692 43075 14781 14681 27137 42969 14451 15061 ... ... find_next_bit performance gain is 35-40%; find_first_bit - no measurable difference. On ARM machine, there is arch-specific implementation for find_bit. Thanks a lot to George Spelvin and Rasmus Villemoes for hints and helpful discussions. This patch (of 3): New implementations takes less space in source file (see diffstat) and in object. For me it's 710 vs 453 bytes of text. It also shows better performance. find_last_bit description fixed due to obvious typo. [akpm@linux-foundation.org: include linux/bitmap.h, per Rasmus] Signed-off-by: Yury Norov Reviewed-by: Rasmus Villemoes Reviewed-by: George Spelvin Cc: Alexey Klimov Cc: David S. Miller Cc: Daniel Borkmann Cc: Hannes Frederic Sowa Cc: Lai Jiangshan Cc: Mark Salter Cc: AKASHI Takahiro Cc: Thomas Graf Cc: Valentin Rothberg Cc: Chris Wilson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 5d858e02997f..297f5bda4fdf 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -218,9 +218,9 @@ static inline unsigned long __ffs64(u64 word) /** * find_last_bit - find the last set bit in a memory region * @addr: The address to start the search at - * @size: The maximum size to search + * @size: The number of bits to search * - * Returns the bit number of the first set bit, or size. + * Returns the bit number of the last set bit, or size. */ extern unsigned long find_last_bit(const unsigned long *addr, unsigned long size); -- cgit v1.2.3 From 95d119528b0b8440a63bc13904e9873fc3a70503 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 16 Apr 2015 12:43:28 -0700 Subject: util_macros.h: add find_closest() macro This series unduplicates the code used to find the member in an array closest to 'x'. The first patch adds a macro implementing the algorithm in two flavors - for arrays sorted in ascending and descending order. The second updates Documentation/CodingStyle on the naming convention for local variables in macros resembling functions. Other three patches replace duplicated code with calls to one of these macros in some hwmon drivers. This patch (of 5): Searching for the member of an array closest to 'x' is duplicated in several places. Add a new include - util_macros.h - and two macros that implement this algorithm for arrays sorted both in ascending and descending order. Uses linear search. Signed-off-by: Bartosz Golaszewski Cc: Guenter Roeck Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/util_macros.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 include/linux/util_macros.h (limited to 'include') diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h new file mode 100644 index 000000000000..d5f4fb69dba3 --- /dev/null +++ b/include/linux/util_macros.h @@ -0,0 +1,40 @@ +#ifndef _LINUX_HELPER_MACROS_H_ +#define _LINUX_HELPER_MACROS_H_ + +#define __find_closest(x, a, as, op) \ +({ \ + typeof(as) __fc_i, __fc_as = (as) - 1; \ + typeof(x) __fc_x = (x); \ + typeof(*a) *__fc_a = (a); \ + for (__fc_i = 0; __fc_i < __fc_as; __fc_i++) { \ + if (__fc_x op DIV_ROUND_CLOSEST(__fc_a[__fc_i] + \ + __fc_a[__fc_i + 1], 2)) \ + break; \ + } \ + (__fc_i); \ +}) + +/** + * find_closest - locate the closest element in a sorted array + * @x: The reference value. + * @a: The array in which to look for the closest element. Must be sorted + * in ascending order. + * @as: Size of 'a'. + * + * Returns the index of the element closest to 'x'. + */ +#define find_closest(x, a, as) __find_closest(x, a, as, <=) + +/** + * find_closest_descending - locate the closest element in a sorted array + * @x: The reference value. + * @a: The array in which to look for the closest element. Must be sorted + * in descending order. + * @as: Size of 'a'. + * + * Similar to find_closest() but 'a' is expected to be sorted in descending + * order. + */ +#define find_closest_descending(x, a, as) __find_closest(x, a, as, >=) + +#endif -- cgit v1.2.3 From f766093ecb647f5a87bfa456715abbbccee547ce Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Thu, 16 Apr 2015 12:43:45 -0700 Subject: kernel.h: implement DIV_ROUND_CLOSEST_ULL We have grown a number of different implementations of DIV_ROUND_CLOSEST_ULL throughout the kernel. Move the i915 one to kernel.h so that it can be reused. Signed-off-by: Javi Merino Reviewed-by: Jeff Epler Cc: Jani Nikula Cc: David Airlie Cc: Guenter Roeck Acked-by: Daniel Vetter Cc: "Rafael J. Wysocki" Cc: Alex Elder Cc: Antti Palosaari Cc: Javi Merino Cc: Mauro Carvalho Chehab Cc: Mel Gorman Cc: Mike Turquette Cc: Stephen Boyd Cc: Stephen Hemminger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/intel_drv.h | 3 --- drivers/gpu/drm/i915/intel_panel.c | 1 + include/linux/kernel.h | 12 ++++++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index eef79ccd0b7c..ba243db35840 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -36,9 +36,6 @@ #include #include -#define DIV_ROUND_CLOSEST_ULL(ll, d) \ -({ unsigned long long _tmp = (ll)+(d)/2; do_div(_tmp, d); _tmp; }) - /** * _wait_for - magic (register) wait macro * diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c index d8686ce89160..08532d4ffe0a 100644 --- a/drivers/gpu/drm/i915/intel_panel.c +++ b/drivers/gpu/drm/i915/intel_panel.c @@ -30,6 +30,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include "intel_drv.h" diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d6d630d31ef3..3a5b48e52a9e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -103,6 +103,18 @@ (((__x) - ((__d) / 2)) / (__d)); \ } \ ) +/* + * Same as above but for u64 dividends. divisor must be a 32-bit + * number. + */ +#define DIV_ROUND_CLOSEST_ULL(x, divisor)( \ +{ \ + typeof(divisor) __d = divisor; \ + unsigned long long _tmp = (x) + (__d) / 2; \ + do_div(_tmp, __d); \ + _tmp; \ +} \ +) /* * Multiplies an integer by a fraction, while avoiding unnecessary -- cgit v1.2.3 From 2afe27c718b669b551895595873611ac39cc31e3 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 16 Apr 2015 12:44:00 -0700 Subject: lib/bitmap.c: bitmap_[empty,full]: remove code duplication bitmap_empty() has its own implementation. But it's clearly as simple as: find_first_bit(src, nbits) == nbits The same is true for 'bitmap_full'. Signed-off-by: Yury Norov Cc: George Spelvin Cc: Alexey Klimov Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 8 ++++---- lib/bitmap.c | 30 ------------------------------ 2 files changed, 4 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index be4fa5ddf36c..ea17cca9e685 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -283,16 +283,16 @@ static inline int bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); - else - return __bitmap_empty(src, nbits); + + return find_first_bit(src, nbits) == nbits; } static inline int bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); - else - return __bitmap_full(src, nbits); + + return find_first_zero_bit(src, nbits) == nbits; } static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) diff --git a/lib/bitmap.c b/lib/bitmap.c index d456f4c15a9f..64c0926f5dd8 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -42,36 +42,6 @@ * for the best explanations of this ordering. */ -int __bitmap_empty(const unsigned long *bitmap, unsigned int bits) -{ - unsigned int k, lim = bits/BITS_PER_LONG; - for (k = 0; k < lim; ++k) - if (bitmap[k]) - return 0; - - if (bits % BITS_PER_LONG) - if (bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) - return 0; - - return 1; -} -EXPORT_SYMBOL(__bitmap_empty); - -int __bitmap_full(const unsigned long *bitmap, unsigned int bits) -{ - unsigned int k, lim = bits/BITS_PER_LONG; - for (k = 0; k < lim; ++k) - if (~bitmap[k]) - return 0; - - if (bits % BITS_PER_LONG) - if (~bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) - return 0; - - return 1; -} -EXPORT_SYMBOL(__bitmap_full); - int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { -- cgit v1.2.3 From e15f431fe2d53cd4673510736da7d4fa1090e096 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 16 Apr 2015 12:44:47 -0700 Subject: errno.h: Improve ENOSYS's comment ENOSYS is the mechanism used by user code to detect whether the running kernel implements a given system call. It should not be returned by anything except an unimplemented system call. Unfortunately, it is rather frequently used in the kernel to indicate that various new functions of existing system calls are not implemented. This should be discouraged. Improve the comment in errno.h to help clarify ENOSYS's purpose. Signed-off-by: Andy Lutomirski Cc: Pavel Machek Cc: Michael Kerrisk Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/errno.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/asm-generic/errno.h b/include/uapi/asm-generic/errno.h index 1e1ea6e6e7a5..88e0914cf2d9 100644 --- a/include/uapi/asm-generic/errno.h +++ b/include/uapi/asm-generic/errno.h @@ -6,7 +6,16 @@ #define EDEADLK 35 /* Resource deadlock would occur */ #define ENAMETOOLONG 36 /* File name too long */ #define ENOLCK 37 /* No record locks available */ -#define ENOSYS 38 /* Function not implemented */ + +/* + * This error code is special: arch syscall entry code will return + * -ENOSYS if users try to call a syscall that doesn't exist. To keep + * failures of syscalls that really do exist distinguishable from + * failures due to attempts to use a nonexistent syscall, syscall + * implementations should refrain from returning -ENOSYS. + */ +#define ENOSYS 38 /* Invalid system call number */ + #define ENOTEMPTY 39 /* Directory not empty */ #define ELOOP 40 /* Too many symbolic links encountered */ #define EWOULDBLOCK EAGAIN /* Operation would block */ -- cgit v1.2.3 From 5281f94ae7f54d2d1a48dbc10223fd12d2aea716 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 16 Apr 2015 12:45:45 -0700 Subject: drivers/rtc/rtc-s5m.c: add support for S2MPS13 RTC The S2MPS13 RTC is almost the same as S2MPS14. The differences when updating alarm are: 1. Set WUDR+AUDR field instead of WUDR+RUDR. 2. Clear the AUDR field later (it is not auto-cleared). Signed-off-by: Krzysztof Kozlowski Cc: Alexandre Belloni Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-s5m.c | 20 +++++++++++++++++++- include/linux/mfd/samsung/rtc.h | 2 ++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/rtc/rtc-s5m.c b/drivers/rtc/rtc-s5m.c index 4008b84246ca..f0cb0ecb3d8d 100644 --- a/drivers/rtc/rtc-s5m.c +++ b/drivers/rtc/rtc-s5m.c @@ -187,6 +187,7 @@ static inline int s5m_check_peding_alarm_interrupt(struct s5m_rtc_info *info, val &= S5M_ALARM0_STATUS; break; case S2MPS14X: + case S2MPS13X: ret = regmap_read(info->s5m87xx->regmap_pmic, S2MPS14_REG_ST2, &val); val &= S2MPS_ALARM0_STATUS; @@ -252,6 +253,9 @@ static inline int s5m8767_rtc_set_alarm_reg(struct s5m_rtc_info *info) case S2MPS14X: data |= S2MPS_RTC_RUDR_MASK; break; + case S2MPS13X: + data |= S2MPS13_RTC_AUDR_MASK; + break; default: return -EINVAL; } @@ -265,6 +269,11 @@ static inline int s5m8767_rtc_set_alarm_reg(struct s5m_rtc_info *info) ret = s5m8767_wait_for_udr_update(info); + /* On S2MPS13 the AUDR is not auto-cleared */ + if (info->device_type == S2MPS13X) + regmap_update_bits(info->regmap, info->regs->rtc_udr_update, + S2MPS13_RTC_AUDR_MASK, 0); + return ret; } @@ -306,7 +315,7 @@ static int s5m_rtc_read_time(struct device *dev, struct rtc_time *tm) u8 data[info->regs->regs_count]; int ret; - if (info->device_type == S2MPS14X) { + if (info->device_type == S2MPS14X || info->device_type == S2MPS13X) { ret = regmap_update_bits(info->regmap, info->regs->rtc_udr_update, S2MPS_RTC_RUDR_MASK, S2MPS_RTC_RUDR_MASK); @@ -329,6 +338,7 @@ static int s5m_rtc_read_time(struct device *dev, struct rtc_time *tm) case S5M8767X: case S2MPS14X: + case S2MPS13X: s5m8767_data_to_tm(data, tm, info->rtc_24hr_mode); break; @@ -355,6 +365,7 @@ static int s5m_rtc_set_time(struct device *dev, struct rtc_time *tm) break; case S5M8767X: case S2MPS14X: + case S2MPS13X: ret = s5m8767_tm_to_data(tm, data); break; default: @@ -402,6 +413,7 @@ static int s5m_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) case S5M8767X: case S2MPS14X: + case S2MPS13X: s5m8767_data_to_tm(data, &alrm->time, info->rtc_24hr_mode); alrm->enabled = 0; for (i = 0; i < info->regs->regs_count; i++) { @@ -450,6 +462,7 @@ static int s5m_rtc_stop_alarm(struct s5m_rtc_info *info) case S5M8767X: case S2MPS14X: + case S2MPS13X: for (i = 0; i < info->regs->regs_count; i++) data[i] &= ~ALARM_ENABLE_MASK; @@ -494,6 +507,7 @@ static int s5m_rtc_start_alarm(struct s5m_rtc_info *info) case S5M8767X: case S2MPS14X: + case S2MPS13X: data[RTC_SEC] |= ALARM_ENABLE_MASK; data[RTC_MIN] |= ALARM_ENABLE_MASK; data[RTC_HOUR] |= ALARM_ENABLE_MASK; @@ -533,6 +547,7 @@ static int s5m_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) case S5M8767X: case S2MPS14X: + case S2MPS13X: s5m8767_tm_to_data(&alrm->time, data); break; @@ -615,6 +630,7 @@ static int s5m8767_rtc_init_reg(struct s5m_rtc_info *info) break; case S2MPS14X: + case S2MPS13X: data[0] = (0 << BCD_EN_SHIFT) | (1 << MODEL24_SHIFT); ret = regmap_write(info->regmap, info->regs->ctrl, data[0]); break; @@ -652,6 +668,7 @@ static int s5m_rtc_probe(struct platform_device *pdev) switch (pdata->device_type) { case S2MPS14X: + case S2MPS13X: regmap_cfg = &s2mps14_rtc_regmap_config; info->regs = &s2mps_rtc_regs; alarm_irq = S2MPS14_IRQ_RTCA0; @@ -772,6 +789,7 @@ static SIMPLE_DEV_PM_OPS(s5m_rtc_pm_ops, s5m_rtc_suspend, s5m_rtc_resume); static const struct platform_device_id s5m_rtc_id[] = { { "s5m-rtc", S5M8767X }, + { "s2mps13-rtc", S2MPS13X }, { "s2mps14-rtc", S2MPS14X }, { }, }; diff --git a/include/linux/mfd/samsung/rtc.h b/include/linux/mfd/samsung/rtc.h index b6401e7661c7..29c30ac36020 100644 --- a/include/linux/mfd/samsung/rtc.h +++ b/include/linux/mfd/samsung/rtc.h @@ -105,6 +105,8 @@ enum s2mps_rtc_reg { #define S5M_RTC_UDR_MASK (1 << S5M_RTC_UDR_SHIFT) #define S2MPS_RTC_WUDR_SHIFT 4 #define S2MPS_RTC_WUDR_MASK (1 << S2MPS_RTC_WUDR_SHIFT) +#define S2MPS13_RTC_AUDR_SHIFT 1 +#define S2MPS13_RTC_AUDR_MASK (1 << S2MPS13_RTC_AUDR_SHIFT) #define S2MPS_RTC_RUDR_SHIFT 0 #define S2MPS_RTC_RUDR_MASK (1 << S2MPS_RTC_RUDR_SHIFT) #define RTC_TCON_SHIFT 1 -- cgit v1.2.3 From 16db3d3f1170fb0efca652c9378ce7c5f5cb4232 Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Thu, 16 Apr 2015 12:47:50 -0700 Subject: kernel/sysctl.c: threads-max observe limits Users can change the maximum number of threads by writing to /proc/sys/kernel/threads-max. With the patch the value entered is checked against the same limits that apply when fork_init is called. Signed-off-by: Heinrich Schuchardt Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 3 +++ kernel/fork.c | 31 +++++++++++++++++++++++++++++-- kernel/sysctl.c | 6 ++---- 3 files changed, 34 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b7361f831226..795d5fea5697 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -212,4 +212,7 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, #endif /* CONFIG_SYSCTL */ +int sysctl_max_threads(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + #endif /* _LINUX_SYSCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index c7f2e1a4187a..8807a129711b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -266,7 +267,7 @@ void __init __weak arch_task_cache_init(void) { } /* * set_max_threads */ -static void set_max_threads(void) +static void set_max_threads(unsigned int max_threads_suggested) { u64 threads; @@ -280,6 +281,9 @@ static void set_max_threads(void) threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, (u64) THREAD_SIZE * 8UL); + if (threads > max_threads_suggested) + threads = max_threads_suggested; + max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } @@ -298,7 +302,7 @@ void __init fork_init(void) /* do the arch specific task caches init */ arch_task_cache_init(); - set_max_threads(); + set_max_threads(MAX_THREADS); init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; @@ -2020,3 +2024,26 @@ int unshare_files(struct files_struct **displaced) task_unlock(task); return 0; } + +int sysctl_max_threads(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int ret; + int threads = max_threads; + int min = MIN_THREADS; + int max = MAX_THREADS; + + t = *table; + t.data = &threads; + t.extra1 = &min; + t.extra2 = &max; + + ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + set_max_threads(threads); + + return 0; +} diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 42b7fc2860c1..3c0998426b57 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -93,11 +93,9 @@ #include #endif - #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int max_threads; extern int suid_dumpable; #ifdef CONFIG_COREDUMP extern int core_uses_pid; @@ -710,10 +708,10 @@ static struct ctl_table kern_table[] = { #endif { .procname = "threads-max", - .data = &max_threads, + .data = NULL, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysctl_max_threads, }, { .procname = "random", -- cgit v1.2.3 From 90f31d0ea88880f780574f3d0bb1a227c4c66ca3 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 16 Apr 2015 12:47:56 -0700 Subject: mm: rcu-protected get_mm_exe_file() This patch removes mm->mmap_sem from mm->exe_file read side. Also it kills dup_mm_exe_file() and moves exe_file duplication into dup_mmap() where both mmap_sems are locked. [akpm@linux-foundation.org: fix comment typo] Signed-off-by: Konstantin Khlebnikov Cc: Davidlohr Bueso Cc: Al Viro Cc: Oleg Nesterov Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 3 +-- include/linux/fs.h | 1 + include/linux/mm_types.h | 2 +- kernel/fork.c | 56 ++++++++++++++++++++++++++++++++---------------- 4 files changed, 40 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/fs/file.c b/fs/file.c index ee738ea028fa..93c5f89c248b 100644 --- a/fs/file.c +++ b/fs/file.c @@ -638,8 +638,7 @@ static struct file *__fget(unsigned int fd, fmode_t mask) file = fcheck_files(files, fd); if (file) { /* File object ref couldn't be taken */ - if ((file->f_mode & mask) || - !atomic_long_inc_not_zero(&file->f_count)) + if ((file->f_mode & mask) || !get_file_rcu(file)) file = NULL; } rcu_read_unlock(); diff --git a/include/linux/fs.h b/include/linux/fs.h index f4fc60727b8d..6bf7ab7c1573 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -870,6 +870,7 @@ static inline struct file *get_file(struct file *f) atomic_long_inc(&f->f_count); return f; } +#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count) #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1) #define file_count(x) atomic_long_read(&(x)->f_count) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 590630eb59ba..8d37e26a1007 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -429,7 +429,7 @@ struct mm_struct { #endif /* store ref to file /proc//exe symlink points to */ - struct file *exe_file; + struct file __rcu *exe_file; #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif diff --git a/kernel/fork.c b/kernel/fork.c index 8807a129711b..259202637531 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -403,6 +403,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + /* No ordering required: file already has been exposed. */ + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + mm->total_vm = oldmm->total_vm; mm->shared_vm = oldmm->shared_vm; mm->exec_vm = oldmm->exec_vm; @@ -528,7 +531,13 @@ static inline void mm_free_pgd(struct mm_struct *mm) pgd_free(mm, mm->pgd); } #else -#define dup_mmap(mm, oldmm) (0) +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + down_write(&oldmm->mmap_sem); + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + up_write(&oldmm->mmap_sem); + return 0; +} #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ @@ -697,35 +706,46 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +/** + * set_mm_exe_file - change a reference to the mm's executable file + * + * This changes mm's executable file (shown as symlink /proc/[pid]/exe). + * + * Main users are mmput(), sys_execve() and sys_prctl(PR_SET_MM_MAP/EXE_FILE). + * Callers prevent concurrent invocations: in mmput() nobody alive left, + * in execve task is single-threaded, prctl holds mmap_sem exclusively. + */ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { + struct file *old_exe_file = rcu_dereference_protected(mm->exe_file, + !atomic_read(&mm->mm_users) || current->in_execve || + lockdep_is_held(&mm->mmap_sem)); + if (new_exe_file) get_file(new_exe_file); - if (mm->exe_file) - fput(mm->exe_file); - mm->exe_file = new_exe_file; + rcu_assign_pointer(mm->exe_file, new_exe_file); + if (old_exe_file) + fput(old_exe_file); } +/** + * get_mm_exe_file - acquire a reference to the mm's executable file + * + * Returns %NULL if mm has no associated executable file. + * User must release file via fput(). + */ struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; - /* We need mmap_sem to protect against races with removal of exe_file */ - down_read(&mm->mmap_sem); - exe_file = mm->exe_file; - if (exe_file) - get_file(exe_file); - up_read(&mm->mmap_sem); + rcu_read_lock(); + exe_file = rcu_dereference(mm->exe_file); + if (exe_file && !get_file_rcu(exe_file)) + exe_file = NULL; + rcu_read_unlock(); return exe_file; } -static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) -{ - /* It's safe to write the exe_file pointer without exe_file_lock because - * this is called during fork when the task is not yet in /proc */ - newmm->exe_file = get_mm_exe_file(oldmm); -} - /** * get_task_mm - acquire a reference to the task's mm * @@ -887,8 +907,6 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) if (!mm_init(mm, tsk)) goto fail_nomem; - dup_mm_exe_file(oldmm, mm); - err = dup_mmap(mm, oldmm); if (err) goto free_pt; -- cgit v1.2.3 From 02d699f1f464410d5753a034dd38c76a1a1647e0 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Thu, 16 Apr 2015 12:48:38 -0700 Subject: include/linux/kconfig.h: ese macros which are already defined It is better to use macros which are already available because then there is just one location which needs to be change. [akpm@linux-foundation.org: move IS_ENABLED definition to after the IS_BUILTIN and IS_MODULE definitions] Signed-off-by: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kconfig.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index be342b94c640..63ca8dacec59 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -22,14 +22,6 @@ #define __config_enabled(arg1_or_junk) ___config_enabled(arg1_or_junk 1, 0) #define ___config_enabled(__ignored, val, ...) val -/* - * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm', - * 0 otherwise. - * - */ -#define IS_ENABLED(option) \ - (config_enabled(option) || config_enabled(option##_MODULE)) - /* * IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0 * otherwise. For boolean options, this is equivalent to @@ -43,4 +35,11 @@ */ #define IS_MODULE(option) config_enabled(option##_MODULE) +/* + * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm', + * 0 otherwise. + */ +#define IS_ENABLED(option) \ + (IS_BUILTIN(option) || IS_MODULE(option)) + #endif /* __LINUX_KCONFIG_H */ -- cgit v1.2.3 From ddaa27ee627fb58d1a68eb1eb196393d64c25d1b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 16 Apr 2015 12:48:44 -0700 Subject: seccomp: allow COMPAT sigreturn overrides Most architectures don't need to do much special for the strict-mode seccomp syscall entries. Remove the redundant headers and reduce the others. This patch (of 8): Some architectures may need to override the compat sigreturn definition, as is already possible in the non-compat case. Signed-off-by: Kees Cook Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Russell King Cc: Michal Simek Cc: Ralf Baechle Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "David S. Miller" Cc: Arnd Bergmann Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Will Deacon Cc: Daniel Borkmann Cc: Laura Abbott Cc: James Hogan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/seccomp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/asm-generic/seccomp.h b/include/asm-generic/seccomp.h index 9fa1f653ed3b..c9ccafa0d99a 100644 --- a/include/asm-generic/seccomp.h +++ b/include/asm-generic/seccomp.h @@ -17,7 +17,9 @@ #define __NR_seccomp_read_32 __NR_read #define __NR_seccomp_write_32 __NR_write #define __NR_seccomp_exit_32 __NR_exit +#ifndef __NR_seccomp_sigreturn_32 #define __NR_seccomp_sigreturn_32 __NR_rt_sigreturn +#endif #endif /* CONFIG_COMPAT && ! already defined */ #define __NR_seccomp_read __NR_read -- cgit v1.2.3 From 6c8c90319c0bb1c9e0b68e721359b89ae4f28465 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Thu, 16 Apr 2015 12:49:38 -0700 Subject: proc: show locks in /proc/pid/fdinfo/X Let's show locks which are associated with a file descriptor in its fdinfo file. Currently we don't have a reliable way to determine who holds a lock. We can find some information in /proc/locks, but PID which is reported there can be wrong. For example, a process takes a lock, then forks a child and dies. In this case /proc/locks contains the parent pid, which can be reused by another process. $ cat /proc/locks ... 6: FLOCK ADVISORY WRITE 324 00:13:13431 0 EOF ... $ ps -C rpcbind PID TTY TIME CMD 332 ? 00:00:00 rpcbind $ cat /proc/332/fdinfo/4 pos: 0 flags: 0100000 mnt_id: 22 lock: 1: FLOCK ADVISORY WRITE 324 00:13:13431 0 EOF $ ls -l /proc/332/fd/4 lr-x------ 1 root root 64 Mar 5 14:43 /proc/332/fd/4 -> /run/rpcbind.lock $ ls -l /proc/324/fd/ total 0 lrwx------ 1 root root 64 Feb 27 14:50 0 -> /dev/pts/0 lrwx------ 1 root root 64 Feb 27 14:50 1 -> /dev/pts/0 lrwx------ 1 root root 64 Feb 27 14:49 2 -> /dev/pts/0 You can see that the process with the 324 pid doesn't hold the lock. This information is required for proper dumping and restoring file locks. Signed-off-by: Andrey Vagin Cc: Jonathan Corbet Cc: Alexander Viro Acked-by: Jeff Layton Acked-by: "J. Bruce Fields" Acked-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++++ fs/locks.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/proc/fd.c | 27 +++++++++++++++++---------- include/linux/fs.h | 7 +++++++ 4 files changed, 66 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 6d370622fdfe..8e36c7e3c345 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1709,6 +1709,10 @@ A typical output is flags: 0100002 mnt_id: 19 +All locks associated with a file descriptor are shown in its fdinfo too. + +lock: 1: FLOCK ADVISORY WRITE 359 00:13:11691 0 EOF + The files such as eventfd, fsnotify, signalfd, epoll among the regular pos/flags pair provide additional information particular to the objects they represent. diff --git a/fs/locks.c b/fs/locks.c index 52b780fb5258..653faabb07f4 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2590,6 +2590,44 @@ static int locks_show(struct seq_file *f, void *v) return 0; } +static void __show_fd_locks(struct seq_file *f, + struct list_head *head, int *id, + struct file *filp, struct files_struct *files) +{ + struct file_lock *fl; + + list_for_each_entry(fl, head, fl_list) { + + if (filp != fl->fl_file) + continue; + if (fl->fl_owner != files && + fl->fl_owner != filp) + continue; + + (*id)++; + seq_puts(f, "lock:\t"); + lock_get_status(f, fl, *id, ""); + } +} + +void show_fd_locks(struct seq_file *f, + struct file *filp, struct files_struct *files) +{ + struct inode *inode = file_inode(filp); + struct file_lock_context *ctx; + int id = 0; + + ctx = inode->i_flctx; + if (!ctx) + return; + + spin_lock(&ctx->flc_lock); + __show_fd_locks(f, &ctx->flc_flock, &id, filp, files); + __show_fd_locks(f, &ctx->flc_posix, &id, filp, files); + __show_fd_locks(f, &ctx->flc_lease, &id, filp, files); + spin_unlock(&ctx->flc_lock); +} + static void *locks_start(struct seq_file *f, loff_t *pos) __acquires(&blocked_lock_lock) { diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 8e5ad83b629a..af84ad04df77 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -48,17 +49,23 @@ static int seq_show(struct seq_file *m, void *v) put_files_struct(files); } - if (!ret) { - seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", - (long long)file->f_pos, f_flags, - real_mount(file->f_path.mnt)->mnt_id); - if (file->f_op->show_fdinfo) - file->f_op->show_fdinfo(m, file); - ret = seq_has_overflowed(m); - fput(file); - } + if (ret) + return ret; - return ret; + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", + (long long)file->f_pos, f_flags, + real_mount(file->f_path.mnt)->mnt_id); + + show_fd_locks(m, file, files); + if (seq_has_overflowed(m)) + goto out; + + if (file->f_op->show_fdinfo) + file->f_op->show_fdinfo(m, file); + +out: + fput(file); + return 0; } static int seq_fdinfo_open(struct inode *inode, struct file *file) diff --git a/include/linux/fs.h b/include/linux/fs.h index 6bf7ab7c1573..c4e927358503 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1042,6 +1042,9 @@ extern void lease_get_mtime(struct inode *, struct timespec *time); extern int generic_setlease(struct file *, long, struct file_lock **, void **priv); extern int vfs_setlease(struct file *, long, struct file_lock **, void **); extern int lease_modify(struct file_lock *, int, struct list_head *); +struct files_struct; +extern void show_fd_locks(struct seq_file *f, + struct file *filp, struct files_struct *files); #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) @@ -1178,6 +1181,10 @@ static inline int lease_modify(struct file_lock *fl, int arg, { return -EINVAL; } + +struct files_struct; +static inline void show_fd_locks(struct seq_file *f, + struct file *filp, struct files_struct *files) {} #endif /* !CONFIG_FILE_LOCKING */ -- cgit v1.2.3