From 8c6e50b0290c4c708a3e6462729e1e9151a9a7df Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 7 Apr 2014 15:37:18 -0700 Subject: mm: introduce vm_ops->map_pages() Here's a new version of the faultaround patchset. It took a while to tune it and collect performance data. First patch adds new callback ->map_pages to vm_operations_struct. ->map_pages() is called when VM asks to map easy accessible pages. Filesystem should find and map pages associated with offsets from "pgoff" till "max_pgoff". ->map_pages() is called with page table locked and must not block. If it's not possible to reach a page without blocking, filesystem should skip it. Filesystem should use do_set_pte() to setup page table entry. Pointer to entry associated with offset "pgoff" is passed in "pte" field in vm_fault structure. Pointers to entries for other offsets should be calculated relative to "pte". Currently the VM uses ->map_pages only on the read page fault path. We try to map FAULT_AROUND_PAGES at a time. FAULT_AROUND_PAGES is 16 for now. Performance data for different FAULT_AROUND_ORDER is below. TODO: - implement ->map_pages() for shmem/tmpfs; - modify get_user_pages() to be able to use ->map_pages() and implement mmap(MAP_POPULATE|MAP_NONBLOCK) on top. ========================================================================= Tested on 4-socket machine (120 threads) with 128GiB of RAM. Few real-world workloads. The sweet spot for FAULT_AROUND_ORDER here is somewhere between 3 and 5. 
Let's say 4 :) Linux build (make -j60) FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 minor-faults 283,301,572 247,151,987 212,215,789 204,772,882 199,568,944 194,703,779 193,381,485 time, seconds 151.227629483 153.920996480 151.356125472 150.863792049 150.879207877 151.150764954 151.450962358 Linux rebuild (make -j60) FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 minor-faults 5,396,854 4,148,444 2,855,286 2,577,282 2,361,957 2,169,573 2,112,643 time, seconds 27.404543757 27.559725591 27.030057426 26.855045126 26.678618635 26.974523490 26.761320095 Git test suite (make -j60 test) FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 minor-faults 129,591,823 99,200,751 66,106,718 57,606,410 51,510,808 45,776,813 44,085,515 time, seconds 66.087215026 64.784546905 64.401156567 65.282708668 66.034016829 66.793780811 67.237810413 Two synthetic tests: access every word in file in sequential/random order. It doesn't improve much after FAULT_AROUND_ORDER == 4. Sequential access 16GiB file FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 1 thread minor-faults 4,195,437 2,098,275 525,068 262,251 131,170 32,856 8,282 time, seconds 7.250461742 6.461711074 5.493859139 5.488488147 5.707213983 5.898510832 5.109232856 8 threads minor-faults 33,557,540 16,892,728 4,515,848 2,366,999 1,423,382 442,732 142,339 time, seconds 16.649304881 9.312555263 6.612490639 6.394316732 6.669827501 6.75078944 6.371900528 32 threads minor-faults 134,228,222 67,526,810 17,725,386 9,716,537 4,763,731 1,668,921 537,200 time, seconds 49.164430543 29.712060103 12.938649729 10.175151004 11.840094583 9.594081325 9.928461797 60 threads minor-faults 251,687,988 126,146,952 32,919,406 18,208,804 10,458,947 2,733,907 928,217 time, seconds 86.260656897 49.626551828 22.335007632 17.608243696 16.523119035 16.339489186 16.326390902 120 threads minor-faults 503,352,863 252,939,677 67,039,168 35,191,827 19,170,091 4,688,357 1,471,862 time, seconds 124.589206333 79.757867787 39.508707872 32.167281632 29.972989292 28.729834575 28.042251622 Random access 
1GiB file 1 thread minor-faults 262,636 132,743 34,369 17,299 8,527 3,451 1,222 time, seconds 15.351890914 16.613802482 16.569227308 15.179220992 16.557356122 16.578247824 15.365266994 8 threads minor-faults 2,098,948 1,061,871 273,690 154,501 87,110 25,663 7,384 time, seconds 15.040026343 15.096933500 14.474757288 14.289129964 14.411537468 14.296316837 14.395635804 32 threads minor-faults 8,390,734 4,231,023 1,054,432 528,847 269,242 97,746 26,881 time, seconds 20.430433109 21.585235358 22.115062928 14.872878951 14.880856305 14.883370649 14.821261690 60 threads minor-faults 15,733,258 7,892,809 1,973,393 988,266 594,789 164,994 51,691 time, seconds 26.577302548 25.692397770 18.728863715 20.153026398 21.619101933 17.745086260 17.613215273 120 threads minor-faults 31,471,111 15,816,616 3,959,209 1,978,685 1,008,299 264,635 96,010 time, seconds 41.835322703 40.459786095 36.085306105 35.313894834 35.814445675 36.552633793 34.289210594 Touch only one page in page table in 16GiB file FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 1 thread minor-faults 8,372 8,324 8,270 8,260 8,249 8,239 8,237 time, seconds 0.039892712 0.045369149 0.051846126 0.063681685 0.079095975 0.17652406 0.541213386 8 threads minor-faults 65,731 65,681 65,628 65,620 65,608 65,599 65,596 time, seconds 0.124159196 0.488600638 0.156854426 0.191901957 0.242631486 0.543569456 1.677303984 32 threads minor-faults 262,388 262,341 262,285 262,276 262,266 262,257 263,183 time, seconds 0.452421421 0.488600638 0.565020946 0.648229739 0.789850823 1.651584361 5.000361559 60 threads minor-faults 491,822 491,792 491,723 491,711 491,701 491,691 491,825 time, seconds 0.763288616 0.869620515 0.980727360 1.161732354 1.466915814 3.04041448 9.308612938 120 threads minor-faults 983,466 983,655 983,366 983,372 983,363 984,083 984,164 time, seconds 1.595846553 1.667902182 2.008959376 2.425380942 2.941368804 5.977807890 18.401846125 This patch (of 2): Introduce new vm_ops callback ->map_pages() and uses it for mapping easy 
accessible pages around fault address. On read page fault, if filesystem provides ->map_pages(), we try to map up to FAULT_AROUND_PAGES pages around page fault address in hope to reduce number of minor page faults. We call ->map_pages first and use ->fault() as fallback if page by the offset is not ready to be mapped (cold page cache or something). Signed-off-by: Kirill A. Shutemov Acked-by: Linus Torvalds Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Alexander Viro Cc: Dave Chinner Cc: Ning Qu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'Documentation') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index f424e0e5b46b..efca5c1bbb10 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -529,6 +529,7 @@ locking rules: open: yes close: yes fault: yes can return with page locked +map_pages: yes page_mkwrite: yes can return with page locked access: yes @@ -540,6 +541,15 @@ the page, then ensure it is not already truncated (the page lock will block subsequent truncate), and then return with VM_FAULT_LOCKED, and the page locked. The VM will unlock the page. + ->map_pages() is called when VM asks to map easy accessible pages. +Filesystem should find and map pages associated with offsets from "pgoff" +till "max_pgoff". ->map_pages() is called with page table locked and must +not block. If it's not possible to reach a page without blocking, +filesystem should skip it. Filesystem should use do_set_pte() to setup +page table entry. Pointer to entry associated with offset "pgoff" is +passed in "pte" field in vm_fault structure. Pointers to entries for other +offsets should be calculated relative to "pte". + ->page_mkwrite() is called when a previously read-only pte is about to become writeable. 
The filesystem again must ensure that there are no truncate/invalidate races, and then return with the page locked. If -- cgit v1.2.3 From 539a13b47e462d28c48f076c63871580f694a366 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:32 -0700 Subject: res_counter: remove interface for locked charging and uncharging The res_counter_{charge,uncharge}_locked() variants are not used in the kernel outside of the resource counter code itself, so remove the interface. Signed-off-by: David Rientjes Acked-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/resource_counter.txt | 12 ++---------- include/linux/res_counter.h | 6 +----- kernel/res_counter.c | 23 ++++++++++++----------- 3 files changed, 15 insertions(+), 26 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt index 5108afb3645c..762ca54eb929 100644 --- a/Documentation/cgroups/resource_counter.txt +++ b/Documentation/cgroups/resource_counter.txt @@ -76,15 +76,7 @@ to work with it. limit_fail_at parameter is set to the particular res_counter element where the charging failed. - d. int res_counter_charge_locked - (struct res_counter *rc, unsigned long val, bool force) - - The same as res_counter_charge(), but it must not acquire/release the - res_counter->lock internally (it must be called with res_counter->lock - held). The force parameter indicates whether we can bypass the limit. - - e. u64 res_counter_uncharge[_locked] - (struct res_counter *rc, unsigned long val) + d. u64 res_counter_uncharge(struct res_counter *rc, unsigned long val) When a resource is released (freed) it should be de-accounted from the resource counter it was accounted to. 
This is called @@ -93,7 +85,7 @@ to work with it. The _locked routines imply that the res_counter->lock is taken. - f. u64 res_counter_uncharge_until + e. u64 res_counter_uncharge_until (struct res_counter *rc, struct res_counter *top, unsigned long val) diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 201a69749659..56b7bc32db4f 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -104,15 +104,13 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent); * units, e.g. numbers, bytes, Kbytes, etc * * returns 0 on success and <0 if the counter->usage will exceed the - * counter->limit _locked call expects the counter->lock to be taken + * counter->limit * * charge_nofail works the same, except that it charges the resource * counter unconditionally, and returns < 0 if the after the current * charge we are over limit. */ -int __must_check res_counter_charge_locked(struct res_counter *counter, - unsigned long val, bool force); int __must_check res_counter_charge(struct res_counter *counter, unsigned long val, struct res_counter **limit_fail_at); int res_counter_charge_nofail(struct res_counter *counter, @@ -125,12 +123,10 @@ int res_counter_charge_nofail(struct res_counter *counter, * @val: the amount of the resource * * these calls check for usage underflow and show a warning on the console - * _locked call expects the counter->lock to be taken * * returns the total charges still present in @counter. 
*/ -u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); u64 res_counter_uncharge(struct res_counter *counter, unsigned long val); u64 res_counter_uncharge_until(struct res_counter *counter, diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 4aa8a305aede..51dbac6a3633 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) counter->parent = parent; } -int res_counter_charge_locked(struct res_counter *counter, unsigned long val, - bool force) +static u64 res_counter_uncharge_locked(struct res_counter *counter, + unsigned long val) +{ + if (WARN_ON(counter->usage < val)) + val = counter->usage; + + counter->usage -= val; + return counter->usage; +} + +static int res_counter_charge_locked(struct res_counter *counter, + unsigned long val, bool force) { int ret = 0; @@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; - return counter->usage; -} - u64 res_counter_uncharge_until(struct res_counter *counter, struct res_counter *top, unsigned long val) -- cgit v1.2.3 From d715ae08f2ff87508a081c4df78061bf4f7211d6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 7 Apr 2014 15:37:46 -0700 Subject: memcg: rename high level charging functions mem_cgroup_newpage_charge is used only for charging anonymous memory so it is better to rename it to mem_cgroup_charge_anon. mem_cgroup_cache_charge is used for file backed memory so rename it to mem_cgroup_charge_file. 
Signed-off-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memcg_test.txt | 4 ++-- include/linux/memcontrol.h | 8 ++++---- mm/filemap.c | 2 +- mm/huge_memory.c | 8 ++++---- mm/memcontrol.c | 4 ++-- mm/memory.c | 6 +++--- mm/shmem.c | 6 +++--- 7 files changed, 19 insertions(+), 19 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index ce94a83a7d9a..80ac454704b8 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -24,7 +24,7 @@ Please note that implementation details can be changed. a page/swp_entry may be charged (usage += PAGE_SIZE) at - mem_cgroup_newpage_charge() + mem_cgroup_charge_anon() Called at new page fault and Copy-On-Write. mem_cgroup_try_charge_swapin() @@ -32,7 +32,7 @@ Please note that implementation details can be changed. Followed by charge-commit-cancel protocol. (With swap accounting) At commit, a charge recorded in swap_cgroup is removed. - mem_cgroup_cache_charge() + mem_cgroup_charge_file() Called at add_to_page_cache() mem_cgroup_cache_charge_swapin() diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 134636f835f7..96f3fc87ab96 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -65,7 +65,7 @@ struct mem_cgroup_reclaim_cookie { * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) 
*/ -extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, +extern int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); /* for swap handling */ extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, @@ -74,7 +74,7 @@ extern void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg); extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg); -extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, +extern int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); @@ -233,13 +233,13 @@ void mem_cgroup_print_bad_page(struct page *page); #else /* CONFIG_MEMCG */ struct mem_cgroup; -static inline int mem_cgroup_newpage_charge(struct page *page, +static inline int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return 0; } -static inline int mem_cgroup_cache_charge(struct page *page, +static inline int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return 0; diff --git a/mm/filemap.c b/mm/filemap.c index b952d99c827c..27ebc0c9571b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -563,7 +563,7 @@ static int __add_to_page_cache_locked(struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); if (error) return error; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a2f4981418fc..64635f5278ff 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + if 
(unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -968,7 +968,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_newpage_charge(pages[i], mm, + mem_cgroup_charge_anon(pages[i], mm, GFP_KERNEL))) { if (pages[i]) put_page(pages[i]); @@ -1101,7 +1101,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { put_page(new_page); if (page) { split_huge_page(page); @@ -2359,7 +2359,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) return; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 038b037f8d67..e33b1d09eb1f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3818,7 +3818,7 @@ out: return ret; } -int mem_cgroup_newpage_charge(struct page *page, +int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { unsigned int nr_pages = 1; @@ -3954,7 +3954,7 @@ void mem_cgroup_commit_charge_swapin(struct page *page, MEM_CGROUP_CHARGE_TYPE_ANON); } -int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, +int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; diff --git a/mm/memory.c b/mm/memory.c index 1b88da5c08b3..854e4027719f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2828,7 +2828,7 @@ gotten: } __SetPageUptodate(new_page); - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) goto oom_free_new; mmun_start = address & PAGE_MASK; @@ -3281,7 +3281,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct 
vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) + if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -3537,7 +3537,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) { + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { page_cache_release(new_page); return VM_FAULT_OOM; } diff --git a/mm/shmem.c b/mm/shmem.c index 70709347a1e2..70273f8df586 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -683,7 +683,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); + error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -1080,7 +1080,7 @@ repeat: goto failed; } - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1134,7 +1134,7 @@ repeat: SetPageSwapBacked(page); __set_page_locked(page); - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); if (error) goto decused; -- cgit v1.2.3 From e64cd51d2fa87733176246101df871a8ac5c7c20 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:07 -0700 Subject: zram: move zram size warning to documentation Move zram warning about disksize and size of memory correlation to zram documentation. 
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/blockdev/zram.txt | 5 +++++ drivers/block/zram/zram_drv.c | 15 --------------- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'Documentation') diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 2eccddffa6c8..393541be1ec0 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -33,6 +33,11 @@ Following shows a typical sequence of steps for using zram. echo 512M > /sys/block/zram0/disksize echo 1G > /sys/block/zram0/disksize +Note: +There is little point creating a zram of greater than twice the size of memory +since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the +size of the disk when not in use so a huge zram is wasteful. + 3) Activate: mkswap /dev/zram0 swapon /dev/zram0 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c06d7fba4237..21aee3edcb25 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -535,23 +535,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) static void zram_init_device(struct zram *zram, struct zram_meta *meta) { - if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) { - pr_info( - "There is little point creating a zram of greater than " - "twice the size of memory since we expect a 2:1 compression " - "ratio. 
Note that zram uses about 0.1%% of the size of " - "the disk when not in use so a huge zram is " - "wasteful.\n" - "\tMemory Size: %lu kB\n" - "\tSize you selected: %llu kB\n" - "Continuing anyway ...\n", - (totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10 - ); - } - /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - zram->meta = meta; pr_debug("Initialization done!\n"); } -- cgit v1.2.3 From 8dd1d3247e6c00b50ef83934ea8b22a1590015de Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:08 -0700 Subject: zram: document failed_reads, failed_writes stats Document `failed_reads' and `failed_writes' device attributes. Remove info about `discard' - there is no such zram attr. Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-block-zram | 24 +++++++++++++++--------- Documentation/blockdev/zram.txt | 3 ++- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 3f0b9ae61d8c..8aa046841625 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -43,6 +43,21 @@ Description: The invalid_io file is read-only and specifies the number of non-page-size-aligned I/O requests issued to this device. +What: /sys/block/zram/failed_reads +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The failed_reads file is read-only and specifies the number of + failed reads happened on this device. + + +What: /sys/block/zram/failed_writes +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The failed_writes file is read-only and specifies the number of + failed writes happened on this device. 
+ What: /sys/block/zram/notify_free Date: August 2010 Contact: Nitin Gupta @@ -53,15 +68,6 @@ Description: is freed. This statistic is applicable only when this disk is being used as a swap disk. -What: /sys/block/zram/discard -Date: August 2010 -Contact: Nitin Gupta -Description: - The discard file is read-only and specifies the number of - discard requests received by this device. These requests - provide information to block device regarding blocks which are - no longer used by filesystem. - What: /sys/block/zram/zero_pages Date: August 2010 Contact: Nitin Gupta diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 393541be1ec0..b31ac5e5d4b9 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -51,9 +51,10 @@ size of the disk when not in use so a huge zram is wasteful. disksize num_reads num_writes + failed_reads + failed_writes invalid_io notify_free - discard zero_pages orig_data_size compr_data_size -- cgit v1.2.3 From beca3ec71fe5490ee9237dc42400f50402baf83e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:14 -0700 Subject: zram: add multi stream functionality Existing zram (zcomp) implementation has only one compression stream (buffer and algorithm private part), so in order to prevent data corruption only one write (compress operation) can use this compression stream, forcing all concurrent write operations to wait for stream lock to be released. This patch changes zcomp to keep a compression streams list of user-defined size (via sysfs device attr). Each write operation still exclusively holds compression stream, the difference is that we can have N write operations (depending on size of streams list) executing in parallel. See TEST section later in commit message for performance data. Introduce struct zcomp_strm_multi and a set of functions to manage zcomp_strm stream access. 
zcomp_strm_multi has a list of idle zcomp_strm structs, spinlock to protect idle list and wait queue, making it possible to perform parallel compressions. The following set of functions is added: - zcomp_strm_multi_find()/zcomp_strm_multi_release() find and release a compression stream, implement required locking - zcomp_strm_multi_create()/zcomp_strm_multi_destroy() create and destroy zcomp_strm_multi zcomp ->strm_find() and ->strm_release() callbacks are set during initialisation to zcomp_strm_multi_find()/zcomp_strm_multi_release() correspondingly. Each time zcomp issues a zcomp_strm_multi_find() call, the following set of operations is performed: - spin lock strm_lock - if idle list is not empty, remove zcomp_strm from idle list, spin unlock and return zcomp stream pointer to caller - if idle list is empty, current adds itself to wait queue. It will be awakened by zcomp_strm_multi_release() caller. zcomp_strm_multi_release(): - spin lock strm_lock - add zcomp stream to idle list - spin unlock, wake up sleeper Minchan Kim reported that spinlock-based locking scheme has demonstrated a severe performance regression for single compression stream case, compared to the mutex-based one (see https://lkml.org/lkml/2014/2/18/16) base spinlock mutex ==Initial write ==Initial write ==Initial write records: 5 records: 5 records: 5 avg: 1642424.35 avg: 699610.40 avg: 1655583.71 std: 39890.95(2.43%) std: 232014.19(33.16%) std: 52293.96 max: 1690170.94 max: 1163473.45 max: 1697164.75 min: 1568669.52 min: 573429.88 min: 1553410.23 ==Rewrite ==Rewrite ==Rewrite records: 5 records: 5 records: 5 avg: 1611775.39 avg: 501406.64 avg: 1684419.11 std: 17144.58(1.06%) std: 15354.41(3.06%) std: 18367.42 max: 1641800.95 max: 531356.78 max: 1706445.84 min: 1593515.27 min: 488817.78 min: 1655335.73 When only one compression stream is available, mutex with spin on owner tends to perform much better than frequent wait_event()/wake_up(). This is why single stream is implemented as a special case with mutex locking. 
Introduce and document zram device attribute max_comp_streams. This attr shows and stores current zcomp's max number of zcomp streams (max_strm). Extend zcomp's zcomp_create() with `max_strm' parameter. `max_strm' limits the number of zcomp_strm structs in compression backend's idle list (max_comp_streams). max_comp_streams used during initialisation as follows: -- passing to zcomp_create() max_strm equals to 1 will initialise zcomp using single compression stream zcomp_strm_single (mutex-based locking). -- passing to zcomp_create() max_strm greater than 1 will initialise zcomp using multi compression stream zcomp_strm_multi (spinlock-based locking). default max_comp_streams value is 1, meaning that zram with single stream will be initialised. Later patch will introduce configuration knob to change max_comp_streams on already initialised and used zcomp. TEST iozone -t 3 -R -r 16K -s 60M -I +Z test base 1 strm (mutex) 3 strm (spinlock) ----------------------------------------------------------------------- Initial write 589286.78 583518.39 718011.05 Rewrite 604837.97 596776.38 1515125.72 Random write 584120.11 595714.58 1388850.25 Pwrite 535731.17 541117.38 739295.27 Fwrite 1418083.88 1478612.72 1484927.06 Usage example: set max_comp_streams to 4 echo 4 > /sys/block/zram0/max_comp_streams show current max_comp_streams (default value is 1). 
cat /sys/block/zram0/max_comp_streams Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-block-zram | 9 ++- Documentation/blockdev/zram.txt | 31 ++++++-- drivers/block/zram/zcomp.c | 124 ++++++++++++++++++++++++++++- drivers/block/zram/zcomp.h | 4 +- drivers/block/zram/zram_drv.c | 42 +++++++++- drivers/block/zram/zram_drv.h | 2 +- 6 files changed, 201 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 8aa046841625..0da9ed6b82ea 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -50,7 +50,6 @@ Description: The failed_reads file is read-only and specifies the number of failed reads happened on this device. - What: /sys/block/zram/failed_writes Date: February 2014 Contact: Sergey Senozhatsky @@ -58,6 +57,14 @@ Description: The failed_writes file is read-only and specifies the number of failed writes happened on this device. +What: /sys/block/zram/max_comp_streams +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The max_comp_streams file is read-write and specifies the + number of backend's zcomp_strm compression streams (number of + concurrent compress operations). + What: /sys/block/zram/notify_free Date: August 2010 Contact: Nitin Gupta diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index b31ac5e5d4b9..aadfe60391b7 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -21,7 +21,28 @@ Following shows a typical sequence of steps for using zram. This creates 4 devices: /dev/zram{0,1,2,3} (num_devices parameter is optional. 
Default: 1) -2) Set Disksize +2) Set max number of compression streams + Compression backend may use up to max_comp_streams compression streams, + thus allowing up to max_comp_streams concurrent compression operations. + By default, compression backend uses single compression stream. + + Examples: + #show max compression streams number + cat /sys/block/zram0/max_comp_streams + + #set max compression streams number to 3 + echo 3 > /sys/block/zram0/max_comp_streams + +Note: +In order to enable compression backend's multi stream support max_comp_streams +must be initially set to desired concurrency level before ZRAM device +initialisation. Once the device initialised as a single stream compression +backend (max_comp_streams equals to 1) changing the value of max_comp_streams +will not take any effect, because single stream compression backend implemented +as a special case and does not support dynamic max_comp_streams. Only multi +stream backend supports dynamic max_comp_streams adjustment. + +3) Set Disksize Set disk size by writing the value to sysfs node 'disksize'. The value can be either in bytes or you can use mem suffixes. Examples: @@ -38,14 +59,14 @@ There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the size of the disk when not in use so a huge zram is wasteful. -3) Activate: +4) Activate: mkswap /dev/zram0 swapon /dev/zram0 mkfs.ext4 /dev/zram1 mount /dev/zram1 /tmp -4) Stats: +5) Stats: Per-device statistics are exported as various nodes under /sys/block/zram/ disksize @@ -60,11 +81,11 @@ size of the disk when not in use so a huge zram is wasteful. 
compr_data_size mem_used_total -5) Deactivate: +6) Deactivate: swapoff /dev/zram0 umount /dev/zram1 -6) Reset: +7) Reset: Write any positive value to 'reset' sysfs node echo 1 > /sys/block/zram0/reset echo 1 > /sys/block/zram1/reset diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 72e8071f9d73..c06f75f54718 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -24,6 +24,21 @@ struct zcomp_strm_single { struct zcomp_strm *zstrm; }; +/* + * multi zcomp_strm backend + */ +struct zcomp_strm_multi { + /* protect strm list */ + spinlock_t strm_lock; + /* max possible number of zstrm streams */ + int max_strm; + /* number of available zstrm streams */ + int avail_strm; + /* list of available strms */ + struct list_head idle_strm; + wait_queue_head_t strm_wait; +}; + static struct zcomp_backend *find_backend(const char *compress) { if (strncmp(compress, "lzo", 3) == 0) @@ -62,6 +77,107 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) return zstrm; } +/* + * get idle zcomp_strm or wait until other process release + * (zcomp_strm_release()) one for us + */ +static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (1) { + spin_lock(&zs->strm_lock); + if (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + spin_unlock(&zs->strm_lock); + return zstrm; + } + /* zstrm streams limit reached, wait for idle stream */ + if (zs->avail_strm >= zs->max_strm) { + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + /* allocate new zstrm stream */ + zs->avail_strm++; + spin_unlock(&zs->strm_lock); + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + spin_lock(&zs->strm_lock); + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + 
break; + } + return zstrm; +} + +/* add stream back to idle list and wake up waiter or free the stream */ +static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + struct zcomp_strm_multi *zs = comp->stream; + + spin_lock(&zs->strm_lock); + if (zs->avail_strm <= zs->max_strm) { + list_add(&zstrm->list, &zs->idle_strm); + spin_unlock(&zs->strm_lock); + wake_up(&zs->strm_wait); + return; + } + + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + zcomp_strm_free(comp, zstrm); +} + +static void zcomp_strm_multi_destroy(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + } + kfree(zs); +} + +static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) +{ + struct zcomp_strm *zstrm; + struct zcomp_strm_multi *zs; + + comp->destroy = zcomp_strm_multi_destroy; + comp->strm_find = zcomp_strm_multi_find; + comp->strm_release = zcomp_strm_multi_release; + zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + spin_lock_init(&zs->strm_lock); + INIT_LIST_HEAD(&zs->idle_strm); + init_waitqueue_head(&zs->strm_wait); + zs->max_strm = max_strm; + zs->avail_strm = 1; + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + kfree(zs); + return -ENOMEM; + } + list_add(&zstrm->list, &zs->idle_strm); + return 0; +} + static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) { struct zcomp_strm_single *zs = comp->stream; @@ -139,7 +255,7 @@ void zcomp_destroy(struct zcomp *comp) * if requested algorithm is not supported or in case * of init error */ -struct zcomp *zcomp_create(const char *compress) +struct zcomp *zcomp_create(const char *compress, int max_strm) { struct zcomp *comp; struct zcomp_backend *backend; @@ -153,7 +269,11 @@ struct zcomp *zcomp_create(const char 
*compress) return NULL; comp->backend = backend; - if (zcomp_strm_single_create(comp) != 0) { + if (max_strm > 1) + zcomp_strm_multi_create(comp, max_strm); + else + zcomp_strm_single_create(comp); + if (!comp->stream) { kfree(comp); return NULL; } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index dc3500d842a3..2a3684446160 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -21,6 +21,8 @@ struct zcomp_strm { * working memory) */ void *private; + /* used in multi stream backend, protected by backend strm_lock */ + struct list_head list; }; /* static compression backend */ @@ -47,7 +49,7 @@ struct zcomp { void (*destroy)(struct zcomp *comp); }; -struct zcomp *zcomp_create(const char *comp); +struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 98823f9ca8b1..bdc7eb8c6df7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -108,6 +108,40 @@ static ssize_t mem_used_total_show(struct device *dev, return sprintf(buf, "%llu\n", val); } +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->max_comp_streams; + up_read(&zram->init_lock); + + return sprintf(buf, "%d\n", val); +} + +static ssize_t max_comp_streams_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int num; + struct zram *zram = dev_to_zram(dev); + + if (kstrtoint(buf, 0, &num)) + return -EINVAL; + if (num < 1) + return -EINVAL; + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't set max_comp_streams for initialized device\n"); + return -EBUSY; + } + zram->max_comp_streams = num; + up_write(&zram->init_lock); + 
return len; +} + /* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) @@ -502,6 +536,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) } zcomp_destroy(zram->comp); + zram->max_comp_streams = 1; + zram_meta_free(zram->meta); zram->meta = NULL; /* Reset stats */ @@ -537,7 +573,7 @@ static ssize_t disksize_store(struct device *dev, goto out_free_meta; } - zram->comp = zcomp_create(default_compressor); + zram->comp = zcomp_create(default_compressor, zram->max_comp_streams); if (!zram->comp) { pr_info("Cannot initialise %s compressing backend\n", default_compressor); @@ -698,6 +734,8 @@ static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); +static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, + max_comp_streams_show, max_comp_streams_store); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); @@ -722,6 +760,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_orig_data_size.attr, &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, + &dev_attr_max_comp_streams.attr, NULL, }; @@ -784,6 +823,7 @@ static int create_device(struct zram *zram, int device_id) } zram->meta = NULL; + zram->max_comp_streams = 1; return 0; out_free_disk: diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 45e04f7b713f..ccf36d11755a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -99,7 +99,7 @@ struct zram { * we can store in a disk. 
*/ u64 disksize; /* bytes */ - + int max_comp_streams; struct zram_stats stats; }; #endif -- cgit v1.2.3 From e46b8a030d76d3c94156c545c3f4c3676d813435 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:17 -0700 Subject: zram: make compression algorithm selection possible Add and document `comp_algorithm' device attribute. This attribute allows to show supported compression and currently selected compression algorithms: cat /sys/block/zram0/comp_algorithm [lzo] lz4 and change selected compression algorithm: echo lzo > /sys/block/zram0/comp_algorithm Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-block-zram | 8 +++++++ Documentation/blockdev/zram.txt | 24 +++++++++++++++---- drivers/block/zram/zcomp.c | 32 +++++++++++++++++++++++--- drivers/block/zram/zcomp.h | 2 ++ drivers/block/zram/zram_drv.c | 37 +++++++++++++++++++++++++++--- drivers/block/zram/zram_drv.h | 1 + 6 files changed, 93 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 0da9ed6b82ea..70ec992514d0 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -65,6 +65,14 @@ Description: number of backend's zcomp_strm compression streams (number of concurrent compress operations). +What: /sys/block/zram/comp_algorithm +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The comp_algorithm file is read-write and lets to show + available and selected compression algorithms, change + compression algorithm selection. 
+ What: /sys/block/zram/notify_free Date: August 2010 Contact: Nitin Gupta diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index aadfe60391b7..2604ffed51db 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -42,7 +42,21 @@ will not take any effect, because single stream compression backend implemented as a special case and does not support dynamic max_comp_streams. Only multi stream backend supports dynamic max_comp_streams adjustment. -3) Set Disksize +3) Select compression algorithm + Using comp_algorithm device attribute one can see available and + currently selected (shown in square brackets) compression algortithms, + change selected compression algorithm (once the device is initialised + there is no way to change compression algorithm). + + Examples: + #show supported compression algorithms + cat /sys/block/zram0/comp_algorithm + lzo [lz4] + + #select lzo compression algorithm + echo lzo > /sys/block/zram0/comp_algorithm + +4) Set Disksize Set disk size by writing the value to sysfs node 'disksize'. The value can be either in bytes or you can use mem suffixes. Examples: @@ -59,14 +73,14 @@ There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the size of the disk when not in use so a huge zram is wasteful. -4) Activate: +5) Activate: mkswap /dev/zram0 swapon /dev/zram0 mkfs.ext4 /dev/zram1 mount /dev/zram1 /tmp -5) Stats: +6) Stats: Per-device statistics are exported as various nodes under /sys/block/zram/ disksize @@ -81,11 +95,11 @@ size of the disk when not in use so a huge zram is wasteful. 
compr_data_size mem_used_total -6) Deactivate: +7) Deactivate: swapoff /dev/zram0 umount /dev/zram1 -7) Reset: +8) Reset: Write any positive value to 'reset' sysfs node echo 1 > /sys/block/zram0/reset echo 1 > /sys/block/zram1/reset diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index ac276f79f21c..aad533a8bc55 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -39,11 +39,20 @@ struct zcomp_strm_multi { wait_queue_head_t strm_wait; }; +static struct zcomp_backend *backends[] = { + &zcomp_lzo, + NULL +}; + static struct zcomp_backend *find_backend(const char *compress) { - if (strncmp(compress, "lzo", 3) == 0) - return &zcomp_lzo; - return NULL; + int i = 0; + while (backends[i]) { + if (sysfs_streq(compress, backends[i]->name)) + break; + i++; + } + return backends[i]; } static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) @@ -251,6 +260,23 @@ static int zcomp_strm_single_create(struct zcomp *comp) return 0; } +/* show available compressors */ +ssize_t zcomp_available_show(const char *comp, char *buf) +{ + ssize_t sz = 0; + int i = 0; + + while (backends[i]) { + if (sysfs_streq(comp, backends[i]->name)) + sz += sprintf(buf + sz, "[%s] ", backends[i]->name); + else + sz += sprintf(buf + sz, "%s ", backends[i]->name); + i++; + } + sz += sprintf(buf + sz, "\n"); + return sz; +} + int zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index bd11d59c5dd1..8b8997f8613b 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -50,6 +50,8 @@ struct zcomp { void (*destroy)(struct zcomp *comp); }; +ssize_t zcomp_available_show(const char *comp, char *buf); + struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3a5f24c341dc..15d46f2e158c 100644 
--- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -141,6 +141,34 @@ static ssize_t max_comp_streams_store(struct device *dev, return len; } +static ssize_t comp_algorithm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + size_t sz; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + sz = zcomp_available_show(zram->compressor, buf); + up_read(&zram->init_lock); + + return sz; +} + +static ssize_t comp_algorithm_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't change algorithm for initialized device\n"); + return -EBUSY; + } + strlcpy(zram->compressor, buf, sizeof(zram->compressor)); + up_write(&zram->init_lock); + return len; +} + /* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) @@ -572,10 +600,10 @@ static ssize_t disksize_store(struct device *dev, goto out_free_meta; } - zram->comp = zcomp_create(default_compressor, zram->max_comp_streams); + zram->comp = zcomp_create(zram->compressor, zram->max_comp_streams); if (!zram->comp) { pr_info("Cannot initialise %s compressing backend\n", - default_compressor); + zram->compressor); err = -EINVAL; goto out_free_meta; } @@ -735,6 +763,8 @@ static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, max_comp_streams_show, max_comp_streams_store); +static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, + comp_algorithm_show, comp_algorithm_store); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); @@ -760,6 +790,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, &dev_attr_max_comp_streams.attr, 
+ &dev_attr_comp_algorithm.attr, NULL, }; @@ -820,7 +851,7 @@ static int create_device(struct zram *zram, int device_id) pr_warn("Error creating sysfs group"); goto out_free_disk; } - + strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; zram->max_comp_streams = 1; return 0; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index ccf36d11755a..7f21c145e317 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -101,5 +101,6 @@ struct zram { u64 disksize; /* bytes */ int max_comp_streams; struct zram_stats stats; + char compressor[10]; }; #endif -- cgit v1.2.3 From 60a726e33375a1096e85399cfa1327081b4c38be Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 7 Apr 2014 15:38:21 -0700 Subject: zram: propagate error to user When zcomp is initialized as a single-stream backend, max_comp_streams cannot be changed without a zram reset, but the current interface doesn't report any error to the user and even changes the value of max_comp_streams without any effect, which is very confusing for the user. This patch prevents changing max_comp_streams when zcomp was initialized as a single-stream zcomp and reports the error to the user (e.g. via echo).
[akpm@linux-foundation.org: don't return with the lock held, per Sergey] [fengguang.wu@intel.com: fix coccinelle warnings] Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Jerome Marchand Acked-by: Sergey Senozhatsky Signed-off-by: Fengguang Wu Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/blockdev/zram.txt | 9 +++++---- drivers/block/zram/zcomp.c | 10 +++++----- drivers/block/zram/zcomp.h | 4 ++-- drivers/block/zram/zram_drv.c | 17 +++++++++++++---- 4 files changed, 25 insertions(+), 15 deletions(-) (limited to 'Documentation') diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 2604ffed51db..0595c3f56ccf 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -37,10 +37,11 @@ Note: In order to enable compression backend's multi stream support max_comp_streams must be initially set to desired concurrency level before ZRAM device initialisation. Once the device initialised as a single stream compression -backend (max_comp_streams equals to 0) changing the value of max_comp_streams -will not take any effect, because single stream compression backend implemented -as a special case and does not support dynamic max_comp_streams. Only multi -stream backend supports dynamic max_comp_streams adjustment. +backend (max_comp_streams equals to 1), you will see error if you try to change +the value of max_comp_streams because single stream compression backend +implemented as a special case by lock overhead issue and does not support +dynamic max_comp_streams. Only multi stream backend supports dynamic +max_comp_streams adjustment. 
3) Select compression algorithm Using comp_algorithm device attribute one can see available and diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 5647d8fe1dc1..b0e7592c44d8 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -153,7 +153,7 @@ static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstr } /* change max_strm limit */ -static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) +static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) { struct zcomp_strm_multi *zs = comp->stream; struct zcomp_strm *zstrm; @@ -172,7 +172,7 @@ static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) zs->avail_strm--; } spin_unlock(&zs->strm_lock); - return 0; + return true; } static void zcomp_strm_multi_destroy(struct zcomp *comp) @@ -232,10 +232,10 @@ static void zcomp_strm_single_release(struct zcomp *comp, mutex_unlock(&zs->strm_lock); } -static int zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) +static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) { /* zcomp_strm_single support only max_comp_streams == 1 */ - return -ENOTSUPP; + return false; } static void zcomp_strm_single_destroy(struct zcomp *comp) @@ -284,7 +284,7 @@ ssize_t zcomp_available_show(const char *comp, char *buf) return sz; } -int zcomp_set_max_streams(struct zcomp *comp, int num_strm) +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 8b8997f8613b..c59d1fca72c0 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -46,7 +46,7 @@ struct zcomp { struct zcomp_strm *(*strm_find)(struct zcomp *comp); void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); - int (*set_max_streams)(struct zcomp *comp, int num_strm); + bool (*set_max_streams)(struct zcomp *comp, int 
num_strm); void (*destroy)(struct zcomp *comp); }; @@ -64,5 +64,5 @@ int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, int zcomp_decompress(struct zcomp *comp, const unsigned char *src, size_t src_len, unsigned char *dst); -int zcomp_set_max_streams(struct zcomp *comp, int num_strm); +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6b462d27e7d7..80a1cfca1bf0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -127,19 +127,28 @@ static ssize_t max_comp_streams_store(struct device *dev, { int num; struct zram *zram = dev_to_zram(dev); + int ret; - if (kstrtoint(buf, 0, &num)) - return -EINVAL; + ret = kstrtoint(buf, 0, &num); + if (ret < 0) + return ret; if (num < 1) return -EINVAL; + down_write(&zram->init_lock); if (init_done(zram)) { - if (zcomp_set_max_streams(zram->comp, num)) + if (!zcomp_set_max_streams(zram->comp, num)) { pr_info("Cannot change max compression streams\n"); + ret = -EINVAL; + goto out; + } } + zram->max_comp_streams = num; + ret = len; +out: up_write(&zram->init_lock); - return len; + return ret; } static ssize_t comp_algorithm_show(struct device *dev, -- cgit v1.2.3 From 49d063cb353265c3af701bab215ac438ca7df36d Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 7 Apr 2014 15:38:34 -0700 Subject: proc: show mnt_id in /proc/pid/fdinfo Currently we don't have a way to determine from which mount point file has been opened. This information is required for proper dumping and restoring file descriptors due to presence of mount namespaces. It's possible, that two file descriptors are opened using the same paths, but one fd references mount point from one namespace while the other fd -- from other namespace.
$ ls -l /proc/1/fd/1 lrwx------ 1 root root 64 Mar 19 23:54 /proc/1/fd/1 -> /dev/null $ cat /proc/1/fdinfo/1 pos: 0 flags: 0100002 mnt_id: 16 $ cat /proc/1/mountinfo | grep ^16 16 32 0:4 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=1013356k,nr_inodes=253339,mode=755 Signed-off-by: Andrey Vagin Acked-by: Pavel Emelyanov Acked-by: Cyrill Gorcunov Cc: Rob Landley Cc: Al Viro Cc: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 17 ++++++++++++----- fs/proc/fd.c | 6 ++++-- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index f00bee144add..8b9cd8eb3f91 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1648,18 +1648,21 @@ pids, so one need to either stop or freeze processes being inspected if precise results are needed. -3.7 /proc//fdinfo/ - Information about opened file +3.8 /proc//fdinfo/ - Information about opened file --------------------------------------------------------------- This file provides information associated with an opened file. The regular -files have at least two fields -- 'pos' and 'flags'. The 'pos' represents -the current offset of the opened file in decimal form [see lseek(2) for -details] and 'flags' denotes the octal O_xxx mask the file has been -created with [see open(2) for details]. +files have at least three fields -- 'pos', 'flags' and mnt_id. The 'pos' +represents the current offset of the opened file in decimal form [see lseek(2) +for details], 'flags' denotes the octal O_xxx mask the file has been +created with [see open(2) for details] and 'mnt_id' represents mount ID of +the file system containing the opened file [see 3.5 /proc//mountinfo +for details]. 
A typical output is pos: 0 flags: 0100002 + mnt_id: 19 The files such as eventfd, fsnotify, signalfd, epoll among the regular pos/flags pair provide additional information particular to the objects they represent. @@ -1668,6 +1671,7 @@ pair provide additional information particular to the objects they represent. ~~~~~~~~~~~~~ pos: 0 flags: 04002 + mnt_id: 9 eventfd-count: 5a where 'eventfd-count' is hex value of a counter. @@ -1676,6 +1680,7 @@ pair provide additional information particular to the objects they represent. ~~~~~~~~~~~~~~ pos: 0 flags: 04002 + mnt_id: 9 sigmask: 0000000000000200 where 'sigmask' is hex value of the signal mask associated @@ -1685,6 +1690,7 @@ pair provide additional information particular to the objects they represent. ~~~~~~~~~~~ pos: 0 flags: 02 + mnt_id: 9 tfd: 5 events: 1d data: ffffffffffffffff where 'tfd' is a target file descriptor number in decimal form, @@ -1718,6 +1724,7 @@ pair provide additional information particular to the objects they represent. 
pos: 0 flags: 02 + mnt_id: 9 fanotify flags:10 event-flags:0 fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003 fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4 diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 985ea881b5bc..0788d093f5d8 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -11,6 +11,7 @@ #include +#include "../mount.h" #include "internal.h" #include "fd.h" @@ -48,8 +49,9 @@ static int seq_show(struct seq_file *m, void *v) } if (!ret) { - seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", - (long long)file->f_pos, f_flags); + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", + (long long)file->f_pos, f_flags, + real_mount(file->f_path.mnt)->mnt_id); if (file->f_op->show_fdinfo) ret = file->f_op->show_fdinfo(m, file); fput(file); -- cgit v1.2.3 From 2aaf308b95b24649a6dcfed89cd956e972089b2a Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Mon, 7 Apr 2014 15:38:56 -0700 Subject: rapidio: rework device hierarchy and introduce mport class of devices This patch removes an artificial RapidIO bus root device and establishes actual device hierarchy by providing reference to real parent devices. It also introduces device class for RapidIO controller devices (on-chip or an external bridge, known as "mport"). Existing implementation was sufficient for SoC-based platforms that have a single RapidIO controller. With introduction of devices using multiple RapidIO controllers and PCIe-to-RapidIO bridges the old scheme is very limiting or does not work at all. The implemented changes allow to properly reference platform's local RapidIO mport devices and provide device details needed for upper layers. This change to RapidIO device hierarchy does not break any known existing kernel or user space interfaces.
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Li Yang Cc: Kumar Gala Cc: Andre van Herk Cc: Stef van Os Cc: Jerry Jacobs Cc: Arno Tiemersma Cc: Rob Landley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rapidio/sysfs.txt | 66 +++++++++++++++++++++++++++++++++++----- arch/powerpc/sysdev/fsl_rio.c | 1 + drivers/net/rionet.c | 1 + drivers/rapidio/devices/tsi721.c | 1 + drivers/rapidio/rio-driver.c | 22 +++++++++----- drivers/rapidio/rio-scan.c | 1 + drivers/rapidio/rio-sysfs.c | 40 ++++++++++++++++++++++++ drivers/rapidio/rio.c | 11 +++++++ drivers/rapidio/rio.h | 1 + include/linux/rio.h | 5 ++- 10 files changed, 133 insertions(+), 16 deletions(-) (limited to 'Documentation') diff --git a/Documentation/rapidio/sysfs.txt b/Documentation/rapidio/sysfs.txt index 271438c0617f..47ce9a5336e1 100644 --- a/Documentation/rapidio/sysfs.txt +++ b/Documentation/rapidio/sysfs.txt @@ -2,8 +2,8 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -1. Device Subdirectories ------------------------- +1. RapidIO Device Subdirectories +-------------------------------- For each RapidIO device, the RapidIO subsystem creates files in an individual subdirectory with the following name, /sys/bus/rapidio/devices/. @@ -25,8 +25,8 @@ seen by the enumerating host (destID = 1): NOTE: An enumerating or discovering endpoint does not create a sysfs entry for itself, this is why an endpoint with destID=1 is not shown in the list. -2. Attributes Common for All Devices ------------------------------------- +2. Attributes Common for All RapidIO Devices +-------------------------------------------- Each device subdirectory contains the following informational read-only files: @@ -52,16 +52,16 @@ This attribute is similar in behavior to the "config" attribute of PCI devices and provides an access to the RapidIO device registers using standard file read and write operations. -3. Endpoint Device Attributes ------------------------------ +3. 
RapidIO Endpoint Device Attributes +------------------------------------- Currently Linux RapidIO subsystem does not create any endpoint specific sysfs attributes. It is possible that RapidIO master port drivers and endpoint device drivers will add their device-specific sysfs attributes but such attributes are outside the scope of this document. -4. Switch Device Attributes ---------------------------- +4. RapidIO Switch Device Attributes +----------------------------------- RapidIO switches have additional attributes in sysfs. RapidIO subsystem supports common and device-specific sysfs attributes for switches. Because switches are @@ -106,3 +106,53 @@ attribute: for that controller always will be 0. To initiate RapidIO enumeration/discovery on all available mports a user must write '-1' (or RIO_MPORT_ANY) into this attribute file. + + +6. RapidIO Bus Controllers/Ports +-------------------------------- + +On-chip RapidIO controllers and PCIe-to-RapidIO bridges (referenced as +"Master Port" or "mport") are presented in sysfs as the special class of +devices: "rapidio_port". + +The /sys/class/rapidio_port subdirectory contains individual subdirectories +named as "rapidioN" where N = mport ID registered with RapidIO subsystem. + +NOTE: An mport ID is not a RapidIO destination ID assigned to a given local +mport device. + +Each mport device subdirectory in addition to standard entries contains the +following device-specific attributes: + + port_destid - reports RapidIO destination ID assigned to the given RapidIO + mport device. If value 0xFFFFFFFF is returned this means that + no valid destination ID have been assigned to the mport (yet). + Normally, before enumeration/discovery have been executed only + fabric enumerating mports have a valid destination ID assigned + to them using "hdid=..." rapidio module parameter. + sys_size - reports RapidIO common transport system size: + 0 = small (8-bit destination ID, max. 
256 devices), + 1 = large (16-bit destination ID, max. 65536 devices). + +After enumeration or discovery was performed for a given mport device, +the corresponding subdirectory will also contain subdirectories for each +child RapidIO device connected to the mport. Naming conventions for RapidIO +devices are described in Section 1 above. + +The example below shows mport device subdirectory with several child RapidIO +devices attached to it. + +[rio@rapidio ~]$ ls /sys/class/rapidio_port/rapidio0/ -l +total 0 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:e:0001 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:e:0004 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:e:0007 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:s:0002 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:s:0003 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:s:0005 +lrwxrwxrwx 1 root root 0 Feb 11 15:11 device -> ../../../0000:01:00.0 +-r--r--r-- 1 root root 4096 Feb 11 15:11 port_destid +drwxr-xr-x 2 root root 0 Feb 11 15:11 power +lrwxrwxrwx 1 root root 0 Feb 11 15:04 subsystem -> ../../../../../../class/rapidio_port +-r--r--r-- 1 root root 4096 Feb 11 15:11 sys_size +-rw-r--r-- 1 root root 4096 Feb 11 15:04 uevent diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 95dd892e9904..cf2b0840a672 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -531,6 +531,7 @@ int fsl_rio_setup(struct platform_device *dev) sprintf(port->name, "RIO mport %d", i); priv->dev = &dev->dev; + port->dev.parent = &dev->dev; port->ops = ops; port->priv = priv; port->phys_efptr = 0x100; diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index 6d1f6ed3113f..a8497183ff8b 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -493,6 +493,7 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev) ndev->netdev_ops = &rionet_netdev_ops; ndev->mtu = RIO_MAX_MSG_SIZE - 14; ndev->features = NETIF_F_LLTX; + SET_NETDEV_DEV(ndev, &mport->dev); SET_ETHTOOL_OPS(ndev, 
&rionet_ethtool_ops); spin_lock_init(&rnet->lock); diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index ff7cbf2d28e3..1753dc693c15 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -2256,6 +2256,7 @@ static int tsi721_setup_mport(struct tsi721_device *priv) mport->phy_type = RIO_PHY_SERIAL; mport->priv = (void *)priv; mport->phys_efptr = 0x100; + mport->dev.parent = &pdev->dev; priv->mport = mport; INIT_LIST_HEAD(&mport->dbells); diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c index c9ae692d3451..f301f059bb85 100644 --- a/drivers/rapidio/rio-driver.c +++ b/drivers/rapidio/rio-driver.c @@ -167,7 +167,6 @@ void rio_unregister_driver(struct rio_driver *rdrv) void rio_attach_device(struct rio_dev *rdev) { rdev->dev.bus = &rio_bus_type; - rdev->dev.parent = &rio_bus; } EXPORT_SYMBOL_GPL(rio_attach_device); @@ -216,9 +215,12 @@ static int rio_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } -struct device rio_bus = { - .init_name = "rapidio", +struct class rio_mport_class = { + .name = "rapidio_port", + .owner = THIS_MODULE, + .dev_groups = rio_mport_groups, }; +EXPORT_SYMBOL_GPL(rio_mport_class); struct bus_type rio_bus_type = { .name = "rapidio", @@ -233,14 +235,20 @@ struct bus_type rio_bus_type = { /** * rio_bus_init - Register the RapidIO bus with the device model * - * Registers the RIO bus device and RIO bus type with the Linux + * Registers the RIO mport device class and RIO bus type with the Linux * device model. 
*/ static int __init rio_bus_init(void) { - if (device_register(&rio_bus) < 0) - printk("RIO: failed to register RIO bus device\n"); - return bus_register(&rio_bus_type); + int ret; + + ret = class_register(&rio_mport_class); + if (!ret) { + ret = bus_register(&rio_bus_type); + if (ret) + class_unregister(&rio_mport_class); + } + return ret; } postcore_initcall(rio_bus_init); diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index d3a6539a77cc..47a1b2ea76c4 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -461,6 +461,7 @@ static struct rio_dev *rio_setup_device(struct rio_net *net, rdev->comp_tag & RIO_CTAG_UDEVID); } + rdev->dev.parent = &port->dev; rio_attach_device(rdev); device_initialize(&rdev->dev); diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c index e0221c6d0cc2..cdb005c0094d 100644 --- a/drivers/rapidio/rio-sysfs.c +++ b/drivers/rapidio/rio-sysfs.c @@ -341,3 +341,43 @@ const struct attribute_group *rio_bus_groups[] = { &rio_bus_group, NULL, }; + +static ssize_t +port_destid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct rio_mport *mport = to_rio_mport(dev); + + if (mport) + return sprintf(buf, "0x%04x\n", mport->host_deviceid); + else + return -ENODEV; +} +static DEVICE_ATTR_RO(port_destid); + +static ssize_t sys_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct rio_mport *mport = to_rio_mport(dev); + + if (mport) + return sprintf(buf, "%u\n", mport->sys_size); + else + return -ENODEV; +} +static DEVICE_ATTR_RO(sys_size); + +static struct attribute *rio_mport_attrs[] = { + &dev_attr_port_destid.attr, + &dev_attr_sys_size.attr, + NULL, +}; + +static const struct attribute_group rio_mport_group = { + .attrs = rio_mport_attrs, +}; + +const struct attribute_group *rio_mport_groups[] = { + &rio_mport_group, + NULL, +}; diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 2e8a20cac588..a54ba0494dd3 100644 --- 
a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -1884,6 +1884,7 @@ static int rio_get_hdid(int index) int rio_register_mport(struct rio_mport *port) { struct rio_scan_node *scan = NULL; + int res = 0; if (next_portid >= RIO_MAX_MPORTS) { pr_err("RIO: reached specified max number of mports\n"); @@ -1894,6 +1895,16 @@ int rio_register_mport(struct rio_mport *port) port->host_deviceid = rio_get_hdid(port->id); port->nscan = NULL; + dev_set_name(&port->dev, "rapidio%d", port->id); + port->dev.class = &rio_mport_class; + + res = device_register(&port->dev); + if (res) + dev_err(&port->dev, "RIO: mport%d registration failed ERR=%d\n", + port->id, res); + else + dev_dbg(&port->dev, "RIO: mport%d registered\n", port->id); + mutex_lock(&rio_mport_list_lock); list_add_tail(&port->node, &rio_mports); diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h index 5f99d22ad0b0..2d0550e08ea2 100644 --- a/drivers/rapidio/rio.h +++ b/drivers/rapidio/rio.h @@ -50,6 +50,7 @@ extern int rio_mport_scan(int mport_id); /* Structures internal to the RIO core code */ extern const struct attribute_group *rio_dev_groups[]; extern const struct attribute_group *rio_bus_groups[]; +extern const struct attribute_group *rio_mport_groups[]; #define RIO_GET_DID(size, x) (size ? (x & 0xffff) : ((x & 0x00ff0000) >> 16)) #define RIO_SET_DID(size, x) (size ? 
(x & 0xffff) : ((x & 0x000000ff) << 16)) diff --git a/include/linux/rio.h b/include/linux/rio.h index b71d5738e683..6bda06f21930 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -83,7 +83,7 @@ #define RIO_CTAG_UDEVID 0x0001ffff /* Unique device identifier */ extern struct bus_type rio_bus_type; -extern struct device rio_bus; +extern struct class rio_mport_class; struct rio_mport; struct rio_dev; @@ -201,6 +201,7 @@ struct rio_dev { #define rio_dev_f(n) list_entry(n, struct rio_dev, net_list) #define to_rio_dev(n) container_of(n, struct rio_dev, dev) #define sw_to_rio_dev(n) container_of(n, struct rio_dev, rswitch[0]) +#define to_rio_mport(n) container_of(n, struct rio_mport, dev) /** * struct rio_msg - RIO message event @@ -248,6 +249,7 @@ enum rio_phy_type { * @phy_type: RapidIO phy type * @phys_efptr: RIO port extended features pointer * @name: Port name string + * @dev: device structure associated with an mport * @priv: Master port private data * @dma: DMA device associated with mport * @nscan: RapidIO network enumeration/discovery operations @@ -272,6 +274,7 @@ struct rio_mport { enum rio_phy_type phy_type; /* RapidIO phy type */ u32 phys_efptr; unsigned char name[RIO_MAX_MPORT_NAME]; + struct device dev; void *priv; /* Master port private data */ #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct dma_device dma; -- cgit v1.2.3 From 80df28476505ed4e6701c3448c63c9229a50c655 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Mon, 7 Apr 2014 15:38:57 -0700 Subject: hung_task: check the value of "sysctl_hung_task_timeout_sec" As sysctl_hung_task_timeout_sec is unsigned long, when this value is larger than LONG_MAX/HZ, the function schedule_timeout_interruptible in watchdog will return immediately without sleeping and print: schedule_timeout: wrong timeout value ffffffffffffff83 and then the function watchdog will call schedule_timeout_interruptible again and again.
The screen will be filled with "schedule_timeout: wrong timeout value ffffffffffffff83" This patch adds a check and correction in sysctl, to let the function schedule_timeout_interruptible always get a valid parameter. Signed-off-by: Liu Hua Tested-by: Satoru Takeuchi Cc: [3.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 1 + kernel/sysctl.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'Documentation') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 271a09db6629..9886c3d57fc2 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -317,6 +317,7 @@ for more than this value report a warning. This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. 0: means infinite timeout - no checking done. +Possible values to set are in range {0..LONG_MAX/HZ}. ============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5c14b547882e..74f5b580fe34 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -141,6 +141,11 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; +/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +#ifdef CONFIG_DETECT_HUNG_TASK +static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); +#endif + #ifdef CONFIG_INOTIFY_USER #include #endif @@ -985,6 +990,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, }, { .procname = "hung_task_warnings", -- cgit v1.2.3 From 8ca577223f75230a746a06f4566c53943f78d5d0 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:39:01 -0700 Subject: affs: add mount option to avoid filename truncates Normal behavior for filenames exceeding specific filesystem limits is to refuse operation.
AFFS standard name length being only 30 characters against 255 for usual Linux filesystems, original implementation does filename truncate by default with a define value AFFS_NO_TRUNCATE which can be enabled but needs module compilation. This patch adds 'nofilenametruncate' mount option so that user can easily activate that feature and avoid a lot of problems (eg overwrite files ...) Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/affs.txt | 9 ++++++--- fs/affs/affs.h | 20 ++++++++------------ fs/affs/amigaffs.c | 23 +++++++++++++++-------- fs/affs/namei.c | 32 +++++++++++++++++++++++--------- fs/affs/super.c | 6 +++++- 5 files changed, 57 insertions(+), 33 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/affs.txt b/Documentation/filesystems/affs.txt index 81ac488e3758..71b63c2b9841 100644 --- a/Documentation/filesystems/affs.txt +++ b/Documentation/filesystems/affs.txt @@ -49,6 +49,10 @@ mode=mode Sets the mode flags to the given (octal) value, regardless This is useful since most of the plain AmigaOS files will map to 600. +nofilenametruncate + The file system will return an error when filename exceeds + standard maximum filename length (30 characters). + reserved=num Sets the number of reserved blocks at the start of the partition to num. You should never need this option. Default is 2. @@ -181,9 +185,8 @@ tested, though several hundred MB have been read and written using this fs. For a most up-to-date list of bugs please consult fs/affs/Changes. -Filenames are truncated to 30 characters without warning (this -can be changed by setting the compile-time option AFFS_NO_TRUNCATE -in include/linux/amigaffs.h). +By default, filenames are truncated to 30 characters without warning. +'nofilenametruncate' mount option can change that behavior. Case is ignored by the affs in filename matching, but Linux shells do care about the case. 
Example (with /wb being an affs mounted fs): diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 3952121f2f28..25b23b1e7f22 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -5,14 +5,6 @@ #include #include -/* AmigaOS allows file names with up to 30 characters length. - * Names longer than that will be silently truncated. If you - * want to disallow this, comment out the following #define. - * Creating filesystem objects with longer names will then - * result in an error (ENAMETOOLONG). - */ -/*#define AFFS_NO_TRUNCATE */ - /* Ugly macros make the code more pretty. */ #define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st)))) @@ -28,7 +20,6 @@ #define AFFS_CACHE_SIZE PAGE_SIZE -#define AFFS_MAX_PREALLOC 32 #define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2) #define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) #define AFFS_AC_MASK (AFFS_AC_SIZE-1) @@ -118,6 +109,7 @@ struct affs_sb_info { #define SF_OFS 0x0200 /* Old filesystem */ #define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ #define SF_VERBOSE 0x0800 /* Talk about fs when mounting */ +#define SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */ /* short cut to get to the affs specific sb data */ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) @@ -137,9 +129,13 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void secs_to_datestamp(time_t secs, struct affs_date *ds); extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); -extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); -extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); -extern int affs_check_name(const unsigned char *name, int len); +extern void affs_error(struct super_block *sb, const char *function, + const char *fmt, ...); +extern void affs_warning(struct super_block *sb, const char *function, + const char *fmt, ...); +extern bool 
affs_nofilenametruncate(const struct dentry *dentry); +extern int affs_check_name(const unsigned char *name, int len, + bool notruncate); extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry); /* bitmap. c */ diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index d9a43674cb94..533a322c41c0 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -471,20 +471,27 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) function,ErrorBuffer); } +bool +affs_nofilenametruncate(const struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE; + +} + /* Check if the name is valid for a affs object. */ int -affs_check_name(const unsigned char *name, int len) +affs_check_name(const unsigned char *name, int len, bool notruncate) { int i; - if (len > 30) -#ifdef AFFS_NO_TRUNCATE - return -ENAMETOOLONG; -#else - len = 30; -#endif - + if (len > 30) { + if (notruncate) + return -ENAMETOOLONG; + else + len = 30; + } for (i = 0; i < len; i++) { if (name[i] < ' ' || name[i] == ':' || (name[i] > 0x7e && name[i] < 0xa0)) diff --git a/fs/affs/namei.c b/fs/affs/namei.c index c36cbb4537a2..6dae1ccd176d 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -60,13 +60,13 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. 
*/ static inline int -__affs_hash_dentry(struct qstr *qstr, toupper_t toupper) +__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate) { const u8 *name = qstr->name; unsigned long hash; int i; - i = affs_check_name(qstr->name, qstr->len); + i = affs_check_name(qstr->name, qstr->len, notruncate); if (i) return i; @@ -82,16 +82,22 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper) static int affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_toupper); + return __affs_hash_dentry(qstr, affs_toupper, + affs_nofilenametruncate(dentry)); + } + static int affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_intl_toupper); + return __affs_hash_dentry(qstr, affs_intl_toupper, + affs_nofilenametruncate(dentry)); + } static inline int __affs_compare_dentry(unsigned int len, - const char *str, const struct qstr *name, toupper_t toupper) + const char *str, const struct qstr *name, toupper_t toupper, + bool notruncate) { const u8 *aname = str; const u8 *bname = name->name; @@ -101,7 +107,7 @@ static inline int __affs_compare_dentry(unsigned int len, * must be valid. 'name' must be validated first. 
*/ - if (affs_check_name(name->name, name->len)) + if (affs_check_name(name->name, name->len, notruncate)) return 1; /* @@ -126,13 +132,18 @@ static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(len, str, name, affs_toupper); + + return __affs_compare_dentry(len, str, name, affs_toupper, + affs_nofilenametruncate(parent)); } + static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(len, str, name, affs_intl_toupper); + return __affs_compare_dentry(len, str, name, affs_intl_toupper, + affs_nofilenametruncate(parent)); + } /* @@ -411,7 +422,10 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); - retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len); + retval = affs_check_name(new_dentry->d_name.name, + new_dentry->d_name.len, + affs_nofilenametruncate(old_dentry)); + if (retval) return retval; diff --git a/fs/affs/super.c b/fs/affs/super.c index 4fad16adbe7b..6d589f28bf9b 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -163,7 +163,7 @@ static const struct super_operations affs_sops = { }; enum { - Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect, + Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect, Opt_reserved, Opt_root, Opt_setgid, Opt_setuid, Opt_verbose, Opt_volume, Opt_ignore, Opt_err, }; @@ -172,6 +172,7 @@ static const match_table_t tokens = { {Opt_bs, "bs=%u"}, {Opt_mode, "mode=%o"}, {Opt_mufs, "mufs"}, + {Opt_notruncate, "nofilenametruncate"}, {Opt_prefix, "prefix=%s"}, {Opt_protect, "protect"}, {Opt_reserved, "reserved=%u"}, @@ -233,6 +234,9 @@ parse_options(char *options, kuid_t *uid, kgid_t 
*gid, int *mode, int *reserved, case Opt_mufs: *mount_opts |= SF_MUFS; break; + case Opt_notruncate: + *mount_opts |= SF_NO_TRUNCATE; + break; case Opt_prefix: *prefix = match_strdup(&args[0]); if (!*prefix) -- cgit v1.2.3 From 5d2acfc7b974bbd3858b4dd3f2cdc6362dd8843a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:09 -0700 Subject: kconfig: make allnoconfig disable options behind EMBEDDED and EXPERT "make allnoconfig" exists to ease testing of minimal configurations. Documentation/SubmitChecklist includes a note to test with allnoconfig. This helps catch missing dependencies on common-but-not-required functionality, which might otherwise go unnoticed. However, allnoconfig still leaves many symbols enabled, because they're hidden behind CONFIG_EMBEDDED or CONFIG_EXPERT. For instance, allnoconfig still has CONFIG_PRINTK and CONFIG_BLOCK enabled, so drivers don't typically get build-tested with those disabled. To address this, introduce a new Kconfig option "allnoconfig_y", used on symbols which only exist to hide other symbols. Set it on CONFIG_EMBEDDED (which then selects CONFIG_EXPERT). allnoconfig will then disable all the symbols hidden behind those. Signed-off-by: Josh Triplett Tested-by: Paul E. McKenney Cc: Michal Marek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kbuild/kconfig-language.txt | 4 ++++ init/Kconfig | 1 + scripts/kconfig/confdata.c | 5 ++++- scripts/kconfig/expr.h | 3 +++ scripts/kconfig/lkc.h | 1 + scripts/kconfig/menu.c | 3 +++ scripts/kconfig/zconf.gperf | 1 + scripts/kconfig/zconf.hash.c_shipped | 13 ++++++++----- 8 files changed, 25 insertions(+), 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index c420676c6fe3..350f733bf2c7 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt @@ -157,6 +157,10 @@ applicable everywhere (see syntax). 
to the build environment (if this is desired, it can be done via another symbol). + - "allnoconfig_y" + This declares the symbol as one that should have the value y when + using "allnoconfig". Used for symbols that hide other symbols. + Menu dependencies ----------------- diff --git a/init/Kconfig b/init/Kconfig index 8851c6417880..427ba60d638f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1483,6 +1483,7 @@ config PCI_QUIRKS config EMBEDDED bool "Embedded system" + option allnoconfig_y select EXPERT help This option should be enabled if compiling the kernel for diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 87f723804079..f88d90f20228 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -1178,7 +1178,10 @@ bool conf_set_all_new_symbols(enum conf_def_mode mode) sym->def[S_DEF_USER].tri = mod; break; case def_no: - sym->def[S_DEF_USER].tri = no; + if (sym->flags & SYMBOL_ALLNOCONFIG_Y) + sym->def[S_DEF_USER].tri = yes; + else + sym->def[S_DEF_USER].tri = no; break; case def_random: sym->def[S_DEF_USER].tri = no; diff --git a/scripts/kconfig/expr.h b/scripts/kconfig/expr.h index ba663e1dc7e3..412ea8a2abb8 100644 --- a/scripts/kconfig/expr.h +++ b/scripts/kconfig/expr.h @@ -109,6 +109,9 @@ struct symbol { /* choice values need to be set before calculating this symbol value */ #define SYMBOL_NEED_SET_CHOICE_VALUES 0x100000 +/* Set symbol to y if allnoconfig; used for symbols that hide others */ +#define SYMBOL_ALLNOCONFIG_Y 0x200000 + #define SYMBOL_MAXLENGTH 256 #define SYMBOL_HASHSIZE 9973 diff --git a/scripts/kconfig/lkc.h b/scripts/kconfig/lkc.h index 09f4edfdc911..d5daa7af8b49 100644 --- a/scripts/kconfig/lkc.h +++ b/scripts/kconfig/lkc.h @@ -61,6 +61,7 @@ enum conf_def_mode { #define T_OPT_MODULES 1 #define T_OPT_DEFCONFIG_LIST 2 #define T_OPT_ENV 3 +#define T_OPT_ALLNOCONFIG_Y 4 struct kconf_id { int name; diff --git a/scripts/kconfig/menu.c b/scripts/kconfig/menu.c index db1512ae30cc..3ac2c9c6e280 100644 --- 
a/scripts/kconfig/menu.c +++ b/scripts/kconfig/menu.c @@ -217,6 +217,9 @@ void menu_add_option(int token, char *arg) case T_OPT_ENV: prop_add_env(arg); break; + case T_OPT_ALLNOCONFIG_Y: + current_entry->sym->flags |= SYMBOL_ALLNOCONFIG_Y; + break; } } diff --git a/scripts/kconfig/zconf.gperf b/scripts/kconfig/zconf.gperf index f14ab41154b6..b6ac02d604f1 100644 --- a/scripts/kconfig/zconf.gperf +++ b/scripts/kconfig/zconf.gperf @@ -44,4 +44,5 @@ on, T_ON, TF_PARAM modules, T_OPT_MODULES, TF_OPTION defconfig_list, T_OPT_DEFCONFIG_LIST,TF_OPTION env, T_OPT_ENV, TF_OPTION +allnoconfig_y, T_OPT_ALLNOCONFIG_Y,TF_OPTION %% diff --git a/scripts/kconfig/zconf.hash.c_shipped b/scripts/kconfig/zconf.hash.c_shipped index 40df0005daa9..c77a8eff1ef2 100644 --- a/scripts/kconfig/zconf.hash.c_shipped +++ b/scripts/kconfig/zconf.hash.c_shipped @@ -55,10 +55,10 @@ kconf_id_hash (register const char *str, register unsigned int len) 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, - 73, 73, 73, 73, 73, 73, 73, 73, 25, 25, + 73, 73, 73, 73, 73, 73, 73, 5, 25, 25, 0, 0, 0, 5, 0, 0, 73, 73, 5, 0, 10, 5, 45, 73, 20, 20, 0, 15, 15, 73, - 20, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 20, 5, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, @@ -106,6 +106,7 @@ struct kconf_id_strings_t char kconf_id_strings_str23[sizeof("mainmenu")]; char kconf_id_strings_str25[sizeof("menuconfig")]; char kconf_id_strings_str27[sizeof("modules")]; + char kconf_id_strings_str28[sizeof("allnoconfig_y")]; char kconf_id_strings_str29[sizeof("menu")]; char kconf_id_strings_str31[sizeof("select")]; char kconf_id_strings_str32[sizeof("comment")]; @@ -141,6 +142,7 @@ static const struct kconf_id_strings_t kconf_id_strings_contents = "mainmenu", "menuconfig", "modules", + "allnoconfig_y", "menu", "select", "comment", @@ -170,7 +172,7 @@ 
kconf_id_lookup (register const char *str, register unsigned int len) { enum { - TOTAL_KEYWORDS = 32, + TOTAL_KEYWORDS = 33, MIN_WORD_LENGTH = 2, MAX_WORD_LENGTH = 14, MIN_HASH_VALUE = 2, @@ -219,7 +221,8 @@ kconf_id_lookup (register const char *str, register unsigned int len) {-1}, #line 44 "scripts/kconfig/zconf.gperf" {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str27, T_OPT_MODULES, TF_OPTION}, - {-1}, +#line 47 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str28, T_OPT_ALLNOCONFIG_Y,TF_OPTION}, #line 16 "scripts/kconfig/zconf.gperf" {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str29, T_MENU, TF_COMMAND}, {-1}, @@ -282,5 +285,5 @@ kconf_id_lookup (register const char *str, register unsigned int len) } return 0; } -#line 47 "scripts/kconfig/zconf.gperf" +#line 48 "scripts/kconfig/zconf.gperf" -- cgit v1.2.3 From 956632857819747466e27037aa8a57e8165213c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Apr 2014 15:39:21 -0700 Subject: asm/system.h: clean asm/system.h from docs Clean asm/system.h from docs as nothing should refer to that header anymore. 
Signed-off-by: David Howells Cc: Ingo Molnar Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/DocBook/kernel-hacking.tmpl | 2 +- Documentation/irqflags-tracing.txt | 7 ------- Documentation/scheduler/sched-arch.txt | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl index d0758b241b23..b90959ba37e4 100644 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -671,7 +671,7 @@ printk(KERN_INFO "my ip: %pI4\n", &ipaddress); <function>local_irq_save()</function>/<function>local_irq_restore()</function> - <filename class="headerfile">include/asm/system.h</filename> + <filename class="headerfile">include/linux/irqflags.h</filename> diff --git a/Documentation/irqflags-tracing.txt b/Documentation/irqflags-tracing.txt index 67aa71e73035..f6da05670e16 100644 --- a/Documentation/irqflags-tracing.txt +++ b/Documentation/irqflags-tracing.txt @@ -22,13 +22,6 @@ rather straightforward and risk-free manner. Architectures that want to support this need to do a couple of code-organizational changes first: -- move their irq-flags manipulation code from their asm/system.h header - to asm/irqflags.h - -- rename local_irq_disable()/etc to raw_local_irq_disable()/etc. so that - the linux/irqflags.h code can inject callbacks and can construct the - real local_irq_disable()/etc APIs. - - add and enable TRACE_IRQFLAGS_SUPPORT in their arch level Kconfig file and then a couple of functional changes are needed as well to implement diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt index 9290de703450..a2f27bbf2cba 100644 --- a/Documentation/scheduler/sched-arch.txt +++ b/Documentation/scheduler/sched-arch.txt @@ -8,7 +8,7 @@ Context switch By default, the switch_to arch function is called with the runqueue locked. 
This is usually not a problem unless switch_to may need to take the runqueue lock. This is usually due to a wake up operation in -the context switch. See arch/ia64/include/asm/system.h for an example. +the context switch. See arch/ia64/include/asm/switch_to.h for an example. To request the scheduler call switch_to with the runqueue unlocked, you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file -- cgit v1.2.3 From bf4b558eba920a38f91beb5ee62a8ce2628c92f7 Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Mon, 7 Apr 2014 15:39:52 -0700 Subject: arm64: add early_ioremap support Add support for early IO or memory mappings which are needed before the normal ioremap() is usable. This also adds fixmap support for permanent fixed mappings such as that used by the earlyprintk device register region. Signed-off-by: Mark Salter Acked-by: Catalin Marinas Cc: Borislav Petkov Cc: Dave Young Cc: H. Peter Anvin Cc: Will Deacon Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/arm64/memory.txt | 4 +- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/Kbuild | 1 + arch/arm64/include/asm/fixmap.h | 67 +++++++++++++++++++++++++++++++ arch/arm64/include/asm/io.h | 1 + arch/arm64/include/asm/memory.h | 2 +- arch/arm64/kernel/early_printk.c | 8 +++- arch/arm64/kernel/head.S | 9 ++--- arch/arm64/kernel/setup.c | 2 + arch/arm64/mm/ioremap.c | 85 ++++++++++++++++++++++++++++++++++++++++ arch/arm64/mm/mmu.c | 41 ------------------- 11 files changed, 169 insertions(+), 52 deletions(-) create mode 100644 arch/arm64/include/asm/fixmap.h (limited to 'Documentation') diff --git a/Documentation/arm64/memory.txt b/Documentation/arm64/memory.txt index 85e24c4f215c..d50fa618371b 100644 --- a/Documentation/arm64/memory.txt +++ b/Documentation/arm64/memory.txt @@ -39,7 +39,7 @@ ffffffbffa000000 ffffffbffaffffff 16MB PCI I/O space ffffffbffb000000 ffffffbffbbfffff 12MB [guard] -ffffffbffbc00000 ffffffbffbdfffff 2MB earlyprintk device 
+ffffffbffbc00000 ffffffbffbdfffff 2MB fixed mappings ffffffbffbe00000 ffffffbffbffffff 2MB [guard] @@ -66,7 +66,7 @@ fffffdfffa000000 fffffdfffaffffff 16MB PCI I/O space fffffdfffb000000 fffffdfffbbfffff 12MB [guard] -fffffdfffbc00000 fffffdfffbdfffff 2MB earlyprintk device +fffffdfffbc00000 fffffdfffbdfffff 2MB fixed mappings fffffdfffbe00000 fffffdfffbffffff 2MB [guard] diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8079a23e2701..e6e4d3749a6e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -17,6 +17,7 @@ config ARM64 select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_CPU_AUTOPROBE + select GENERIC_EARLY_IOREMAP select GENERIC_IOMAP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 4bca4923fc0b..83f71b3004a8 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -10,6 +10,7 @@ generic-y += delay.h generic-y += div64.h generic-y += dma.h generic-y += emergency-restart.h +generic-y += early_ioremap.h generic-y += errno.h generic-y += ftrace.h generic-y += hash.h diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h new file mode 100644 index 000000000000..5f7bfe6df723 --- /dev/null +++ b/arch/arm64/include/asm/fixmap.h @@ -0,0 +1,67 @@ +/* + * fixmap.h: compile-time virtual memory allocation + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998 Ingo Molnar + * Copyright (C) 2013 Mark Salter + * + * Adapted from arch/x86_64 version. + * + */ + +#ifndef _ASM_ARM64_FIXMAP_H +#define _ASM_ARM64_FIXMAP_H + +#ifndef __ASSEMBLY__ +#include +#include + +/* + * Here we define all the compile-time 'special' virtual + * addresses. 
The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. + * + * These 'compile-time allocated' memory buffers are + * page-sized. Use set_fixmap(idx,phys) to associate + * physical memory with fixmap indices. + * + */ +enum fixed_addresses { + FIX_EARLYCON_MEM_BASE, + __end_of_permanent_fixed_addresses, + + /* + * Temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. + */ +#ifdef CONFIG_ARM64_64K_PAGES +#define NR_FIX_BTMAPS 4 +#else +#define NR_FIX_BTMAPS 64 +#endif +#define FIX_BTMAPS_SLOTS 7 +#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS) + + FIX_BTMAP_END = __end_of_permanent_fixed_addresses, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, + __end_of_fixed_addresses +}; + +#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) + +#define FIXMAP_PAGE_IO __pgprot(PROT_DEVICE_nGnRE) + +extern void __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + +#define __set_fixmap __early_set_fixmap + +#include + +#endif /* !__ASSEMBLY__ */ +#endif /* _ASM_ARM64_FIXMAP_H */ diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 7846a6bb0833..a1bef78f0303 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -27,6 +27,7 @@ #include #include #include +#include #include diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 9dc5dc39fded..e94f9458aa6f 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -49,7 +49,7 @@ #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) #define MODULES_END (PAGE_OFFSET) #define MODULES_VADDR (MODULES_END - SZ_64M) -#define EARLYCON_IOBASE (MODULES_VADDR - SZ_4M) +#define FIXADDR_TOP (MODULES_VADDR - SZ_2M - PAGE_SIZE) #define TASK_SIZE_64 (UL(1) << VA_BITS) #ifdef CONFIG_COMPAT diff --git 
a/arch/arm64/kernel/early_printk.c b/arch/arm64/kernel/early_printk.c index fbb6e1843659..ffbbdde7aba1 100644 --- a/arch/arm64/kernel/early_printk.c +++ b/arch/arm64/kernel/early_printk.c @@ -26,6 +26,8 @@ #include #include +#include + static void __iomem *early_base; static void (*printch)(char ch); @@ -141,8 +143,10 @@ static int __init setup_early_printk(char *buf) } /* no options parsing yet */ - if (paddr) - early_base = early_io_map(paddr, EARLYCON_IOBASE); + if (paddr) { + set_fixmap_io(FIX_EARLYCON_MEM_BASE, paddr); + early_base = (void __iomem *)fix_to_virt(FIX_EARLYCON_MEM_BASE); + } printch = match->printch; early_console = &early_console_dev; diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 61035d6814cb..1fe5d8d2bdfd 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -404,7 +404,7 @@ ENDPROC(__calc_phys_offset) * - identity mapping to enable the MMU (low address, TTBR0) * - first few MB of the kernel linear mapping to jump to once the MMU has * been enabled, including the FDT blob (TTBR1) - * - UART mapping if CONFIG_EARLY_PRINTK is enabled (TTBR1) + * - pgd entry for fixed mappings (TTBR1) */ __create_page_tables: pgtbl x25, x26, x24 // idmap_pg_dir and swapper_pg_dir addresses @@ -461,15 +461,12 @@ __create_page_tables: sub x6, x6, #1 // inclusive range create_block_map x0, x7, x3, x5, x6 1: -#ifdef CONFIG_EARLY_PRINTK /* - * Create the pgd entry for the UART mapping. The full mapping is done - * later based earlyprintk kernel parameter. + * Create the pgd entry for the fixed mappings. 
*/ - ldr x5, =EARLYCON_IOBASE // UART virtual address + ldr x5, =FIXADDR_TOP // Fixed mapping virtual address add x0, x26, #2 * PAGE_SIZE // section table address create_pgd_entry x26, x0, x5, x6, x7 -#endif ret ENDPROC(__create_page_tables) .ltorg diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 20830d1afbb6..720853f70b6b 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -42,6 +42,7 @@ #include #include +#include #include #include #include @@ -361,6 +362,7 @@ void __init setup_arch(char **cmdline_p) *cmdline_p = boot_command_line; init_mem_pgprot(); + early_ioremap_init(); parse_early_param(); diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index 2bb1d586664c..7ec328392ae0 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -25,6 +25,10 @@ #include #include +#include +#include +#include + static void __iomem *__ioremap_caller(phys_addr_t phys_addr, size_t size, pgprot_t prot, void *caller) { @@ -98,3 +102,84 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size) __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_cache); + +#ifndef CONFIG_ARM64_64K_PAGES +static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; +#endif + +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(addr); + BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd)); + + pud = pud_offset(pgd, addr); + BUG_ON(pud_none(*pud) || pud_bad(*pud)); + + return pmd_offset(pud, addr); +} + +static inline pte_t * __init early_ioremap_pte(unsigned long addr) +{ + pmd_t *pmd = early_ioremap_pmd(addr); + + BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd)); + + return pte_offset_kernel(pmd, addr); +} + +void __init early_ioremap_init(void) +{ + pmd_t *pmd; + + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); +#ifndef CONFIG_ARM64_64K_PAGES + /* need to populate pmd for 4k pagesize only */ + pmd_populate_kernel(&init_mm, pmd, bm_pte); +#endif + /* + * The boot-ioremap 
range spans multiple pmds, for which + * we are not prepared: + */ + BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); + + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { + WARN_ON(1); + pr_warn("pmd %p != %p\n", + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))); + pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + pr_warn("fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + pr_warn("FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + pr_warn("FIX_BTMAP_BEGIN: %d\n", + FIX_BTMAP_BEGIN); + } + + early_ioremap_setup(); +} + +void __init __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *pte; + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + + pte = early_ioremap_pte(addr); + + if (pgprot_val(flags)) + set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); + else { + pte_clear(&init_mm, addr, pte); + flush_tlb_kernel_range(addr, addr+PAGE_SIZE); + } +} diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ba259a0e385e..6b7e89569a3a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -260,47 +260,6 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, } while (pgd++, addr = next, addr != end); } -#ifdef CONFIG_EARLY_PRINTK -/* - * Create an early I/O mapping using the pgd/pmd entries already populated - * in head.S as this function is called too early to allocated any memory. The - * mapping size is 2MB with 4KB pages or 64KB or 64KB pages. - */ -void __iomem * __init early_io_map(phys_addr_t phys, unsigned long virt) -{ - unsigned long size, mask; - bool page64k = IS_ENABLED(CONFIG_ARM64_64K_PAGES); - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* - * No early pte entries with !ARM64_64K_PAGES configuration, so using - * sections (pmd). - */ - size = page64k ? 
PAGE_SIZE : SECTION_SIZE; - mask = ~(size - 1); - - pgd = pgd_offset_k(virt); - pud = pud_offset(pgd, virt); - if (pud_none(*pud)) - return NULL; - pmd = pmd_offset(pud, virt); - - if (page64k) { - if (pmd_none(*pmd)) - return NULL; - pte = pte_offset_kernel(pmd, virt); - set_pte(pte, __pte((phys & mask) | PROT_DEVICE_nGnRE)); - } else { - set_pmd(pmd, __pmd((phys & mask) | PROT_SECT_DEVICE_nGnRE)); - } - - return (void __iomem *)((virt & mask) + (phys & ~mask)); -} -#endif - static void __init map_mem(void) { struct memblock_region *reg; -- cgit v1.2.3 From 56aeeba8c1a21211aebe724a623acafe760912a2 Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Mon, 7 Apr 2014 15:39:53 -0700 Subject: doc/kernel-parameters.txt: add early_ioremap_debug Add description of early_ioremap_debug kernel parameter. Signed-off-by: Mark Salter Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Young Cc: H. Peter Anvin Cc: Will Deacon Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index bc3478581f67..b6c67d592be5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -884,6 +884,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Enable debug messages at boot time. See Documentation/dynamic-debug-howto.txt for details. + early_ioremap_debug [KNL] + Enable debug messages in early_ioremap support. This + is useful for tracking down temporary early mappings + which are not unmapped. + earlycon= [KNL] Output early console device and options. uart[8250],io,[,options] uart[8250],mmio,[,options] -- cgit v1.2.3