diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-14 20:10:55 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-14 20:10:55 +0300 |
commit | 318222a35bfb0ae9b5ff3e359a583463e6cfcd94 (patch) | |
tree | 6a8d921f7ac9915f3f12dd3fcd7efaaf6f16bb09 /Documentation | |
parent | 7e9890a3500d95c01511a4c45b7e7192dfa47ae2 (diff) | |
parent | 640be2d1ffbc1946f1547eb89b5005ed7542de99 (diff) | |
download | linux-318222a35bfb0ae9b5ff3e359a583463e6cfcd94.tar.xz |
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
- a few misc things and hotfixes
- ocfs2
- almost all of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (139 commits)
kernel/memremap.c: remove the unused device_private_entry_fault() export
mm: delete find_get_entries_tag
mm/huge_memory.c: make __thp_get_unmapped_area static
mm/mprotect.c: fix compilation warning because of unused 'mm' variable
mm/page-writeback: introduce tracepoint for wait_on_page_writeback()
mm/vmscan: simplify trace_reclaim_flags and trace_shrink_flags
mm/Kconfig: update "Memory Model" help text
mm/vmscan.c: don't disable irq again when count pgrefill for memcg
mm: memblock: make keeping memblock memory opt-in rather than opt-out
hugetlbfs: always use address space in inode for resv_map pointer
mm/z3fold.c: support page migration
mm/z3fold.c: add structure for buddy handles
mm/z3fold.c: improve compression by extending search
mm/z3fold.c: introduce helper functions
mm/page_alloc.c: remove unnecessary parameter in rmqueue_pcplist
mm/hmm: add ARCH_HAS_HMM_MIRROR ARCH_HAS_HMM_DEVICE Kconfig
mm/vmscan.c: simplify shrink_inactive_list()
fs/sync.c: sync_file_range(2) may use WB_SYNC_ALL writeback
xen/privcmd-buf.c: convert to use vm_map_pages_zero()
xen/gntdev.c: convert to use vm_map_pages()
...
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/sysctl/vm.txt | 12 | ||||
-rw-r--r-- | Documentation/trace/postprocess/trace-vmscan-postprocess.pl | 7 | ||||
-rw-r--r-- | Documentation/vm/hmm.rst | 94 |
3 files changed, 91 insertions, 22 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 3f13d8599337..749322060f10 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm: - stat_refresh - numa_stat - swappiness +- unprivileged_userfaultfd - user_reserve_kbytes - vfs_cache_pressure - watermark_boost_factor @@ -818,6 +819,17 @@ The default value is 60. ============================================================== +unprivileged_userfaultfd + +This flag controls whether unprivileged users can use the userfaultfd +system calls. Set this to 1 to allow unprivileged users to use the +userfaultfd system calls, or set this to 0 to restrict userfaultfd to only +privileged users (with SYS_CAP_PTRACE capability). + +The default value is 1. + +============================================================== + - user_reserve_kbytes When overcommit_memory is set to 2, "never overcommit" mode, reserve diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 66bfd8396877..995da15b16ca 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -113,7 +113,7 @@ my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)'; my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; -my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; +my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -212,7 +212,8 @@ $regex_lru_shrink_inactive = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_inactive", $regex_lru_shrink_inactive_default, "nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback", - "nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep", + "nr_congested", "nr_immediate", "nr_activate_anon", + "nr_activate_file", "nr_ref_keep", "nr_unmap_fail", "priority", "flags"); $regex_lru_shrink_active = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_active", @@ -407,7 +408,7 @@ EVENT_PROCESS: } my $nr_reclaimed = $3; - my $flags = $12; + my $flags = $13; my $file = 0; if ($flags =~ /RECLAIM_WB_FILE/) { $file = 1; diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 44205f0b671f..ec1efa32af3c 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -189,20 +189,10 @@ the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can use either:: - int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns); - int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block); - -The first one (hmm_vma_get_pfns()) will only fetch present CPU page table + long hmm_range_snapshot(struct hmm_range *range); + long hmm_range_fault(struct hmm_range *range, bool block); + +The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. The second one does trigger a page fault on missing or read-only entry if the write parameter is true. Page faults use the generic mm page fault code path @@ -220,25 +210,56 @@ respect in order to keep things properly synchronized. The usage pattern is:: { struct hmm_range range; ... + + range.start = ...; + range.end = ...; + range.pfns = ...; + range.flags = ...; + range.values = ...; + range.pfn_shift = ...; + hmm_range_register(&range); + + /* + * Just wait for range to be valid, safe to ignore return value as we + * will use the return value of hmm_range_snapshot() below under the + * mmap_sem to ascertain the validity of the range. + */ + hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); + again: - ret = hmm_vma_get_pfns(vma, &range, start, end, pfns); - if (ret) + down_read(&mm->mmap_sem); + ret = hmm_range_snapshot(&range); + if (ret) { + up_read(&mm->mmap_sem); + if (ret == -EAGAIN) { + /* + * No need to check hmm_range_wait_until_valid() return value + * on retry we will get proper error with hmm_range_snapshot() + */ + hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); + goto again; + } + hmm_mirror_unregister(&range); return ret; + } take_lock(driver->update); - if (!hmm_vma_range_done(vma, &range)) { + if (!range.valid) { release_lock(driver->update); + up_read(&mm->mmap_sem); goto again; } // Use pfns array content to update device page table + hmm_mirror_unregister(&range); release_lock(driver->update); + up_read(&mm->mmap_sem); return 0; } The driver->update lock is the same lock that the driver takes inside its -update() callback. That lock must be held before hmm_vma_range_done() to avoid -any race with a concurrent CPU page table update. +update() callback. That lock must be held before checking the range.valid +field to avoid any race with a concurrent CPU page table update. HMM implements all this on top of the mmu_notifier API because we wanted a simpler API and also to be able to perform optimizations latter on like doing @@ -255,6 +276,41 @@ report commands as executed is serialized (there is no point in doing this concurrently). +Leverage default_flags and pfn_flags_mask +========================================= + +The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows +to set fault or snapshot policy for a whole range instead of having to set them +for each entries in the range. + +For instance if the device flags for device entries are: + VALID (1 << 63) + WRITE (1 << 62) + +Now let say that device driver wants to fault with at least read a range then +it does set: + range->default_flags = (1 << 63) + range->pfn_flags_mask = 0; + +and calls hmm_range_fault() as described above. This will fill fault all page +in the range with at least read permission. + +Now let say driver wants to do the same except for one page in the range for +which its want to have write. Now driver set: + range->default_flags = (1 << 63); + range->pfn_flags_mask = (1 << 62); + range->pfns[index_of_write] = (1 << 62); + +With this HMM will fault in all page with at least read (ie valid) and for the +address == range->start + (index_of_write << PAGE_SHIFT) it will fault with +write permission ie if the CPU pte does not have write permission set then HMM +will call handle_mm_fault(). + +Note that HMM will populate the pfns array with write permission for any entry +that have write permission within the CPU pte no matter what are the values set +in default_flags or pfn_flags_mask. + + Represent and manage device memory from core kernel point of view ================================================================= |