From d2406291483775ecddaee929231a39c70c08fda2 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:45 +0800 Subject: fork: use __mt_dup() to duplicate maple tree in dup_mmap() In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then directly replacing the entries of VMAs in the new maple tree can result in better performance. __mt_dup() uses DFS pre-order to duplicate the maple tree, so it is efficient. The average time complexity of __mt_dup() is O(n), where n is the number of VMAs. The proof of the time complexity is provided in the commit log that introduces __mt_dup(). After duplicating the maple tree, each element is traversed and replaced (ignoring the cases of deletion, which are rare). Since it is only a replacement operation for each element, this process is also O(n). Analyzing the exact time complexity of the previous algorithm is challenging because each insertion can involve appending to a node, pushing data to adjacent nodes, or even splitting nodes. The frequency of each action is difficult to calculate. The worst-case scenario for a single insertion is when the tree undergoes splitting at every level. If we consider each insertion as the worst-case scenario, we can determine that the upper bound of the time complexity is O(n*log(n)), although this is a loose upper bound. However, based on the test data, it appears that the actual time complexity is likely to be O(n). As the entire maple tree is duplicated using __mt_dup(), if dup_mmap() fails, there will be a portion of VMAs that have not been duplicated in the maple tree. To handle this, we mark the failure point with XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, stop releasing VMAs that have not been duplicated after this point. There is a "spawn" in byte-unixbench[1], which can be used to test the performance of fork(). I modified it slightly to make it work with different number of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and the this patchset, respectively. The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance. 21 121 221 421 821 1621 3221 6421 12821 25621 51221 112100 76261 54227 34035 20195 11112 6017 3161 1606 802 393 114558 83067 65008 45824 28751 16072 8922 4747 2436 1233 599 2.19% 8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42% [1] https://github.com/kdlucas/byte-unixbench/tree/master Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Suggested-by: Liam R. Howlett Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 5c757fba8858..a7025ed5c65b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -374,6 +374,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, * be 0. This will underflow and is okay. 
*/ next = mas_find(mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; /* * Hide vma from rmap and truncate_pagecache before freeing @@ -395,6 +397,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, && !is_vm_hugetlb_page(next)) { vma = next; next = mas_find(mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); @@ -1744,7 +1748,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked); hugetlb_zap_end(vma, &details); - } while ((vma = mas_find(mas, tree_end - 1)) != NULL); + vma = mas_find(mas, tree_end - 1); + } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); } -- cgit v1.2.3 From 01d1e0e6b7d99ebaf2e42d2205595080b7d0c271 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 18:28:05 +0000 Subject: mm: convert __do_fault() to use a folio Convert vmf->page to a folio as soon as we're going to use it. This fixes a bug if the fault handler returns a tail page with hardware poison; tail pages have an invalid page->index, so we would fail to unmap the page from the page tables. We actually have to unmap the entire folio (or mapping_evict_folio() will fail), so use unmap_mapping_folio() instead. This also saves various calls to compound_head() hidden in lock_page(), put_page(), etc. Link: https://lkml.kernel.org/r/20231108182809.602073-3-willy@infradead.org Fixes: 793917d997df ("mm/readahead: Add large folio readahead") Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index a7025ed5c65b..e27e2e5beb3f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4245,6 +4245,7 @@ oom: static vm_fault_t __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct folio *folio; vm_fault_t ret; /* @@ -4273,27 +4274,26 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) VM_FAULT_DONE_COW))) return ret; + folio = page_folio(vmf->page); if (unlikely(PageHWPoison(vmf->page))) { - struct page *page = vmf->page; vm_fault_t poisonret = VM_FAULT_HWPOISON; if (ret & VM_FAULT_LOCKED) { - if (page_mapped(page)) - unmap_mapping_pages(page_mapping(page), - page->index, 1, false); - /* Retry if a clean page was removed from the cache. */ - if (invalidate_inode_page(page)) + if (page_mapped(vmf->page)) + unmap_mapping_folio(folio); + /* Retry if a clean folio was removed from the cache. */ + if (mapping_evict_folio(folio->mapping, folio)) poisonret = VM_FAULT_NOPAGE; - unlock_page(page); + folio_unlock(folio); } - put_page(page); + folio_put(folio); vmf->page = NULL; return poisonret; } if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf->page); + folio_lock(folio); else - VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); + VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page); return ret; } -- cgit v1.2.3 From 24d2613a6356f9c4a0b1b8e17f125562f6c8e11b Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 20 Nov 2023 15:24:05 +0100 Subject: mm/memory: use kmap_local_page() in __wp_page_copy_user() kmap_atomic() has been deprecated in favor of kmap_local_{folio,page}. Therefore, replace kmap_atomic() with kmap_local_page in __wp_page_copy_user(). kmap_atomic() disables preemption in !PREEMPT_RT kernels and unconditionally disables also page-faults. 
My limited knowledge of the implementation of __wp_page_copy_user() makes me think that the latter side effect is still needed here, but kmap_local_page() is implemented not to disable page-faults. So, in addition to the conversion to local mapping, add explicit pagefault_disable() / pagefault_enable() between mapping and un-mapping. Link: https://lkml.kernel.org/r/20231120142418.6977-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Cc: Ira Weiny Signed-off-by: Andrew Morton --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index e27e2e5beb3f..a8ff3489211b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2841,7 +2841,8 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, * just copying from the original user address. If that * fails, we just zero-fill it. Live with it. */ - kaddr = kmap_atomic(dst); + kaddr = kmap_local_page(dst); + pagefault_disable(); uaddr = (void __user *)(addr & PAGE_MASK); /* @@ -2909,7 +2910,8 @@ warn: pte_unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); - kunmap_atomic(kaddr); + pagefault_enable(); + kunmap_local(kaddr); flush_dcache_page(dst); return ret; -- cgit v1.2.3 From f8b6187d8dd98fd32fe393071f362a7b6beaad0a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 18 Nov 2023 10:32:29 +0800 Subject: mm: memory: use a folio in validate_page_before_insert() Use a folio in validate_page_before_insert() to save two compound_head() calls. Link: https://lkml.kernel.org/r/20231118023232.1409103-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sidhartha Kumar Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index a8ff3489211b..5a917b21a122 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1842,9 +1842,12 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, static int validate_page_before_insert(struct page *page) { - if (PageAnon(page) || PageSlab(page) || page_has_type(page)) + struct folio *folio = page_folio(page); + + if (folio_test_anon(folio) || folio_test_slab(folio) || + page_has_type(page)) return -EINVAL; - flush_dcache_page(page); + flush_dcache_folio(folio); return 0; } -- cgit v1.2.3 From 294de6d8f14a69f1251b94223ba9d90d64b28cec Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 18 Nov 2023 10:32:30 +0800 Subject: mm: memory: rename page_copy_prealloc() to folio_prealloc() Let's rename page_copy_prealloc() to folio_prealloc(), which could be reused in more functons, as it maybe zero the new page, pass a new need_zero to it, and call the vma_alloc_zeroed_movable_folio() if need_zero is true. 
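As a minimal illustrative sketch (not part of this patch), the need_zero argument is used roughly as in the callers converted later in this series; names and context below are taken from those later conversions:

	/* copy path: contents will be overwritten, no need to zero */
	folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false);
	if (!folio)
		return VM_FAULT_OOM;

	/* zero-fill path: request a zeroed movable folio instead */
	folio = folio_prealloc(vma->vm_mm, vma, vmf->address, true);

The choice between a plain movable folio and a zeroed one is made inside the helper, so callers no longer need to open-code vma_alloc_zeroed_movable_folio() themselves.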
Link: https://lkml.kernel.org/r/20231118023232.1409103-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sidhartha Kumar Reviewed-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 5a917b21a122..32fb7e066197 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -992,12 +992,17 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, return 0; } -static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm, - struct vm_area_struct *vma, unsigned long addr) +static inline struct folio *folio_prealloc(struct mm_struct *src_mm, + struct vm_area_struct *vma, unsigned long addr, bool need_zero) { struct folio *new_folio; - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); + if (need_zero) + new_folio = vma_alloc_zeroed_movable_folio(vma, addr); + else + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + addr, false); + if (!new_folio) return NULL; @@ -1129,7 +1134,7 @@ again: } else if (ret == -EBUSY) { goto out; } else if (ret == -EAGAIN) { - prealloc = page_copy_prealloc(src_mm, src_vma, addr); + prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; } else if (ret) { -- cgit v1.2.3 From e4621e70469c3ac6e1b6914f1c42941a8a6e44d2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 18 Nov 2023 10:32:31 +0800 Subject: mm: memory: use a folio in do_cow_fault() Use folio_prealloc() helper and convert to use a folio in do_cow_fault(), which save five compound_head() calls. Link: https://lkml.kernel.org/r/20231118023232.1409103-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/memory.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 32fb7e066197..3d92b4a7b6e4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4656,6 +4656,7 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct folio *folio; vm_fault_t ret; ret = vmf_can_call_fault(vmf); @@ -4664,16 +4665,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (ret) return ret; - vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); - if (!vmf->cow_page) + folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false); + if (!folio) return VM_FAULT_OOM; - if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm, - GFP_KERNEL)) { - put_page(vmf->cow_page); - return VM_FAULT_OOM; - } - folio_throttle_swaprate(page_folio(vmf->cow_page), GFP_KERNEL); + vmf->cow_page = &folio->page; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -4682,7 +4678,7 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) return ret; copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); - __SetPageUptodate(vmf->cow_page); + __folio_mark_uptodate(folio); ret |= finish_fault(vmf); unlock_page(vmf->page); @@ -4691,7 +4687,7 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; uncharge_out: - put_page(vmf->cow_page); + folio_put(folio); return ret; } -- cgit v1.2.3 From 
cf503cc665c442ce9893cb12561c57a328465e29 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 18 Nov 2023 10:32:32 +0800 Subject: mm: memory: use folio_prealloc() in wp_page_copy() Use folio_prealloc() helper to simplify code a bit. Link: https://lkml.kernel.org/r/20231118023232.1409103-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Sidhartha Kumar Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 3d92b4a7b6e4..99582b188ed2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3117,6 +3117,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) int page_copied = 0; struct mmu_notifier_range range; vm_fault_t ret; + bool pfn_is_zero; delayacct_wpcopy_start(); @@ -3126,16 +3127,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (unlikely(ret)) goto out; - if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { - new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); - if (!new_folio) - goto oom; - } else { + pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte)); + new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero); + if (!new_folio) + goto oom; + + if (!pfn_is_zero) { int err; - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, - vmf->address, false); - if (!new_folio) - goto oom; err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); if (err) { @@ -3156,10 +3154,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) kmsan_copy_page_meta(&new_folio->page, vmf->page); } - if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL)) - goto oom_free_new; - folio_throttle_swaprate(new_folio, GFP_KERNEL); - __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, @@ -3258,8 +3252,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) delayacct_wpcopy_end(); return 0; -oom_free_new: - folio_put(new_folio); oom: ret = VM_FAULT_OOM; out: -- cgit v1.2.3 From 3485b88390b0af9e05dc2c3f57e9936f41e159a0 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:04 +0000 Subject: mm: thp: introduce multi-size THP sysfs interface In preparation for adding support for anonymous multi-size THP, introduce new sysfs structure that will be used to control the new behaviours. A new directory is added under transparent_hugepage for each supported THP size, and contains an `enabled` file, which can be set to "inherit" (to inherit the global setting), "always", "madvise" or "never". For now, the kernel still only supports PMD-sized anonymous THP, so only 1 directory is populated. The first half of the change converts transhuge_vma_suitable() and hugepage_vma_check() so that they take a bitfield of orders for which the user wants to determine support, and the functions filter out all the orders that can't be supported, given the current sysfs configuration and the VMA dimensions. The resulting functions are renamed to thp_vma_suitable_orders() and thp_vma_allowable_orders() respectively. Convenience functions that take a single, unencoded order and return a boolean are also defined as thp_vma_suitable_order() and thp_vma_allowable_order(). The second half of the change implements the new sysfs interface. It has been done so that each supported THP size has a `struct thpsize`, which describes the relevant metadata and is itself a kobject. 
This is pretty minimal for now, but should make it easy to add new per-thpsize files to the interface if needed in future (e.g. per-size defrag). Rather than keep the `enabled` state directly in the struct thpsize, I've elected to directly encode it into huge_anon_orders_[always|madvise|inherit] bitfields since this reduces the amount of work required in thp_vma_allowable_orders() which is called for every page fault. See Documentation/admin-guide/mm/transhuge.rst, as modified by this commit, for details of how the new sysfs interface works. [ryan.roberts@arm.com: fix build warning when CONFIG_SYSFS is disabled] Link: https://lkml.kernel.org/r/20231211125320.3997543-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20231207161211.2374093-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Barry Song Tested-by: Kefeng Wang Tested-by: John Hubbard Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 97 +++++++++--- Documentation/filesystems/proc.rst | 6 +- fs/proc/task_mmu.c | 3 +- include/linux/huge_mm.h | 181 ++++++++++++++++++---- mm/huge_memory.c | 231 +++++++++++++++++++++++++---- mm/khugepaged.c | 20 ++- mm/memory.c | 6 +- mm/page_vma_mapped.c | 3 +- 8 files changed, 459 insertions(+), 88 deletions(-) (limited to 'mm/memory.c') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index b0cc8243e093..04eb45a2f940 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -45,10 +45,25 @@ components: the two is using hugepages just because of the fact the TLB miss is going to run faster. +Modern kernels support "multi-size THP" (mTHP), which introduces the +ability to allocate memory in blocks that are bigger than a base page +but smaller than traditional PMD-size (as described above), in +increments of a power-of-2 number of pages. mTHP can back anonymous +memory (for example 16K, 32K, 64K, etc). These THPs continue to be +PTE-mapped, but in many cases can still provide similar benefits to +those outlined above: Page faults are significantly reduced (by a +factor of e.g. 4, 8, 16, etc), but latency spikes are much less +prominent because the size of each page isn't as huge as the PMD-sized +variant and there is less memory to clear in each page fault. Some +architectures also employ TLB compression mechanisms to squeeze more +entries in when a set of PTEs are virtually and physically contiguous +and approporiately aligned. In this case, TLB misses will occur less +often. + THP can be enabled system wide or restricted to certain tasks or even memory ranges inside task's address space. Unless THP is completely disabled, there is ``khugepaged`` daemon that scans memory and -collapses sequences of basic pages into huge pages. +collapses sequences of basic pages into PMD-sized huge pages. The THP behaviour is controlled via :ref:`sysfs ` interface and using madvise(2) and prctl(2) system calls. 
@@ -95,12 +110,40 @@ Global THP controls Transparent Hugepage Support for anonymous memory can be entirely disabled (mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to avoid the risk of consuming more memory resources) or enabled -system wide. This can be achieved with one of:: +system wide. This can be achieved per-supported-THP-size with one of:: + + echo always >/sys/kernel/mm/transparent_hugepage/hugepages-kB/enabled + echo madvise >/sys/kernel/mm/transparent_hugepage/hugepages-kB/enabled + echo never >/sys/kernel/mm/transparent_hugepage/hugepages-kB/enabled + +where is the hugepage size being addressed, the available sizes +for which vary by system. + +For example:: + + echo always >/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled + +Alternatively it is possible to specify that a given hugepage size +will inherit the top-level "enabled" value:: + + echo inherit >/sys/kernel/mm/transparent_hugepage/hugepages-kB/enabled + +For example:: + + echo inherit >/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled + +The top-level setting (for use with "inherit") can be set by issuing +one of the following commands:: echo always >/sys/kernel/mm/transparent_hugepage/enabled echo madvise >/sys/kernel/mm/transparent_hugepage/enabled echo never >/sys/kernel/mm/transparent_hugepage/enabled +By default, PMD-sized hugepages have enabled="inherit" and all other +hugepage sizes have enabled="never". If enabling multiple hugepage +sizes, the kernel will select the most appropriate enabled size for a +given allocation. + It's also possible to limit defrag efforts in the VM to generate anonymous hugepages in case they're not immediately free to madvise regions or to never try to defrag memory and simply fallback to regular @@ -146,25 +189,34 @@ madvise never should be self-explanatory. -By default kernel tries to use huge zero page on read page fault to -anonymous mapping. It's possible to disable huge zero page by writing 0 -or enable it back by writing 1:: +By default kernel tries to use huge, PMD-mappable zero page on read +page fault to anonymous mapping. It's possible to disable huge zero +page by writing 0 or enable it back by writing 1:: echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page -Some userspace (such as a test program, or an optimized memory allocation -library) may want to know the size (in bytes) of a transparent hugepage:: +Some userspace (such as a test program, or an optimized memory +allocation library) may want to know the size (in bytes) of a +PMD-mappable transparent hugepage:: cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size -khugepaged will be automatically started when -transparent_hugepage/enabled is set to "always" or "madvise, and it'll -be automatically shutdown if it's set to "never". +khugepaged will be automatically started when one or more hugepage +sizes are enabled (either by directly setting "always" or "madvise", +or by setting "inherit" while the top-level enabled is set to "always" +or "madvise"), and it'll be automatically shutdown when the last +hugepage size is disabled (either by directly setting "never", or by +setting "inherit" while the top-level enabled is set to "never"). Khugepaged controls ------------------- +.. note:: + khugepaged currently only searches for opportunities to collapse to + PMD-sized THP and no attempt is made to collapse to other THP + sizes. 
+ khugepaged runs usually at low frequency so while one may not want to invoke defrag algorithms synchronously during the page faults, it should be worth invoking defrag at least in khugepaged. However it's @@ -282,19 +334,26 @@ force Need of application restart =========================== -The transparent_hugepage/enabled values and tmpfs mount option only affect -future behavior. So to make them effective you need to restart any -application that could have been using hugepages. This also applies to the -regions registered in khugepaged. +The transparent_hugepage/enabled and +transparent_hugepage/hugepages-kB/enabled values and tmpfs mount +option only affect future behavior. So to make them effective you need +to restart any application that could have been using hugepages. This +also applies to the regions registered in khugepaged. Monitoring usage ================ -The number of anonymous transparent huge pages currently used by the +.. note:: + Currently the below counters only record events relating to + PMD-sized THP. Events relating to other THP sizes are not included. + +The number of PMD-sized anonymous transparent huge pages currently used by the system is available by reading the AnonHugePages field in ``/proc/meminfo``. -To identify what applications are using anonymous transparent huge pages, -it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields -for each mapping. +To identify what applications are using PMD-sized anonymous transparent huge +pages, it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages +fields for each mapping. (Note that AnonHugePages only applies to traditional +PMD-sized THP for historical reasons and should have been called +AnonHugePmdMapped). The number of file transparent huge pages mapped to userspace is available by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``. @@ -413,7 +472,7 @@ for huge pages. Optimizing the applications =========================== -To be guaranteed that the kernel will map a 2M page immediately in any +To be guaranteed that the kernel will map a THP immediately in any memory region, the mmap region has to be hugepage naturally aligned. posix_memalign() can provide that guarantee. diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 49ef12df631b..104c6d047d9b 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -528,9 +528,9 @@ replaced by copy-on-write) part of the underlying shmem object out on swap. does not take into account swapped out page of underlying shmem objects. "Locked" indicates whether the mapping is locked in memory or not. -"THPeligible" indicates whether the mapping is eligible for allocating THP -pages as well as the THP is PMD mappable or not - 1 if true, 0 otherwise. -It just shows the current status. +"THPeligible" indicates whether the mapping is eligible for allocating +naturally aligned THP pages of any currently enabled size. 1 if true, 0 +otherwise. "VmFlags" field deserves a separate description. 
This member represents the kernel flags associated with the particular virtual memory area in two letter diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d19924bf0a39..79855e1c5b57 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -865,7 +865,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", - hugepage_vma_check(vma, vma->vm_flags, true, false, true)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false, + true, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fa0350b0812a..609c153bae57 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -67,6 +67,24 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR)) + hpage_size >> PAGE_SHIFT)) return false; } - haddr = addr & HPAGE_PMD_MASK; + haddr = ALIGN_DOWN(addr, hpage_size); - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + if (haddr < vma->vm_start || haddr + hpage_size > vma->vm_end) return false; return true; } +/* + * Filter the bitfield of input orders to the ones suitable for use in the vma. + * See thp_vma_suitable_order(). + * All orders that pass the checks are returned as a bitfield. + */ +static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, + unsigned long addr, unsigned long orders) +{ + int order; + + /* + * Iterate over orders, highest to lowest, removing orders that don't + * meet alignment requirements from the set. Exit loop at first order + * that meets requirements, since all lower orders must also meet + * requirements. + */ + + order = highest_order(orders); + + while (orders) { + if (thp_vma_suitable_order(vma, addr, order)) + break; + order = next_order(&orders, order); + } + + return orders; +} + static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; @@ -130,8 +208,52 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps, bool in_pf, bool enforce_sysfs); +unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders); + +/** + * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma + * @vma: the vm area to check + * @vm_flags: use these vm_flags instead of vma->vm_flags + * @smaps: whether answer will be used for smaps file + * @in_pf: whether answer will be used by page fault handler + * @enforce_sysfs: whether sysfs config should be taken into account + * @orders: bitfield of all orders to consider + * + * Calculates the intersection of the requested hugepage orders and the allowed + * hugepage orders for the provided vma. Permitted orders are encoded as a set + * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3 + * corresponds to order-3, etc). Order-0 is never considered a hugepage order. + * + * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage + * orders are allowed. 
+ */ +static inline +unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + /* Optimization to check if required orders are enabled early. */ + if (enforce_sysfs && vma_is_anonymous(vma)) { + unsigned long mask = READ_ONCE(huge_anon_orders_always); + + if (vm_flags & VM_HUGEPAGE) + mask |= READ_ONCE(huge_anon_orders_madvise); + if (hugepage_global_always() || + ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled())) + mask |= READ_ONCE(huge_anon_orders_inherit); + + orders &= mask; + if (!orders) + return 0; + } + + return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, + enforce_sysfs, orders); +} #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -267,17 +389,24 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) return false; } -static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long addr) +static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, + unsigned long addr, int order) { return false; } -static inline bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs) +static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, + unsigned long addr, unsigned long orders) { - return false; + return 0; +} + +static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + return 0; } static inline void folio_prep_large_rmappable(struct folio *folio) {} diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c848ea97ab02..387b030c7f15 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -74,12 +74,23 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; +unsigned long huge_anon_orders_always __read_mostly; +unsigned long huge_anon_orders_madvise __read_mostly; +unsigned long huge_anon_orders_inherit __read_mostly; + +unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + /* Check the intersection of requested and supported orders. */ + orders &= vma_is_anonymous(vma) ? + THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; + if (!orders) + return 0; -bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps, bool in_pf, bool enforce_sysfs) -{ if (!vma->vm_mm) /* vdso */ - return false; + return 0; /* * Explicitly disabled through madvise or prctl, or some @@ -88,16 +99,16 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * */ if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; + return 0; /* * If the hardware/firmware marked hugepage support disabled. */ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) - return false; + return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ if (vma_is_dax(vma)) - return in_pf; + return in_pf ? orders : 0; /* * khugepaged special VMA and hugetlb VMA. @@ -105,17 +116,29 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * VM_MIXEDMAP set. 
*/ if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) - return false; + return 0; /* - * Check alignment for file vma and size for both file and anon vma. + * Check alignment for file vma and size for both file and anon vma by + * filtering out the unsuitable orders. * * Skip the check for page fault. Huge fault does the check in fault - * handlers. And this check is not suitable for huge PUD fault. + * handlers. */ - if (!in_pf && - !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) - return false; + if (!in_pf) { + int order = highest_order(orders); + unsigned long addr; + + while (orders) { + addr = vma->vm_end - (PAGE_SIZE << order); + if (thp_vma_suitable_order(vma, addr, order)) + break; + order = next_order(&orders, order); + } + + if (!orders) + return 0; + } /* * Enabled via shmem mount options or sysfs settings. @@ -124,29 +147,33 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, */ if (!in_pf && shmem_file(vma->vm_file)) return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, - !enforce_sysfs, vma->vm_mm, vm_flags); - - /* Enforce sysfs THP requirements as necessary */ - if (enforce_sysfs && - (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) && - !hugepage_flags_always()))) - return false; + !enforce_sysfs, vma->vm_mm, vm_flags) + ? orders : 0; if (!vma_is_anonymous(vma)) { + /* + * Enforce sysfs THP requirements as necessary. Anonymous vmas + * were already handled in thp_vma_allowable_orders(). + */ + if (enforce_sysfs && + (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && + !hugepage_global_always()))) + return 0; + /* * Trust that ->huge_fault() handlers know what they are doing * in fault path. */ if (((in_pf || smaps)) && vma->vm_ops->huge_fault) - return true; + return orders; /* Only regular file is valid in collapse path */ if (((!in_pf || smaps)) && file_thp_enabled(vma)) - return true; - return false; + return orders; + return 0; } if (vma_is_temporary_stack(vma)) - return false; + return 0; /* * THPeligible bit of smaps should show 1 for proper VMAs even @@ -156,9 +183,9 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * the first page fault. */ if (!vma->anon_vma) - return (smaps || in_pf); + return (smaps || in_pf) ? 
orders : 0; - return true; + return orders; } static bool get_huge_zero_page(void) @@ -412,9 +439,136 @@ static const struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, }; +static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); +static void thpsize_release(struct kobject *kobj); +static DEFINE_SPINLOCK(huge_anon_orders_lock); +static LIST_HEAD(thpsize_list); + +struct thpsize { + struct kobject kobj; + struct list_head node; + int order; +}; + +#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) + +static ssize_t thpsize_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_anon_orders_always)) + output = "[always] inherit madvise never"; + else if (test_bit(order, &huge_anon_orders_inherit)) + output = "always [inherit] madvise never"; + else if (test_bit(order, &huge_anon_orders_madvise)) + output = "always inherit [madvise] never"; + else + output = "always inherit madvise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t thpsize_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_inherit); + clear_bit(order, &huge_anon_orders_madvise); + set_bit(order, &huge_anon_orders_always); + spin_unlock(&huge_anon_orders_lock); + } else if (sysfs_streq(buf, "inherit")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_always); + clear_bit(order, &huge_anon_orders_madvise); + set_bit(order, &huge_anon_orders_inherit); + spin_unlock(&huge_anon_orders_lock); + } else if (sysfs_streq(buf, "madvise")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_always); + clear_bit(order, &huge_anon_orders_inherit); + set_bit(order, &huge_anon_orders_madvise); + spin_unlock(&huge_anon_orders_lock); + } else if (sysfs_streq(buf, "never")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_always); + clear_bit(order, &huge_anon_orders_inherit); + clear_bit(order, &huge_anon_orders_madvise); + spin_unlock(&huge_anon_orders_lock); + } else + ret = -EINVAL; + + return ret; +} + +static struct kobj_attribute thpsize_enabled_attr = + __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store); + +static struct attribute *thpsize_attrs[] = { + &thpsize_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group thpsize_attr_group = { + .attrs = thpsize_attrs, +}; + +static const struct kobj_type thpsize_ktype = { + .release = &thpsize_release, + .sysfs_ops = &kobj_sysfs_ops, +}; + +static struct thpsize *thpsize_create(int order, struct kobject *parent) +{ + unsigned long size = (PAGE_SIZE << order) / SZ_1K; + struct thpsize *thpsize; + int ret; + + thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL); + if (!thpsize) + return ERR_PTR(-ENOMEM); + + ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, + "hugepages-%lukB", size); + if (ret) { + kfree(thpsize); + return ERR_PTR(ret); + } + + ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group); + if (ret) { + kobject_put(&thpsize->kobj); + return ERR_PTR(ret); + } + + thpsize->order = order; + return thpsize; +} + +static void thpsize_release(struct kobject *kobj) +{ + kfree(to_thpsize(kobj)); +} + static int __init hugepage_init_sysfs(struct kobject 
**hugepage_kobj) { int err; + struct thpsize *thpsize; + unsigned long orders; + int order; + + /* + * Default to setting PMD-sized THP to inherit the global setting and + * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time + * constant so we have to do this here. + */ + huge_anon_orders_inherit = BIT(PMD_ORDER); *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { @@ -434,8 +588,24 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) goto remove_hp_group; } + orders = THP_ORDERS_ALL_ANON; + order = highest_order(orders); + while (orders) { + thpsize = thpsize_create(order, *hugepage_kobj); + if (IS_ERR(thpsize)) { + pr_err("failed to create thpsize for order %d\n", order); + err = PTR_ERR(thpsize); + goto remove_all; + } + list_add(&thpsize->node, &thpsize_list); + order = next_order(&orders, order); + } + return 0; +remove_all: + hugepage_exit_sysfs(*hugepage_kobj); + return err; remove_hp_group: sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); delete_obj: @@ -445,6 +615,13 @@ delete_obj: static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) { + struct thpsize *thpsize, *tmp; + + list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) { + list_del(&thpsize->node); + kobject_put(&thpsize->kobj); + } + sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); kobject_put(hugepage_kobj); @@ -811,7 +988,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - if (!transhuge_vma_suitable(vma, haddr)) + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 064654717843..d72aecd3624a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -446,7 +446,8 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (hugepage_vma_check(vma, vm_flags, false, false, true)) + if (thp_vma_allowable_order(vma, vm_flags, false, false, true, + PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } @@ -922,16 +923,16 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!vma) return SCAN_VMA_NULL; - if (!transhuge_vma_suitable(vma, address)) + if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, - cc->is_khugepaged)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, + cc->is_khugepaged, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then * remapped to file after khugepaged reaquired the mmap_lock. * - * hugepage_vma_check may return true for qualified file + * thp_vma_allowable_order may return true for qualified file * vmas. */ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) @@ -1503,7 +1504,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here. 
*/ - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, + PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2368,7 +2370,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, + true, PMD_ORDER)) { skip: progress++; continue; @@ -2705,7 +2708,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, + PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/memory.c b/mm/memory.c index 99582b188ed2..8ab2d994d997 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4322,7 +4322,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) pmd_t entry; vm_fault_t ret = VM_FAULT_FALLBACK; - if (!transhuge_vma_suitable(vma, haddr)) + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; page = compound_head(page); @@ -5116,7 +5116,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - hugepage_vma_check(vma, vm_flags, false, true, true)) { + thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5150,7 +5150,7 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - hugepage_vma_check(vma, vm_flags, false, true, true)) { + thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index e0b368e545ed..74d2de15fb5e 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -268,7 +268,8 @@ restart: * cleared *pmd but not decremented compound_mapcount(). */ if ((pvmw->flags & PVMW_SYNC) && - transhuge_vma_suitable(vma, pvmw->address) && + thp_vma_suitable_order(vma, pvmw->address, + PMD_ORDER) && (pvmw->nr_pages >= HPAGE_PMD_NR)) { spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); -- cgit v1.2.3 From 19eaf44954df64f9bc8dec398219e15ad0811497 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:05 +0000 Subject: mm: thp: support allocation of anonymous multi-size THP Introduce the logic to allow THP to be configured (through the new sysfs interface we just added) to allocate large folios to back anonymous memory, which are larger than the base page size but smaller than PMD-size. We call this new THP extension "multi-size THP" (mTHP). mTHP continues to be PTE-mapped, but in many cases can still provide similar benefits to traditional PMD-sized THP: Page faults are significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on the configured order), but latency spikes are much less prominent because the size of each page isn't as huge as the PMD-sized variant and there is less memory to clear in each page fault. The number of per-page operations (e.g. ref counting, rmap management, lru list management) are also significantly reduced since those ops now become per-folio. 
Some architectures also employ TLB compression mechanisms to squeeze more entries in when a set of PTEs are virtually and physically contiguous and approporiately aligned. In this case, TLB misses will occur less often. The new behaviour is disabled by default, but can be enabled at runtime by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled (see documentation in previous commit). The long term aim is to change the default to include suitable lower orders, but there are some risks around internal fragmentation that need to be better understood first. [ryan.roberts@arm.com: resolve some multi-size THP review nits] Link: https://lkml.kernel.org/r/20231214160251.3574571-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Kefeng Wang Tested-by: John Hubbard Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ++- mm/memory.c | 109 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 104 insertions(+), 11 deletions(-) (limited to 'mm/memory.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 609c153bae57..fa7a38a30fc6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_NR (1<vma; + unsigned long orders; + struct folio *folio; + unsigned long addr; + pte_t *pte; + gfp_t gfp; + int order; + + /* + * If uffd is active for the vma we need per-page fault fidelity to + * maintain the uffd semantics. + */ + if (unlikely(userfaultfd_armed(vma))) + goto fallback; + + /* + * Get a list of all the (large) orders below PMD_ORDER that are enabled + * for this vma. Then filter out the orders that can't be allocated over + * the faulting address and still be fully contained in the vma. + */ + orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true, + BIT(PMD_ORDER) - 1); + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + + if (!orders) + goto fallback; + + pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK); + if (!pte) + return ERR_PTR(-EAGAIN); + + /* + * Find the highest order where the aligned range is completely + * pte_none(). Note that all remaining orders will be completely + * pte_none(). + */ + order = highest_order(orders); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + if (pte_range_none(pte + pte_index(addr), 1 << order)) + break; + order = next_order(&orders, order); + } + + pte_unmap(pte); + + /* Try allocating the highest of the remaining orders. */ + gfp = vma_thp_gfp_mask(vma); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + folio = vma_alloc_folio(gfp, order, vma, addr, true); + if (folio) { + clear_huge_page(&folio->page, vmf->address, 1 << order); + return folio; + } + order = next_order(&orders, order); + } + +fallback: +#endif + return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address); +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. 
@@ -4134,9 +4212,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; + int nr_pages = 1; pte_t entry; + int i; /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) @@ -4176,10 +4257,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); + /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ + folio = alloc_anon_folio(vmf); + if (IS_ERR(folio)) + return 0; if (!folio) goto oom; + nr_pages = folio_nr_pages(folio); + addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; folio_throttle_swaprate(folio, GFP_KERNEL); @@ -4196,12 +4283,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; - if (vmf_pte_changed(vmf)) { - update_mmu_tlb(vma, vmf->address, vmf->pte); + if (nr_pages == 1 && vmf_pte_changed(vmf)) { + update_mmu_tlb(vma, addr, vmf->pte); + goto release; + } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { + for (i = 0; i < nr_pages; i++) + update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i); goto release; } @@ -4216,16 +4306,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_MISSING); } - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); - folio_add_new_anon_rmap(folio, vma, vmf->address); + folio_ref_add(folio, nr_pages - 1); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); + folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); setpte: if (uffd_wp) entry = pte_mkuffd_wp(entry); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); + set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); /* No need to invalidate - it was non-present before */ - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); + update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); -- cgit v1.2.3 From f7ef5fe74aaf634f2fe50e0f3339a405cced5d01 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Thu, 14 Dec 2023 09:10:04 +0100 Subject: mm/memory: replace kmap() with kmap_local_page() kmap() has been deprecated in favor of kmap_local_page(). Therefore, replace kmap() with kmap_local_page() in mm/memory.c. There are two main problems with kmap(): (1) It comes with an overhead as the mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page-faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. The tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. 
Obviously, thread locality implies that the kernel virtual addresses returned by kmap_local_page() are only valid in the context of the callers (i.e., they cannot be handed to other threads). The use of kmap_local_page() in mm/memory.c does not break the above-mentioned assumption, so it is allowed and preferred. Link: https://lkml.kernel.org/r/20231215084417.2002370-1-fabio.maria.de.francesco@linux.intel.com Link: https://lkml.kernel.org/r/20231214081039.1919328-1-fabio.maria.de.francesco@linux.intel.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Ira Weiny Signed-off-by: Andrew Morton --- mm/memory.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 3c530b639559..b9cc56a75f4b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5944,7 +5944,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; - maddr = kmap(page); + maddr = kmap_local_page(page); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); @@ -5953,8 +5953,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } - kunmap(page); - put_page(page); + unmap_and_put_page(page, maddr); } len -= bytes; buf += bytes; -- cgit v1.2.3 From 96db66d9c8f3c1547325af01b1f328b85d6ee1b9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 11 Dec 2023 16:22:06 +0000 Subject: mm: convert ksm_might_need_to_copy() to work on folios Patch series "Finish two folio conversions". Most callers of page_add_new_anon_rmap() and lru_cache_add_inactive_or_unevictable() have been converted to their folio equivalents, but there are still a few stragglers. There's a bit of preparatory work in ksm and unuse_pte(), but after that it's pretty mechanical. This patch (of 9): Accept a folio as an argument and return a folio result. Removes a call to compound_head() in do_swap_page(), and prevents folio & page from getting out of sync in unuse_pte(). Reviewed-by: David Hildenbrand [willy@infradead.org: fix smatch warning] Link: https://lkml.kernel.org/r/ZXnPtblC6A1IkyAB@casper.infradead.org [david@redhat.com: only adjust the page if the folio changed] Link: https://lkml.kernel.org/r/6a8f2110-fa91-4c10-9eae-88315309a6e3@redhat.com Link: https://lkml.kernel.org/r/20231211162214.2146080-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231211162214.2146080-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/ksm.h | 6 +++--- mm/ksm.c | 21 +++++++++++---------- mm/memory.c | 11 +++++++---- mm/swapfile.c | 8 +++++--- 4 files changed, 26 insertions(+), 20 deletions(-) (limited to 'mm/memory.c') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 4643d5244e77..401348e9f92b 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -76,7 +76,7 @@ static inline void ksm_exit(struct mm_struct *mm) * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? 
*/ -struct page *ksm_might_need_to_copy(struct page *page, +struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); @@ -129,10 +129,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, return 0; } -static inline struct page *ksm_might_need_to_copy(struct page *page, +static inline struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { - return page; + return folio; } static inline void rmap_walk_ksm(struct folio *folio, diff --git a/mm/ksm.c b/mm/ksm.c index c0e1995fb444..e2ce850c2739 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2873,30 +2873,30 @@ void __ksm_exit(struct mm_struct *mm) trace_ksm_exit(mm); } -struct page *ksm_might_need_to_copy(struct page *page, +struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = page_folio(page); + struct page *page = folio_page(folio, 0); struct anon_vma *anon_vma = folio_anon_vma(folio); struct folio *new_folio; if (folio_test_large(folio)) - return page; + return folio; if (folio_test_ksm(folio)) { if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) - return page; /* no need to copy it */ + return folio; /* no need to copy it */ } else if (!anon_vma) { - return page; /* no need to copy it */ + return folio; /* no need to copy it */ } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { - return page; /* still no need to copy it */ + return folio; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!folio_test_uptodate(folio)) - return page; /* let do_swap_page report the error */ + return folio; /* let do_swap_page report the error */ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); if (new_folio && @@ -2905,9 +2905,10 @@ struct page *ksm_might_need_to_copy(struct page *page, new_folio = NULL; } if (new_folio) { - if (copy_mc_user_highpage(&new_folio->page, page, addr, vma)) { + if (copy_mc_user_highpage(folio_page(new_folio, 0), page, + addr, vma)) { folio_put(new_folio); - memory_failure_queue(page_to_pfn(page), 0); + memory_failure_queue(folio_pfn(folio), 0); return ERR_PTR(-EHWPOISON); } folio_set_dirty(new_folio); @@ -2918,7 +2919,7 @@ struct page *ksm_might_need_to_copy(struct page *page, #endif } - return new_folio ? &new_folio->page : NULL; + return new_folio; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) diff --git a/mm/memory.c b/mm/memory.c index b9cc56a75f4b..7649cb9eb7f5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3942,15 +3942,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * page->index of !PageKSM() pages would be nonlinear inside the * anon VMA -- PageKSM() is lost on actual swapout. 
*/ - page = ksm_might_need_to_copy(page, vma, vmf->address); - if (unlikely(!page)) { + folio = ksm_might_need_to_copy(folio, vma, vmf->address); + if (unlikely(!folio)) { ret = VM_FAULT_OOM; + folio = swapcache; goto out_page; - } else if (unlikely(PTR_ERR(page) == -EHWPOISON)) { + } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { ret = VM_FAULT_HWPOISON; + folio = swapcache; goto out_page; } - folio = page_folio(page); + if (folio != swapcache) + page = folio_page(folio, 0); /* * If we want to map a page that's in the swapcache writable, we diff --git a/mm/swapfile.c b/mm/swapfile.c index 8be70912e298..0371b7b3cd27 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1749,11 +1749,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, int ret = 1; swapcache = page; - page = ksm_might_need_to_copy(page, vma, addr); - if (unlikely(!page)) + folio = ksm_might_need_to_copy(folio, vma, addr); + if (unlikely(!folio)) return -ENOMEM; - else if (unlikely(PTR_ERR(page) == -EHWPOISON)) + else if (unlikely(folio == ERR_PTR(-EHWPOISON))) hwpoisoned = true; + else + page = folio_file_page(folio, swp_offset(entry)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), -- cgit v1.2.3 From 2853b66b601a265306be709b4d86aaff7d92a0fc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 11 Dec 2023 16:22:09 +0000 Subject: mm: remove some calls to page_add_new_anon_rmap() We already have the folio in these functions, we just need to use it. folio_add_new_anon_rmap() didn't exist at the time they were converted to folios. Link: https://lkml.kernel.org/r/20231211162214.2146080-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 2 +- mm/memory.c | 2 +- mm/userfaultfd.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/memory.c') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 435aac1d8c27..8b115fc43f04 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { folio_get(new_folio); - page_add_new_anon_rmap(new_page, vma, addr); + folio_add_new_anon_rmap(new_folio, vma, addr); folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/mm/memory.c b/mm/memory.c index 7649cb9eb7f5..5c023bef2adb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4071,7 +4071,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { - page_add_new_anon_rmap(page, vma, vmf->address); + folio_add_new_anon_rmap(folio, vma, vmf->address); folio_add_lru_vma(folio, vma); } else { page_add_anon_rmap(page, vma, vmf->address, rmap_flags); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9ec814e47e99..203cda9192c2 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -116,7 +116,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, folio_add_lru(folio); page_add_file_rmap(page, dst_vma, false); } else { - page_add_new_anon_rmap(page, dst_vma, dst_addr); + folio_add_new_anon_rmap(folio, dst_vma, dst_addr); folio_add_lru_vma(folio, dst_vma); } -- cgit v1.2.3 From c9bdf768dd9319d2d80a334646e2c8116af9e430 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:39 +0000 Subject: mm: convert swap_readpage() to swap_read_folio() All callers have a folio, 
so pass it in, saving two calls to compound_head(). Link: https://lkml.kernel.org/r/20231213215842.671461-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 4 ++-- mm/page_io.c | 18 +++++++++--------- mm/swap.h | 5 +++-- mm/swap_state.c | 12 ++++++------ mm/swapfile.c | 2 +- 5 files changed, 21 insertions(+), 20 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 5c023bef2adb..cfcaf4c0198c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3882,9 +3882,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_add_lru(folio); - /* To provide entry to swap_readpage() */ + /* To provide entry to swap_read_folio() */ folio->swap = entry; - swap_readpage(page, true, NULL); + swap_read_folio(folio, true, NULL); folio->private = NULL; } } else { diff --git a/mm/page_io.c b/mm/page_io.c index 6736c56526bf..09c6a4f316f3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -420,7 +420,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } -static void swap_readpage_fs(struct folio *folio, struct swap_iocb **plug) +static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = swp_swap_info(folio->swap); struct swap_iocb *sio = NULL; @@ -454,7 +454,7 @@ static void swap_readpage_fs(struct folio *folio, struct swap_iocb **plug) *plug = sio; } -static void swap_readpage_bdev_sync(struct folio *folio, +static void swap_read_folio_bdev_sync(struct folio *folio, struct swap_info_struct *sis) { struct bio_vec bv; @@ -474,7 +474,7 @@ static void swap_readpage_bdev_sync(struct folio *folio, put_task_struct(current); } -static void swap_readpage_bdev_async(struct folio *folio, +static void swap_read_folio_bdev_async(struct folio *folio, struct swap_info_struct *sis) { struct bio *bio; @@ -487,10 +487,10 @@ static void swap_readpage_bdev_async(struct folio *folio, submit_bio(bio); } -void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) +void swap_read_folio(struct folio *folio, bool synchronous, + struct swap_iocb **plug) { - struct folio *folio = page_folio(page); - struct swap_info_struct *sis = page_swap_info(page); + struct swap_info_struct *sis = swp_swap_info(folio->swap); bool workingset = folio_test_workingset(folio); unsigned long pflags; bool in_thrashing; @@ -514,11 +514,11 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) folio_mark_uptodate(folio); folio_unlock(folio); } else if (data_race(sis->flags & SWP_FS_OPS)) { - swap_readpage_fs(folio, plug); + swap_read_folio_fs(folio, plug); } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) { - swap_readpage_bdev_sync(folio, sis); + swap_read_folio_bdev_sync(folio, sis); } else { - swap_readpage_bdev_async(folio, sis); + swap_read_folio_bdev_async(folio, sis); } if (workingset) { diff --git a/mm/swap.h b/mm/swap.h index b81587740cf1..859ae8f0fd2d 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -10,7 +10,8 @@ struct mempolicy; /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; -void swap_readpage(struct page *page, bool do_poll, struct swap_iocb **plug); +void swap_read_folio(struct folio *folio, bool do_poll, + struct swap_iocb **plug); void __swap_read_unplug(struct swap_iocb *plug); static inline void swap_read_unplug(struct swap_iocb *plug) { @@ -63,7 +64,7 @@ static inline unsigned int folio_swap_flags(struct folio *folio) } #else /* CONFIG_SWAP */ struct swap_iocb; -static inline void 
swap_readpage(struct page *page, bool do_poll, +static inline void swap_read_folio(struct folio *folio, bool do_poll, struct swap_iocb **plug) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 874b40a1f502..d2fe70e307d9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -530,7 +530,7 @@ fail_put_swap: * the swap entry is no longer in use. * * get/put_swap_device() aren't needed to call this function, because - * __read_swap_cache_async() call them and swap_readpage() holds the + * __read_swap_cache_async() call them and swap_read_folio() holds the * swap cache folio lock. */ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, @@ -548,7 +548,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, mpol_cond_put(mpol); if (page_allocated) - swap_readpage(&folio->page, false, plug); + swap_read_folio(folio, false, plug); return folio_file_page(folio, swp_offset(entry)); } @@ -665,7 +665,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!folio) continue; if (page_allocated) { - swap_readpage(&folio->page, false, &splug); + swap_read_folio(folio, false, &splug); if (offset != entry_offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); @@ -681,7 +681,7 @@ skip: folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(&folio->page, false, NULL); + swap_read_folio(folio, false, NULL); zswap_folio_swapin(folio); return folio_file_page(folio, swp_offset(entry)); } @@ -839,7 +839,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, if (!folio) continue; if (page_allocated) { - swap_readpage(&folio->page, false, &splug); + swap_read_folio(folio, false, &splug); if (i != ra_info.offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); @@ -857,7 +857,7 @@ skip: folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(&folio->page, false, NULL); + swap_read_folio(folio, false, NULL); zswap_folio_swapin(folio); return folio_file_page(folio, swp_offset(entry)); } diff --git a/mm/swapfile.c b/mm/swapfile.c index b22c47b11d65..f3e23a3d26ae 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2225,7 +2225,7 @@ EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is - * built at swapon time and is then used at swap_writepage/swap_readpage + * built at swapon time and is then used at swap_writepage/swap_read_folio * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. -- cgit v1.2.3 From 68f0320824fa59c5429cbc811e6c46e7a30ea32c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:31 +0100 Subject: mm/rmap: convert folio_add_file_rmap_range() into folio_add_file_rmap_[pte|ptes|pmd]() Let's get rid of the compound parameter and instead define explicitly which mappings we're adding. That is more future proof, easier to read and harder to mess up. Use an enum to express the granularity internally. Make the compiler always special-case on the granularity by using __always_inline. Replace the "compound" check by a switch-case that will be removed by the compiler completely. Add plenty of sanity checks with CONFIG_DEBUG_VM. Replace the folio_test_pmd_mappable() check by a config check in the caller and sanity checks. 
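As a rough, standalone illustration of that specialization pattern (a minimal userspace sketch, not the kernel implementation; the function names and printouts here are made up):

    #include <stdio.h>

    enum rmap_level { RMAP_LEVEL_PTE = 0, RMAP_LEVEL_PMD };

    /*
     * The level is a compile-time constant at every call site, so the
     * compiler folds the switch away and each wrapper keeps only its
     * own branch.
     */
    static inline __attribute__((always_inline))
    void __add_file_rmap(int nr_pages, enum rmap_level level)
    {
            switch (level) {
            case RMAP_LEVEL_PTE:
                    printf("add %d PTE mappings\n", nr_pages);
                    break;
            case RMAP_LEVEL_PMD:
                    printf("add one PMD mapping spanning %d pages\n", nr_pages);
                    break;
            }
    }

    static void add_file_rmap_ptes(int nr_pages) { __add_file_rmap(nr_pages, RMAP_LEVEL_PTE); }
    static void add_file_rmap_pmd(int nr_pages)  { __add_file_rmap(nr_pages, RMAP_LEVEL_PMD); }

    int main(void)
    {
            add_file_rmap_ptes(4);    /* e.g. four PTE mappings of a folio */
            add_file_rmap_pmd(512);   /* e.g. one PMD mapping of a 2M folio */
            return 0;
    }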
Convert the single user of folio_add_file_rmap_range(). While at it, consistently use "int" instead of "unsigned int" in rmap code when dealing with mapcounts and the number of pages. This function design can later easily be extended to PUDs and to batch PMDs. Note that for now we don't support anything bigger than PMD-sized folios (as we cleanly separated hugetlb handling). Sanity checks will catch if that ever changes. Next up is removing page_remove_rmap() along with its "compound" parameter and similarly converting all other rmap functions. Link: https://lkml.kernel.org/r/20231220224504.646757-8-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/rmap.h | 46 ++++++++++++++++++++++++++++-- mm/memory.c | 2 +- mm/rmap.c | 79 +++++++++++++++++++++++++++++++++------------------- 3 files changed, 95 insertions(+), 32 deletions(-) (limited to 'mm/memory.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d6fefa0f0410..3d86a76b2836 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -191,6 +191,44 @@ typedef int __bitwise rmap_t; */ #define RMAP_COMPOUND ((__force rmap_t)BIT(1)) +/* + * Internally, we're using an enum to specify the granularity. We make the + * compiler emit specialized code for each granularity. + */ +enum rmap_level { + RMAP_LEVEL_PTE = 0, + RMAP_LEVEL_PMD, +}; + +static inline void __folio_rmap_sanity_checks(struct folio *folio, + struct page *page, int nr_pages, enum rmap_level level) +{ + /* hugetlb folios are handled separately. */ + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(folio_test_large(folio) && + !folio_test_large_rmappable(folio), folio); + + VM_WARN_ON_ONCE(nr_pages <= 0); + VM_WARN_ON_FOLIO(page_folio(page) != folio, folio); + VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio); + + switch (level) { + case RMAP_LEVEL_PTE: + break; + case RMAP_LEVEL_PMD: + /* + * We don't support folios larger than a single PMD yet. So + * when RMAP_LEVEL_PMD is set, we assume that we are creating + * a single "entire" mapping of the folio.
+ */ + VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); + VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); + break; + default: + VM_WARN_ON_ONCE(true); + } +} + /* * rmap interfaces called when adding or removing pte of page */ @@ -201,8 +239,12 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); -void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr, - struct vm_area_struct *, bool compound); +void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, + struct vm_area_struct *); +#define folio_add_file_rmap_pte(folio, page, vma) \ + folio_add_file_rmap_ptes(folio, page, 1, vma) +void folio_add_file_rmap_pmd(struct folio *, struct page *, + struct vm_area_struct *); void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); diff --git a/mm/memory.c b/mm/memory.c index cfcaf4c0198c..9b977b2cf893 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4516,7 +4516,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, folio_add_lru_vma(folio, vma); } else { add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); - folio_add_file_rmap_range(folio, page, nr, vma, false); + folio_add_file_rmap_ptes(folio, page, nr, vma); } set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); diff --git a/mm/rmap.c b/mm/rmap.c index 6a1829324053..cc1fc2d570f0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1378,31 +1378,18 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); } -/** - * folio_add_file_rmap_range - add pte mapping to page range of a folio - * @folio: The folio to add the mapping to - * @page: The first page to add - * @nr_pages: The number of pages which will be mapped - * @vma: the vm area in which the mapping is added - * @compound: charge the page as compound or small page - * - * The page range of folio is defined by [first_page, first_page + nr_pages) - * - * The caller needs to hold the pte lock. - */ -void folio_add_file_rmap_range(struct folio *folio, struct page *page, - unsigned int nr_pages, struct vm_area_struct *vma, - bool compound) +static __always_inline void __folio_add_file_rmap(struct folio *folio, + struct page *page, int nr_pages, struct vm_area_struct *vma, + enum rmap_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; - unsigned int nr_pmdmapped = 0, first; - int nr = 0; + int nr = 0, nr_pmdmapped = 0, first; - VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); - VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio); + VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); + __folio_rmap_sanity_checks(folio, page, nr_pages, level); - /* Is page being mapped by PTE? Is this its first map to be added? 
*/ - if (likely(!compound)) { + switch (level) { + case RMAP_LEVEL_PTE: do { first = atomic_inc_and_test(&page->_mapcount); if (first && folio_test_large(folio)) { @@ -1413,9 +1400,8 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page, if (first) nr++; } while (page++, --nr_pages > 0); - } else if (folio_test_pmd_mappable(folio)) { - /* That test is redundant: it's for safety or to optimize out */ - + break; + case RMAP_LEVEL_PMD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); @@ -1430,6 +1416,7 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page, nr = 0; } } + break; } if (nr_pmdmapped) @@ -1443,6 +1430,43 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page, mlock_vma_folio(folio, vma); } +/** + * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio + * @folio: The folio to add the mappings to + * @page: The first page to add + * @nr_pages: The number of pages that will be mapped using PTEs + * @vma: The vm area in which the mappings are added + * + * The page range of the folio is defined by [page, page + nr_pages) + * + * The caller needs to hold the page table lock. + */ +void folio_add_file_rmap_ptes(struct folio *folio, struct page *page, + int nr_pages, struct vm_area_struct *vma) +{ + __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); +} + +/** + * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio + * @folio: The folio to add the mapping to + * @page: The first page to add + * @vma: The vm area in which the mapping is added + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) + * + * The caller needs to hold the page table lock. + */ +void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, + struct vm_area_struct *vma) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); +#else + WARN_ON_ONCE(true); +#endif +} + /** * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to @@ -1455,16 +1479,13 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { struct folio *folio = page_folio(page); - unsigned int nr_pages; VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page); if (likely(!compound)) - nr_pages = 1; + folio_add_file_rmap_pte(folio, page, vma); else - nr_pages = folio_nr_pages(folio); - - folio_add_file_rmap_range(folio, page, nr_pages, vma, compound); + folio_add_file_rmap_pmd(folio, page, vma); } /** -- cgit v1.2.3 From ef37b2ea08ace7b5fbcd569d703be1903afd12f9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:32 +0100 Subject: mm/memory: page_add_file_rmap() -> folio_add_file_rmap_[pte|pmd]() Let's convert insert_page_into_pte_locked() and do_set_pmd(). While at it, perform some folio conversion. 
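The call-site change has the same shape in both functions; schematically (taken from the diff below, where folio = page_folio(page) is looked up once near the top of each function):

    /* PTE-mapped file page (insert_page_into_pte_locked()) */
    -	page_add_file_rmap(page, vma, false);
    +	folio_add_file_rmap_pte(folio, page, vma);

    /* PMD-mapped file THP (do_set_pmd()) */
    -	page_add_file_rmap(page, vma, true);
    +	folio_add_file_rmap_pmd(folio, page, vma);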
Link: https://lkml.kernel.org/r/20231220224504.646757-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/memory.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 9b977b2cf893..0e5e069aaec4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1859,12 +1859,14 @@ static int validate_page_before_insert(struct page *page) static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { + struct folio *folio = page_folio(page); + if (!pte_none(ptep_get(pte))) return -EBUSY; /* Ok, finally just insert the thing.. */ - get_page(page); + folio_get(folio); inc_mm_counter(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page, vma, false); + folio_add_file_rmap_pte(folio, page, vma); set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; } @@ -4410,6 +4412,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { + struct folio *folio = page_folio(page); struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; @@ -4419,8 +4422,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; - page = compound_head(page); - if (compound_order(page) != HPAGE_PMD_ORDER) + if (page != &folio->page || folio_order(folio) != HPAGE_PMD_ORDER) return ret; /* @@ -4429,7 +4431,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * check. This kind of THP just can be PTE mapped. Access to * the corrupted subpage should trigger SIGBUS as expected. */ - if (unlikely(PageHasHWPoisoned(page))) + if (unlikely(folio_test_has_hwpoisoned(folio))) return ret; /* @@ -4453,7 +4455,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); - page_add_file_rmap(page, vma, true); + folio_add_file_rmap_pmd(folio, page, vma); /* * deposit and withdraw with pmd lock held -- cgit v1.2.3 From b832a354d787bfbdea5c226f0d77cc1a222d09f8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:44 +0100 Subject: mm/memory: page_add_anon_rmap() -> folio_add_anon_rmap_pte() Let's convert restore_exclusive_pte() and do_swap_page(). While at it, perform some folio conversion in restore_exclusive_pte(). 
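In both places the conversion boils down to passing the folio alongside the page; schematically (taken from the diff below):

    /* restore_exclusive_pte() */
    -	page_add_anon_rmap(page, vma, address, RMAP_NONE);
    +	folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE);

    /* do_swap_page() */
    -	page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
    +	folio_add_anon_rmap_pte(folio, page, vma, vmf->address, rmap_flags);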
Link: https://lkml.kernel.org/r/20231220224504.646757-21-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/memory.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 0e5e069aaec4..e84917c118ac 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -710,6 +710,7 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, struct page *page, unsigned long address, pte_t *ptep) { + struct folio *folio = page_folio(page); pte_t orig_pte; pte_t pte; swp_entry_t entry; @@ -725,14 +726,15 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); - VM_BUG_ON(pte_write(pte) && !(PageAnon(page) && PageAnonExclusive(page))); + VM_BUG_ON_FOLIO(pte_write(pte) && (!folio_test_anon(folio) && + PageAnonExclusive(page)), folio); /* * No need to take a page reference as one was already * created when the swap entry was made. */ - if (PageAnon(page)) - page_add_anon_rmap(page, vma, address, RMAP_NONE); + if (folio_test_anon(folio)) + folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE); else /* * Currently device exclusive access only supports anonymous @@ -4076,7 +4078,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_add_new_anon_rmap(folio, vma, vmf->address); folio_add_lru_vma(folio, vma); } else { - page_add_anon_rmap(page, vma, vmf->address, rmap_flags); + folio_add_anon_rmap_pte(folio, page, vma, vmf->address, + rmap_flags); } VM_BUG_ON(!folio_test_anon(folio) || -- cgit v1.2.3 From c46265030b0f400ef89833bb51da62676d2f855a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:52 +0100 Subject: mm/memory: page_remove_rmap() -> folio_remove_rmap_pte() Let's convert zap_pte_range() and closely-related tlb_flush_rmap_batch(). While at it, perform some more folio conversion in zap_pte_range(). 
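The per-PTE unmap paths change in the same way; schematically (taken from the diffs below):

    /* zap_pte_range(), with folio = page_folio(page) */
    -	page_remove_rmap(page, vma, false);
    +	folio_remove_rmap_pte(folio, page, vma);

    /* tlb_flush_rmap_batch() */
    -	page_remove_rmap(page, vma, false);
    +	folio_remove_rmap_pte(page_folio(page), page, vma);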
Link: https://lkml.kernel.org/r/20231220224504.646757-29-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/memory.c | 23 +++++++++++++---------- mm/mmu_gather.c | 2 +- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index e84917c118ac..e52c6e97444a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1434,6 +1434,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, arch_enter_lazy_mmu_mode(); do { pte_t ptent = ptep_get(pte); + struct folio *folio; struct page *page; if (pte_none(ptent)) @@ -1459,21 +1460,22 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; } + folio = page_folio(page); delay_rmap = 0; - if (!PageAnon(page)) { + if (!folio_test_anon(folio)) { if (pte_dirty(ptent)) { - set_page_dirty(page); + folio_set_dirty(folio); if (tlb_delay_rmap(tlb)) { delay_rmap = 1; force_flush = 1; } } if (pte_young(ptent) && likely(vma_has_recency(vma))) - mark_page_accessed(page); + folio_mark_accessed(folio); } rss[mm_counter(page)]--; if (!delay_rmap) { - page_remove_rmap(page, vma, false); + folio_remove_rmap_pte(folio, page, vma); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); } @@ -1489,6 +1491,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (is_device_private_entry(entry) || is_device_exclusive_entry(entry)) { page = pfn_swap_entry_to_page(entry); + folio = page_folio(page); if (unlikely(!should_zap_page(details, page))) continue; /* @@ -1500,8 +1503,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, WARN_ON_ONCE(!vma_is_anonymous(vma)); rss[mm_counter(page)]--; if (is_device_private_entry(entry)) - page_remove_rmap(page, vma, false); - put_page(page); + folio_remove_rmap_pte(folio, page, vma); + folio_put(folio); } else if (!non_swap_entry(entry)) { /* Genuine swap entry, hence a private anon page */ if (!should_zap_cows(details)) @@ -3220,10 +3223,10 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * threads. * * The critical issue is to order this - * page_remove_rmap with the ptp_clear_flush above. - * Those stores are ordered by (if nothing else,) + * folio_remove_rmap_pte() with the ptp_clear_flush + * above. Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative - * in page_remove_rmap. + * in folio_remove_rmap_pte(); * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the @@ -3232,7 +3235,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(vmf->page, vma, false); + folio_remove_rmap_pte(old_folio, vmf->page, vma); } /* Free the old page.. 
*/ diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 4f559f4ddd21..604ddf08affe 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -55,7 +55,7 @@ static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_ if (encoded_page_flags(enc)) { struct page *page = encoded_page_ptr(enc); - page_remove_rmap(page, vma, false); + folio_remove_rmap_pte(page_folio(page), page, vma); } } } -- cgit v1.2.3 From d8ef5e311d7bfde54b60ab45026f206eff31b2d2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:57 +0100 Subject: mm/rmap: convert page_dup_file_rmap() to folio_dup_file_rmap_[pte|ptes|pmd]() Let's convert page_dup_file_rmap() like the other rmap functions. As there is only a single caller, convert that single caller right away and remove page_dup_file_rmap(). Add folio_dup_file_rmap_ptes() right away, as we want to perform rmap batching during fork() soon. Link: https://lkml.kernel.org/r/20231220224504.646757-34-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++----- mm/memory.c | 2 +- 2 files changed, 55 insertions(+), 6 deletions(-) (limited to 'mm/memory.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fef369e37039..7607f862e795 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -308,6 +308,60 @@ static inline void hugetlb_remove_rmap(struct folio *folio) atomic_dec(&folio->_entire_mapcount); } +static __always_inline void __folio_dup_file_rmap(struct folio *folio, + struct page *page, int nr_pages, enum rmap_level level) +{ + __folio_rmap_sanity_checks(folio, page, nr_pages, level); + + switch (level) { + case RMAP_LEVEL_PTE: + do { + atomic_inc(&page->_mapcount); + } while (page++, --nr_pages > 0); + break; + case RMAP_LEVEL_PMD: + atomic_inc(&folio->_entire_mapcount); + break; + } +} + +/** + * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio + * @folio: The folio to duplicate the mappings of + * @page: The first page to duplicate the mappings of + * @nr_pages: The number of pages of which the mapping will be duplicated + * + * The page range of the folio is defined by [page, page + nr_pages) + * + * The caller needs to hold the page table lock. + */ +static inline void folio_dup_file_rmap_ptes(struct folio *folio, + struct page *page, int nr_pages) +{ + __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE); +} +#define folio_dup_file_rmap_pte(folio, page) \ + folio_dup_file_rmap_ptes(folio, page, 1) + +/** + * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio + * @folio: The folio to duplicate the mapping of + * @page: The first page to duplicate the mapping of + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) + * + * The caller needs to hold the page table lock.
+ */ +static inline void folio_dup_file_rmap_pmd(struct folio *folio, + struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE); +#else + WARN_ON_ONCE(true); +#endif +} + static inline void __page_dup_rmap(struct page *page, bool compound) { VM_WARN_ON(folio_test_hugetlb(page_folio(page))); @@ -322,11 +376,6 @@ static inline void __page_dup_rmap(struct page *page, bool compound) } } -static inline void page_dup_file_rmap(struct page *page, bool compound) -{ - __page_dup_rmap(page, compound); -} - /** * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped * anonymous page diff --git a/mm/memory.c b/mm/memory.c index e52c6e97444a..fdc87bf15545 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -965,7 +965,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, rss[MM_ANONPAGES]++; } else if (page) { folio_get(folio); - page_dup_file_rmap(page, false); + folio_dup_file_rmap_pte(folio, page); rss[mm_counter_file(page)]++; } -- cgit v1.2.3 From 08e7795e2444c3df9292f4ac7092be6168166a46 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:45:00 +0100 Subject: mm/memory: page_try_dup_anon_rmap() -> folio_try_dup_anon_rmap_pte() Let's convert copy_nonpresent_pte(). While at it, perform some more folio conversion. Link: https://lkml.kernel.org/r/20231220224504.646757-37-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/memory.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index fdc87bf15545..d66559cd55db 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -785,6 +785,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); pte_t pte = orig_pte; + struct folio *folio; struct page *page; swp_entry_t entry = pte_to_swp_entry(orig_pte); @@ -829,6 +830,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } } else if (is_device_private_entry(entry)) { page = pfn_swap_entry_to_page(entry); + folio = page_folio(page); /* * Update rss count even for unaddressable pages, as @@ -839,10 +841,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * for unaddressable pages, at some point. But for now * keep things as they are. */ - get_page(page); + folio_get(folio); rss[mm_counter(page)]++; /* Cannot fail as these pages cannot get pinned. */ - BUG_ON(page_try_dup_anon_rmap(page, false, src_vma)); + folio_try_dup_anon_rmap_pte(folio, page, src_vma); /* * We do not preserve soft-dirty information, because so @@ -956,7 +958,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, * future. */ folio_get(folio); - if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) { + if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { /* Page may be pinned, we have to copy. 
*/ folio_put(folio); return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, -- cgit v1.2.3 From e99fb98d478a0480d50e334df21bef12fb74e17f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 22 Dec 2023 15:02:03 +0800 Subject: mm: remove unnecessary ia64 code and comment IA64 has gone with commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"), remove unnecessary ia64 special mm code and comment too. Link: https://lkml.kernel.org/r/20231222070203.2966980-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/Kconfig | 2 +- mm/memory.c | 4 +--- mm/mm_init.c | 48 +++++++++++++++++++----------------------------- mm/page_owner.c | 1 - 4 files changed, 21 insertions(+), 34 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/Kconfig b/mm/Kconfig index 8f8b02e9c136..b072664b889a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -770,7 +770,7 @@ config DEFAULT_MMAP_MIN_ADDR from userspace allocation. Keeping a user from writing to low pages can help reduce the impact of kernel NULL pointer bugs. - For most ia64, ppc64 and x86 users with lots of address space + For most ppc64 and x86 users with lots of address space a value of 65536 is reasonable and should cause no problems. On arm and other archs it should not be higher than 32768. Programs which use vm86 functionality or have some need to map diff --git a/mm/memory.c b/mm/memory.c index d66559cd55db..a0a50d3754f0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -123,9 +123,7 @@ static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) /* * A number of key systems in x86 including ioremap() rely on the assumption * that high_memory defines the upper bound on direct map memory, then end - * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and - * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL - * and ZONE_HIGHMEM. + * of ZONE_NORMAL. */ void *high_memory; EXPORT_SYMBOL(high_memory); diff --git a/mm/mm_init.c b/mm/mm_init.c index a5f91eba4f8d..2830eef2b16c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1467,8 +1467,7 @@ void __init set_pageblock_order(void) /* * Assume the largest contiguous order of interest is a huge page. - * This value may be variable depending on boot parameters on IA64 and - * powerpc. + * This value may be variable depending on boot parameters on powerpc. */ pageblock_order = order; } @@ -1629,8 +1628,8 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, #ifdef CONFIG_FLATMEM static void __init alloc_node_mem_map(struct pglist_data *pgdat) { - unsigned long __maybe_unused start = 0; - unsigned long __maybe_unused offset = 0; + unsigned long start, offset, size, end; + struct page *map; /* Skip empty nodes */ if (!pgdat->node_spanned_pages) @@ -1638,33 +1637,24 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); offset = pgdat->node_start_pfn - start; - /* ia64 gets its own node_mem_map, before this, without bootmem */ - if (!pgdat->node_mem_map) { - unsigned long size, end; - struct page *map; - - /* - * The zone's endpoints aren't required to be MAX_ORDER - * aligned but the node_mem_map endpoints must be in order - * for the buddy allocator to function correctly. 
- */ - end = pgdat_end_pfn(pgdat); - end = ALIGN(end, MAX_ORDER_NR_PAGES); - size = (end - start) * sizeof(struct page); - map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, - pgdat->node_id, false); - if (!map) - panic("Failed to allocate %ld bytes for node %d memory map\n", - size, pgdat->node_id); - pgdat->node_mem_map = map + offset; - } - pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", - __func__, pgdat->node_id, (unsigned long)pgdat, - (unsigned long)pgdat->node_mem_map); -#ifndef CONFIG_NUMA /* - * With no DISCONTIG, the global mem_map is just set as node 0's + * The zone's endpoints aren't required to be MAX_ORDER + * aligned but the node_mem_map endpoints must be in order + * for the buddy allocator to function correctly. */ + end = ALIGN(pgdat_end_pfn(pgdat), MAX_ORDER_NR_PAGES); + size = (end - start) * sizeof(struct page); + map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, + pgdat->node_id, false); + if (!map) + panic("Failed to allocate %ld bytes for node %d memory map\n", + size, pgdat->node_id); + pgdat->node_mem_map = map + offset; + pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", + __func__, pgdat->node_id, (unsigned long)pgdat, + (unsigned long)pgdat->node_mem_map); +#ifndef CONFIG_NUMA + /* the global mem_map is just set as node 0's */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; if (page_to_pfn(mem_map) != pgdat->node_start_pfn) diff --git a/mm/page_owner.c b/mm/page_owner.c index e7eba7688881..040dbf26a986 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -121,7 +121,6 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) * Sometimes page metadata allocation tracking requires more * memory to be allocated: * - when new stack trace is saved to stack depot - * - when backtrace itself is calculated (ia64) */ if (current->in_page_owner) return dummy_handle; -- cgit v1.2.3
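For reference, the restructured alloc_node_mem_map() above keeps the same sizing arithmetic as before: the node's start pfn is rounded down and its end pfn rounded up to a MAX_ORDER boundary before the FLATMEM memory map is allocated. A minimal userspace sketch of that arithmetic (the pfn range, MAX_ORDER_NR_PAGES and struct page size are illustrative assumptions, not values from the patch):

    #include <stdio.h>

    #define MAX_ORDER_NR_PAGES	1024UL	/* assumption: 4 MiB max buddy block, 4 KiB pages */
    #define STRUCT_PAGE_SIZE	64UL	/* assumption: sizeof(struct page) on 64-bit */
    #define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
            unsigned long node_start_pfn = 0x100;	/* assumed example node span */
            unsigned long node_end_pfn   = 0x8000;

            unsigned long start  = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
            unsigned long offset = node_start_pfn - start;
            unsigned long end    = ALIGN(node_end_pfn, MAX_ORDER_NR_PAGES);
            unsigned long size   = (end - start) * STRUCT_PAGE_SIZE;

            printf("memmap covers %lu pages (%lu bytes); node_mem_map = map + %lu\n",
                   end - start, size, offset);
            return 0;
    }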