From 37d17925480404f1293f24d027fbf3c9975603d7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 29 Sep 2025 11:46:43 +0100 Subject: [PATCH 001/321] mm/thp: drop follow_devmap_pmd() default stub follow_devmap_pmd() has already been dropped by the commit fd2825b0760a ("mm/gup: remove pXX_devmap usage from get_user_pages()"). The fallback stub in the header which is now redundant, can be dropped off as well. Link: https://lkml.kernel.org/r/20250929104643.1100421-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Dev Jain Reviewed-by: Alistair Popple Reviewed-by: Wei Yang Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 71ac78b9f834..fee4cf7fa300 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -682,12 +682,6 @@ static inline void mm_put_huge_zero_folio(struct mm_struct *mm) return; } -static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) -{ - return NULL; -} - static inline bool thp_migration_supported(void) { return false; From b6c46600bfb28b4be4e9cff7bad4f2cf357e0fb7 Mon Sep 17 00:00:00 2001 From: "jianyun.gao" Date: Mon, 29 Sep 2025 08:26:08 +0800 Subject: [PATCH 002/321] mm: fix some typos in mm module Below are some typos in the code comments: intevals ==> intervals addesses ==> addresses unavaliable ==> unavailable facor ==> factor droping ==> dropping exlusive ==> exclusive decription ==> description confict ==> conflict desriptions ==> descriptions otherwize ==> otherwise vlaue ==> value cheching ==> checking exisitng ==> existing modifed ==> modified differenciate ==> differentiate refernece ==> reference permissons ==> permissions indepdenent ==> independent spliting ==> splitting Just fix it. Link: https://lkml.kernel.org/r/20250929002608.1633825-1-jianyungao89@gmail.com Signed-off-by: jianyun.gao Reviewed-by: SeongJae Park Reviewed-by: Wei Yang Reviewed-by: Dev Jain Reviewed-by: Liam R. Howlett Acked-by: Chris Li Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 2 +- mm/gup.c | 2 +- mm/hugetlb.c | 6 +++--- mm/hugetlb_vmemmap.c | 6 +++--- mm/kmsan/core.c | 2 +- mm/ksm.c | 2 +- mm/memory-tiers.c | 2 +- mm/memory.c | 4 ++-- mm/secretmem.c | 2 +- mm/slab_common.c | 2 +- mm/slub.c | 2 +- mm/swapfile.c | 2 +- mm/userfaultfd.c | 2 +- mm/vma.c | 4 ++-- 14 files changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 3c0d727788c8..0ecd8fb84101 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1264,7 +1264,7 @@ enum damon_sysfs_cmd { DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS, /* * @DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS: Update the tuned monitoring - * intevals. + * intervals. */ DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS, /* diff --git a/mm/gup.c b/mm/gup.c index a8ba5112e4d0..d2524fe09338 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2710,7 +2710,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * * *) ptes can be read atomically by the architecture. * - * *) valid user addesses are below TASK_MAX_SIZE + * *) valid user addresses are below TASK_MAX_SIZE * * The last two assumptions can be relaxed by the addition of helper functions. 
* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0455119716ec..4e016433e32e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2934,7 +2934,7 @@ typedef enum { * NOTE: This is mostly identical to MAP_CHG_NEEDED, except * that currently vma_needs_reservation() has an unwanted side * effect to either use end() or commit() to complete the - * transaction. Hence it needs to differenciate from NEEDED. + * transaction. Hence it needs to differentiate from NEEDED. */ MAP_CHG_ENFORCED = 2, } map_chg_state; @@ -6007,7 +6007,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, /* * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We * could defer the flush until now, since by holding i_mmap_rwsem we - * guaranteed that the last refernece would not be dropped. But we must + * guaranteed that the last reference would not be dropped. But we must * do the flushing before we return, as otherwise i_mmap_rwsem will be * dropped and the last reference to the shared PMDs page might be * dropped as well. @@ -7193,7 +7193,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, } else if (unlikely(is_pte_marker(pte))) { /* * Do nothing on a poison marker; page is - * corrupted, permissons do not apply. Here + * corrupted, permissions do not apply. Here * pte_marker_uffd_wp()==true implies !poison * because they're mutual exclusive. */ diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ba0fb1b6a5a8..96ee2bd16ee1 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -75,7 +75,7 @@ static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, if (likely(pmd_leaf(*pmd))) { /* * Higher order allocations from buddy allocator must be able to - * be treated as indepdenent small pages (as they can be freed + * be treated as independent small pages (as they can be freed * individually). */ if (!PageReserved(head)) @@ -684,7 +684,7 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, ret = hugetlb_vmemmap_split_folio(h, folio); /* - * Spliting the PMD requires allocating a page, thus lets fail + * Splitting the PMD requires allocating a page, thus let's fail * early once we encounter the first OOM. No point in retrying * as it can be dynamically done on remap with the memory * we get back from the vmemmap deduplication. @@ -715,7 +715,7 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, /* * Pages to be freed may have been accumulated. If we * encounter an ENOMEM, free what we have and try again. - * This can occur in the case that both spliting fails + * This can occur in the case that both splitting fails * halfway and head page allocation also failed. In this * case __hugetlb_vmemmap_optimize_folio() would free memory * allowing more vmemmap remaps to occur. diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 35ceaa8adb41..90f427b95a21 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -33,7 +33,7 @@ bool kmsan_enabled __read_mostly; /* * Per-CPU KMSAN context to be used in interrupts, where current->kmsan is - * unavaliable. + * unavailable. */ DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); diff --git a/mm/ksm.c b/mm/ksm.c index c4e730409949..cdefba633856 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -389,7 +389,7 @@ static unsigned long ewma(unsigned long prev, unsigned long curr) * exponentially weighted moving average. 
The new pages_to_scan value is * multiplied with that change factor: * - * new_pages_to_scan *= change facor + * new_pages_to_scan *= change factor * * The new_pages_to_scan value is limited by the cpu min and max values. It * calculates the cpu percent for the last scan and calculates the new diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 0ea5c13f10a2..864811fff409 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -519,7 +519,7 @@ static inline void __init_node_memory_type(int node, struct memory_dev_type *mem * for each device getting added in the same NUMA node * with this specific memtype, bump the map count. We * Only take memtype device reference once, so that - * changing a node memtype can be done by droping the + * changing a node memtype can be done by dropping the * only reference count taken here. */ diff --git a/mm/memory.c b/mm/memory.c index b59ae7ce42eb..61748b762876 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4328,7 +4328,7 @@ static inline bool should_try_to_free_swap(struct folio *folio, * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * user. Try freeing the swapcache to get rid of the swapcache - * reference only in case it's likely that we'll be the exlusive user. + * reference only in case it's likely that we'll be the exclusive user. */ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && folio_ref_count(folio) == (1 + folio_nr_pages(folio)); @@ -5405,7 +5405,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa /** * set_pte_range - Set a range of PTEs to point to pages in a folio. - * @vmf: Fault decription. + * @vmf: Fault description. * @folio: The folio that contains @page. * @page: The first page to create a PTE for. * @nr: The number of PTEs to create. diff --git a/mm/secretmem.c b/mm/secretmem.c index b59350daffe3..9b0f5d9ec6f4 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -227,7 +227,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) struct file *file; int fd, err; - /* make sure local flags do not confict with global fcntl.h */ + /* make sure local flags do not conflict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); if (!secretmem_enable || !can_set_direct_map()) diff --git a/mm/slab_common.c b/mm/slab_common.c index 932d13ada36c..d2824daa98cf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -259,7 +259,7 @@ out: * @object_size: The size of objects to be created in this cache. * @args: Additional arguments for the cache creation (see * &struct kmem_cache_args). - * @flags: See the desriptions of individual flags. The common ones are listed + * @flags: See the descriptions of individual flags. The common ones are listed * in the description below. 
* * Not to be called directly, use the kmem_cache_create() wrapper with the same diff --git a/mm/slub.c b/mm/slub.c index 1bf65c421325..927ca64b6cbe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2533,7 +2533,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, memset((char *)kasan_reset_tag(x) + inuse, 0, s->size - inuse - rsize); /* - * Restore orig_size, otherwize kmalloc redzone overwritten + * Restore orig_size, otherwise kmalloc redzone overwritten * would be reported */ set_orig_size(s, x, orig_size); diff --git a/mm/swapfile.c b/mm/swapfile.c index 10760240a3a2..cb2392ed8e0e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1677,7 +1677,7 @@ static bool swap_entries_put_map_nr(struct swap_info_struct *si, /* * Check if it's the last ref of swap entry in the freeing path. - * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. + * Qualified value includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. */ static inline bool __maybe_unused swap_is_last_ref(unsigned char count) { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index af61b95c89e4..0630f188c847 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1578,7 +1578,7 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx, /* * For now, we keep it simple and only move between writable VMAs. - * Access flags are equal, therefore cheching only the source is enough. + * Access flags are equal, therefore checking only the source is enough. */ if (!(src_vma->vm_flags & VM_WRITE)) return -EINVAL; diff --git a/mm/vma.c b/mm/vma.c index abe0da33c844..9127eaeea93f 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -109,7 +109,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev; - struct vm_area_struct *src = vmg->middle; /* exisitng merge case. */ + struct vm_area_struct *src = vmg->middle; /* existing merge case. */ struct anon_vma *tgt_anon = tgt->anon_vma; struct anon_vma *src_anon = vmg->anon_vma; @@ -798,7 +798,7 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) * Returns: The merged VMA if merge succeeds, or NULL otherwise. * * ASSUMPTIONS: - * - The caller must assign the VMA to be modifed to @vmg->middle. + * - The caller must assign the VMA to be modified to @vmg->middle. * - The caller must have set @vmg->prev to the previous VMA, if there is one. * - The caller must not set @vmg->next, as we determine this. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. From 11119b19f62dc8f3aac6e458fb27a468ad5861ce Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 1 Oct 2025 05:25:02 +0100 Subject: [PATCH 003/321] mm/ptdump: replace READ_ONCE() with standard page table accessors Replace READ_ONCE() with standard page table accessors i.e pxdp_get() which anyways default into READ_ONCE() in cases where platform does not override. Also convert ptep_get_lockless() into ptep_get() as well. 
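For reference, on architectures that do not override the accessor, the generic fallback is a plain READ_ONCE() wrapper (a sketch of the generic definition in include/linux/pgtable.h, not code added by this patch):

        static inline pgd_t pgdp_get(pgd_t *pgdp)
        {
                return READ_ONCE(*pgdp);
        }

so the conversion is behaviourally neutral on such platforms and only changes anything where an architecture provides its own accessor.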
Link: https://lkml.kernel.org/r/20251001042502.1400726-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Dev Jain Acked-by: Lance Yang Acked-by: SeongJae Park Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/ptdump.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/ptdump.c b/mm/ptdump.c index b600c7f864b8..973020000096 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -31,7 +31,7 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pgd_t val = READ_ONCE(*pgd); + pgd_t val = pgdp_get(pgd); #if CONFIG_PGTABLE_LEVELS > 4 && \ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) @@ -54,7 +54,7 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - p4d_t val = READ_ONCE(*p4d); + p4d_t val = p4dp_get(p4d); #if CONFIG_PGTABLE_LEVELS > 3 && \ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) @@ -77,7 +77,7 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pud_t val = READ_ONCE(*pud); + pud_t val = pudp_get(pud); #if CONFIG_PGTABLE_LEVELS > 2 && \ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) @@ -100,7 +100,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pmd_t val = READ_ONCE(*pmd); + pmd_t val = pmdp_get(pmd); #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte))) @@ -121,7 +121,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pte_t val = ptep_get_lockless(pte); + pte_t val = ptep_get(pte); if (st->effective_prot_pte) st->effective_prot_pte(st, val); From 9ff86ca1cccc071db5ede284852728027412fd88 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:26 +0200 Subject: [PATCH 004/321] lib/test_vmalloc: add no_block_alloc_test case Patch series "__vmalloc()/kvmalloc() and no-block support", v4. This patch (of 10): Introduce a new test case "no_block_alloc_test" that verifies non-blocking allocations using __vmalloc() with GFP_ATOMIC and GFP_NOWAIT flags. It is recommended to build kernel with CONFIG_DEBUG_ATOMIC_SLEEP enabled to help catch "sleeping while atomic" issues. This test ensures that memory allocation logic under atomic constraints does not inadvertently sleep. Link: https://lkml.kernel.org/r/20251007122035.56347-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 2815658ccc37..aae5f4910aff 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -54,6 +54,7 @@ __param(int, run_test_mask, 7, "\t\tid: 256, name: kvfree_rcu_1_arg_vmalloc_test\n" "\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n" "\t\tid: 1024, name: vm_map_ram_test\n" + "\t\tid: 2048, name: no_block_alloc_test\n" /* Add a new test case description here. 
*/ ); @@ -283,6 +284,30 @@ static int fix_size_alloc_test(void) return 0; } +static int no_block_alloc_test(void) +{ + void *ptr; + int i; + + for (i = 0; i < test_loop_count; i++) { + bool use_atomic = !!(get_random_u8() % 2); + gfp_t gfp = use_atomic ? GFP_ATOMIC : GFP_NOWAIT; + unsigned long size = (nr_pages > 0 ? nr_pages : 1) * PAGE_SIZE; + + preempt_disable(); + ptr = __vmalloc(size, gfp); + preempt_enable(); + + if (!ptr) + return -1; + + *((__u8 *)ptr) = 0; + vfree(ptr); + } + + return 0; +} + static int pcpu_alloc_test(void) { @@ -411,6 +436,7 @@ static struct test_case_desc test_case_array[] = { { "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test, }, { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test, }, { "vm_map_ram_test", vm_map_ram_test, }, + { "no_block_alloc_test", no_block_alloc_test, true }, /* Add a new test case here. */ }; From e781c1c0a9fcb462181ebe95b271221e96cf2aa1 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:27 +0200 Subject: [PATCH 005/321] lib/test_vmalloc: remove xfail condition check A test marked with "xfail = true" is expected to fail but that does not mean it is predetermined to fail. Remove "xfail" condition check for tests which pass successfully. Link: https://lkml.kernel.org/r/20251007122035.56347-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Baoquan He Cc: Marco Elver Cc: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index aae5f4910aff..6521c05c7816 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -500,7 +500,7 @@ static int test_func(void *private) for (j = 0; j < test_repeat_count; j++) { ret = test_case_array[index].test_func(); - if (!ret && !test_case_array[index].xfail) + if (!ret) t->data[index].test_passed++; else if (ret && test_case_array[index].xfail) t->data[index].test_xfailed++; From 86e968d8ca6dc823086b4436721a6e3a10241503 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:28 +0200 Subject: [PATCH 006/321] mm/vmalloc: support non-blocking GFP flags in alloc_vmap_area() alloc_vmap_area() currently assumes that sleeping is allowed during allocation. This is not true for callers which pass non-blocking GFP flags, such as GFP_ATOMIC or GFP_NOWAIT. This patch adds logic to detect whether the given gfp_mask permits blocking. It avoids invoking might_sleep() or falling back to reclaim path if blocking is not allowed. This makes alloc_vmap_area() safer for use in non-sleeping contexts, where previously it could hit unexpected sleeps, trigger warnings. It is a preparation and adjustment step to later allow both GFP_ATOMIC and GFP_NOWAIT allocations in this series. 
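For context, the blocking check relied on here reduces to testing for __GFP_DIRECT_RECLAIM (a sketch of the existing helper in include/linux/gfp.h, not code added by this patch):

        static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
        {
                return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
        }

GFP_ATOMIC and GFP_NOWAIT do not carry __GFP_DIRECT_RECLAIM, so they take the non-blocking path introduced above.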
Link: https://lkml.kernel.org/r/20251007122035.56347-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 798b2ed21e46..d83c01caaabe 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2017,6 +2017,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long freed; unsigned long addr; unsigned int vn_id; + bool allow_block; int purged = 0; int ret; @@ -2028,7 +2029,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, /* Only reclaim behaviour flags are relevant. */ gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - might_sleep(); + allow_block = gfpflags_allow_blocking(gfp_mask); + might_sleep_if(allow_block); /* * If a VA is obtained from a global heap(if it fails here) @@ -2062,7 +2064,8 @@ retry: * This is not a fast path. Check if yielding is needed. This * is the only reschedule point in the vmalloc() path. */ - cond_resched(); + if (allow_block) + cond_resched(); } trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr)); @@ -2071,8 +2074,16 @@ retry: * If an allocation fails, the error value is * returned. Therefore trigger the overflow path. */ - if (IS_ERR_VALUE(addr)) - goto overflow; + if (IS_ERR_VALUE(addr)) { + if (allow_block) + goto overflow; + + /* + * We can not trigger any reclaim logic because + * sleeping is not allowed, thus fail an allocation. + */ + goto out_free_va; + } va->va_start = addr; va->va_end = addr + size; @@ -2122,6 +2133,7 @@ overflow: pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n", size, vstart, vend); +out_free_va: kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } From 9c47753167a6a585d0305663c6912f042e131c2d Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:29 +0200 Subject: [PATCH 007/321] mm/vmalloc: defer freeing partly initialized vm_struct __vmalloc_area_node() may call free_vmap_area() or vfree() on error paths, both of which can sleep. This becomes problematic if the function is invoked from an atomic context, such as when GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask. To fix this, unify error paths and defer the cleanup of partly initialized vm_struct objects to a workqueue. This ensures that freeing happens in a process context and avoids invalid sleeps in atomic regions. Link: https://lkml.kernel.org/r/20251007122035.56347-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 6 +++++- mm/vmalloc.c | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index eb54b7b3202f..1e43181369f1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ #endif struct vm_struct { - struct vm_struct *next; + union { + struct vm_struct *next; /* Early registration of vm_areas. */ + struct llist_node llnode; /* Asynchronous freeing on error paths. 
*/ + }; + void *addr; unsigned long size; unsigned long flags; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d83c01caaabe..9e29dd767c41 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3687,6 +3687,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid, return nr_allocated; } +static LLIST_HEAD(pending_vm_area_cleanup); +static void cleanup_vm_area_work(struct work_struct *work) +{ + struct vm_struct *area, *tmp; + struct llist_node *head; + + head = llist_del_all(&pending_vm_area_cleanup); + if (!head) + return; + + llist_for_each_entry_safe(area, tmp, head, llnode) { + if (!area->pages) + free_vm_area(area); + else + vfree(area->addr); + } +} + +/* + * Helper for __vmalloc_area_node() to defer cleanup + * of partially initialized vm_struct in error paths. + */ +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); +static void defer_vm_area_cleanup(struct vm_struct *area) +{ + if (llist_add(&area->llnode, &pending_vm_area_cleanup)) + schedule_work(&cleanup_vm_area); +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) @@ -3718,8 +3747,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); - free_vm_area(area); - return NULL; + goto fail; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); @@ -3796,7 +3824,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - vfree(area->addr); + defer_vm_area_cleanup(area); return NULL; } From 8da89ba18ed4e9000d9b9b5b1f699e5004f4abf6 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:30 +0200 Subject: [PATCH 008/321] mm/vmalloc: handle non-blocking GFP in __vmalloc_area_node() Make __vmalloc_area_node() respect non-blocking GFP masks such as GFP_ATOMIC and GFP_NOWAIT. - Add memalloc_apply_gfp_scope()/memalloc_restore_scope() helpers to apply a proper scope. - Apply memalloc_apply_gfp_scope()/memalloc_restore_scope() around vmap_pages_range() for page table setup. - Set "nofail" to false if a non-blocking mask is used, as they are mutually exclusive. This is particularly important for page table allocations that internally use GFP_PGTABLE_KERNEL, which may sleep unless such scope restrictions are applied. For example: __pte_alloc_kernel() pte_alloc_one_kernel(&init_mm); pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0); Note: in most cases, PTE entries are established only up to the level required by current vmap space usage, meaning the page tables are typically fully populated during the mapping process. 
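A minimal usage sketch (illustrative only; the size and error handling are assumptions, not part of this patch): once the series is applied, a caller that must not sleep can attempt a non-blocking allocation and has to be prepared for failure, since no reclaim is performed on its behalf:

        /* in a context where sleeping is not allowed */
        void *buf = __vmalloc(4 * PAGE_SIZE, GFP_NOWAIT);

        if (!buf)
                return -ENOMEM; /* no blocking fallback is attempted */

        /* ... use buf; release later with vfree() from a sleepable context ... */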
Link: https://lkml.kernel.org/r/20251007122035.56347-6-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 2 ++ mm/vmalloc.c | 52 +++++++++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 1e43181369f1..e8e94f90d686 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -332,4 +332,6 @@ bool vmalloc_dump_obj(void *object); static inline bool vmalloc_dump_obj(void *object) { return false; } #endif +unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask); +void memalloc_restore_scope(unsigned int flags); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9e29dd767c41..d8bcd87239b5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3716,6 +3716,42 @@ static void defer_vm_area_cleanup(struct vm_struct *area) schedule_work(&cleanup_vm_area); } +/* + * Page tables allocations ignore external GFP. Enforces it by + * the memalloc scope API. It is used by vmalloc internals and + * KASAN shadow population only. + * + * GFP to scope mapping: + * + * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save() + * GFP_NOFS - memalloc_nofs_save() + * GFP_NOIO - memalloc_noio_save() + * + * Returns a flag cookie to pair with restore. + */ +unsigned int +memalloc_apply_gfp_scope(gfp_t gfp_mask) +{ + unsigned int flags = 0; + + if (!gfpflags_allow_blocking(gfp_mask)) + flags = memalloc_noreclaim_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + /* 0 - no scope applied. */ + return flags; +} + +void +memalloc_restore_scope(unsigned int flags) +{ + if (flags) + memalloc_flags_restore(flags); +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) @@ -3732,6 +3768,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, array_size = (unsigned long)nr_small_pages * sizeof(struct page *); + /* __GFP_NOFAIL and "noblock" flags are mutually exclusive. 
*/ + if (!gfpflags_allow_blocking(gfp_mask)) + nofail = false; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -3797,22 +3837,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * page tables allocations ignore external gfp mask, enforce it * by the scope API */ - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - flags = memalloc_nofs_save(); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - flags = memalloc_noio_save(); - + flags = memalloc_apply_gfp_scope(gfp_mask); do { ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); - - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - memalloc_nofs_restore(flags); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - memalloc_noio_restore(flags); + memalloc_restore_scope(flags); if (ret < 0) { warn_alloc(gfp_mask, NULL, From ad435e79f8f5d6a5dae8ec122b14802d810defbf Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:31 +0200 Subject: [PATCH 009/321] mm/kasan: support non-blocking GFP in kasan_populate_vmalloc() A "gfp_mask" is already passed to kasan_populate_vmalloc() as an argument to respect GFPs from callers and KASAN uses it for its internal allocations. But apply_to_page_range() function ignores GFP flags due to a hard-coded mask. Wrap the call with memalloc_apply_gfp_scope()/memalloc_restore_scope() so that non-blocking GFP flags(GFP_ATOMIC, GFP_NOWAIT) are respected. Link: https://lkml.kernel.org/r/20251007122035.56347-7-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Marco Elver Cc: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/kasan/shadow.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 5d2a876035d6..a30d84bfdd52 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -377,18 +377,10 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_ * page tables allocations ignore external gfp mask, enforce it * by the scope API */ - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - flags = memalloc_nofs_save(); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - flags = memalloc_noio_save(); - + flags = memalloc_apply_gfp_scope(gfp_mask); ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE, kasan_populate_vmalloc_pte, &data); - - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - memalloc_nofs_restore(flags); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - memalloc_noio_restore(flags); + memalloc_restore_scope(flags); ___free_pages_bulk(data.pages, nr_pages); if (ret) From b186a94227b753f2fdcab0df29dfc636c63ac329 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:32 +0200 Subject: [PATCH 010/321] kmsan: remove hard-coded GFP_KERNEL flags kmsan_vmap_pages_range_noflush() allocates its temp s_pages/o_pages arrays with GFP_KERNEL, which may sleep. This is inconsistent with vmalloc() as it will support non-blocking requests later. Plumb gfp_mask through the kmsan_vmap_pages_range_noflush(), so it can use it internally for its demand. Please note, the subsequent __vmap_pages_range_noflush() still uses GFP_KERNEL and can sleep. If a caller runs under reclaim constraints, sleeping is forbidden, it must establish the appropriate memalloc scope API. 
Link: https://lkml.kernel.org/r/20251007122035.56347-8-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Alexander Potapenko Cc: Marco Elver Cc: Andrey Ryabinin Cc: Baoquan He Cc: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 6 ++++-- mm/internal.h | 4 ++-- mm/kmsan/shadow.c | 6 +++--- mm/percpu-vm.c | 2 +- mm/vmalloc.c | 26 +++++++++++++++++--------- 5 files changed, 27 insertions(+), 17 deletions(-) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index f2fd221107bb..7da9fd506b39 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -133,6 +133,7 @@ void kmsan_kfree_large(const void *ptr); * @prot: page protection flags used for vmap. * @pages: array of pages. * @page_shift: page_shift passed to vmap_range_noflush(). + * @gfp_mask: gfp_mask to use internally. * * KMSAN maps shadow and origin pages of @pages into contiguous ranges in * vmalloc metadata address range. Returns 0 on success, callers must check @@ -142,7 +143,8 @@ int __must_check kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift); + unsigned int page_shift, + gfp_t gfp_mask); /** * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. @@ -347,7 +349,7 @@ static inline void kmsan_kfree_large(const void *ptr) static inline int __must_check kmsan_vmap_pages_range_noflush( unsigned long start, unsigned long end, pgprot_t prot, - struct page **pages, unsigned int page_shift) + struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return 0; } diff --git a/mm/internal.h b/mm/internal.h index 1561fc2ff5b8..e623c8103358 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1355,7 +1355,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, #ifdef CONFIG_MMU void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift); + pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask); unsigned int get_vm_area_page_order(struct vm_struct *vm); #else static inline void vmalloc_init(void) @@ -1364,7 +1364,7 @@ static inline void vmalloc_init(void) static inline int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) + pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return -EINVAL; } diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 55fdea199aaf..e7f554a31bb4 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -215,7 +215,7 @@ void kmsan_free_page(struct page *page, unsigned int order) int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift) + unsigned int page_shift, gfp_t gfp_mask) { unsigned long shadow_start, origin_start, shadow_end, origin_end; struct page **s_pages, **o_pages; @@ -230,8 +230,8 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, return 0; nr = (end - start) / PAGE_SIZE; - s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); - o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); + s_pages = kcalloc(nr, sizeof(*s_pages), gfp_mask); + o_pages = kcalloc(nr, sizeof(*o_pages), gfp_mask); if (!s_pages || !o_pages) { err = -ENOMEM; goto ret; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index cd69caf6aa8d..4f5937090590 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ 
-194,7 +194,7 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), - PAGE_KERNEL, pages, PAGE_SHIFT); + PAGE_KERNEL, pages, PAGE_SHIFT, GFP_KERNEL); } /** diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d8bcd87239b5..d7e7049e01f8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -671,16 +671,28 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, } int vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) + pgprot_t prot, struct page **pages, unsigned int page_shift, + gfp_t gfp_mask) { int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, - page_shift); + page_shift, gfp_mask); if (ret) return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } +static int __vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift, + gfp_t gfp_mask) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask); + flush_cache_vmap(addr, end); + return err; +} + /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map @@ -696,11 +708,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { - int err; - - err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); - flush_cache_vmap(addr, end); - return err; + return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL); } static int check_sparse_vm_area(struct vm_struct *area, unsigned long start, @@ -3839,8 +3847,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, */ flags = memalloc_apply_gfp_scope(gfp_mask); do { - ret = vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift); + ret = __vmap_pages_range(addr, addr + size, prot, area->pages, + page_shift, nested_gfp); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); From 7241bb2ea33d5ff50b77a5981342bcc826bef52a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:33 +0200 Subject: [PATCH 011/321] mm: skip might_alloc() warnings when PF_MEMALLOC is set might_alloc() catches invalid blocking allocations in contexts where sleeping is not allowed. However when PF_MEMALLOC is set, the page allocator already skips reclaim and other blocking paths. In such cases, a blocking gfp_mask does not actually lead to blocking, so triggering might_alloc() splats is misleading. Adjust might_alloc() to skip warnings when the current task has PF_MEMALLOC set, matching the allocator's actual blocking behaviour. 
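For illustration (a sketch using the existing scope helpers, not code added by this patch): a task inside a PF_MEMALLOC scope does not enter reclaim even for a nominally blocking mask, which is why a might_alloc() splat there would be misleading:

        unsigned int noreclaim_flag;
        struct page *page;

        noreclaim_flag = memalloc_noreclaim_save();     /* sets PF_MEMALLOC */
        page = alloc_pages(GFP_KERNEL, 0);              /* reclaim is skipped here */
        memalloc_noreclaim_restore(noreclaim_flag);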
Link: https://lkml.kernel.org/r/20251007122035.56347-9-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0232d983b715..a74582aed747 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -318,6 +318,9 @@ static inline void might_alloc(gfp_t gfp_mask) fs_reclaim_acquire(gfp_mask); fs_reclaim_release(gfp_mask); + if (current->flags & PF_MEMALLOC) + return; + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } From 0667b209e92965da8c2006b673bea69050ede1a2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:34 +0200 Subject: [PATCH 012/321] mm/vmalloc: update __vmalloc_node_range() documentation __vmalloc() now supports non-blocking flags such as GFP_ATOMIC and GFP_NOWAIT. Update the documentation accordingly. Link: https://lkml.kernel.org/r/20251007122035.56347-10-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Baoquan He Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d7e7049e01f8..9a63c91c6150 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3881,19 +3881,20 @@ fail: * @caller: caller's return address * * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Please note that the full set of gfp - * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all - * supported. - * Zone modifiers are not supported. From the reclaim modifiers - * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) - * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and - * __GFP_RETRY_MAYFAIL are not supported). + * allocator with @gfp_mask flags and map them into contiguous + * virtual range with protection @prot. * - * __GFP_NOWARN can be used to suppress failures messages. + * Supported GFP classes: %GFP_KERNEL, %GFP_ATOMIC, %GFP_NOWAIT, + * %GFP_NOFS and %GFP_NOIO. Zone modifiers are not supported. + * Please note %GFP_ATOMIC and %GFP_NOWAIT are supported only + * by __vmalloc(). * - * Map them into contiguous kernel virtual space, using a pagetable - * protection of @prot. + * Retry modifiers: only %__GFP_NOFAIL is supported; %__GFP_NORETRY + * and %__GFP_RETRY_MAYFAIL are not supported. * + * %__GFP_NOWARN can be used to suppress failure messages. + * + * Can not be called from interrupt nor NMI contexts. * Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, From c6307674ed82c0c57d6e1e3408e84ac449ab8e94 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:35 +0200 Subject: [PATCH 013/321] mm: kvmalloc: add non-blocking support for vmalloc Extend __kvmalloc_node_noprof() to handle non-blocking GFP flags (GFP_NOWAIT and GFP_ATOMIC). Previously such flags were rejected, returning NULL. 
With this change: - kvmalloc() can fall back to vmalloc() if non-blocking contexts; - for non-blocking allocations the VM_ALLOW_HUGE_VMAP option is disabled, since the huge mapping path still contains might_sleep(); - documentation update to reflect that GFP_NOWAIT and GFP_ATOMIC are now supported. Link: https://lkml.kernel.org/r/20251007122035.56347-11-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/slub.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 927ca64b6cbe..3ea9b7af660d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -7090,7 +7090,7 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. + * GFP_NOWAIT and GFP_ATOMIC are supported, the __GFP_NORETRY modifier is not. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * @@ -7099,6 +7099,7 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, gfp_t flags, int node) { + bool allow_block; void *ret; /* @@ -7111,16 +7112,22 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, if (ret || size <= PAGE_SIZE) return ret; - /* non-sleeping allocations are not supported by vmalloc */ - if (!gfpflags_allow_blocking(flags)) - return NULL; - /* Don't even allow crazy sizes */ if (unlikely(size > INT_MAX)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); return NULL; } + /* + * For non-blocking the VM_ALLOW_HUGE_VMAP is not used + * because the huge-mapping path in vmalloc contains at + * least one might_sleep() call. + * + * TODO: Revise huge-mapping path to support non-blocking + * flags. + */ + allow_block = gfpflags_allow_blocking(flags); + /* * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, * since the callers already cannot assume anything @@ -7128,7 +7135,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, * protection games. */ return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, - flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + flags, PAGE_KERNEL, allow_block ? VM_ALLOW_HUGE_VMAP:0, node, __builtin_return_address(0)); } EXPORT_SYMBOL(__kvmalloc_node_noprof); From 590c03ca6a3fbb114396673314e2aa483839608b Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 7 Oct 2025 18:28:21 +0800 Subject: [PATCH 014/321] mm/ksm: fix exec/fork inheritance support for prctl Patch series "ksm: fix exec/fork inheritance", v2. This series fixes exec/fork inheritance. See the detailed description of the issue below. This patch (of 2): Background ========== commit d7597f59d1d33 ("mm: add new api to enable ksm per process") introduced MMF_VM_MERGE_ANY for mm->flags, and allowed user to set it by prctl() so that the process's VMAs are forcibly scanned by ksmd. Subsequently, the 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") supported inheriting the MMF_VM_MERGE_ANY flag when a task calls execve(). 
Finally, commit 3a9e567ca45fb ("mm/ksm: fix ksm exec support for prctl") fixed the issue that ksmd doesn't scan the mm_struct with MMF_VM_MERGE_ANY by adding the mm_slot to ksm_mm_head in __bprm_mm_init(). Problem ======= In some extreme scenarios, however, this inheritance of MMF_VM_MERGE_ANY during exec/fork can fail. For example, when the scanning frequency of ksmd is tuned extremely high, a process carrying MMF_VM_MERGE_ANY may still fail to pass it to the newly exec'd process. This happens because ksm_execve() is executed too early in the do_execve flow (prematurely adding the new mm_struct to the ksm_mm_slot list). As a result, before do_execve completes, ksmd may have already performed a scan and found that this new mm_struct has no VM_MERGEABLE VMAs, thus clearing its MMF_VM_MERGE_ANY flag. Consequently, when the new program executes, the flag MMF_VM_MERGE_ANY inheritance missed. Root reason =========== commit d7597f59d1d33 ("mm: add new api to enable ksm per process") clear the flag MMF_VM_MERGE_ANY when ksmd found no VM_MERGEABLE VMAs. Solution ======== Firstly, Don't clear MMF_VM_MERGE_ANY when ksmd found no VM_MERGEABLE VMAs, because perhaps their mm_struct has just been added to ksm_mm_slot list, and its process has not yet officially started running or has not yet performed mmap/brk to allocate anonymous VMAS. Secondly, recheck MMF_VM_MERGEABLE again if a process takes MMF_VM_MERGE_ANY, and create a mm_slot and join it into ksm_scan_list again. Link: https://lkml.kernel.org/r/20251007182504440BJgK8VXRHh8TD7IGSUIY4@zte.com.cn Link: https://lkml.kernel.org/r/20251007182821572h_SoFqYZXEP1mvWI4n9VL@zte.com.cn Fixes: 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process") Signed-off-by: xu xin Cc: Stefan Roesch Cc: David Hildenbrand Cc: Jinjiang Tu Cc: Wang Yaxin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- include/linux/ksm.h | 4 ++-- mm/ksm.c | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 067538fc4d58..c982694c987b 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -17,7 +17,7 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); @@ -103,7 +103,7 @@ bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, +static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { return vm_flags; diff --git a/mm/ksm.c b/mm/ksm.c index cdefba633856..4f672f4f2140 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2712,8 +2712,14 @@ no_vmas: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); + /* + * Only clear MMF_VM_MERGEABLE. We must not clear + * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process, + * perhaps their mm_struct has just been added to ksm_mm_slot + * list, and its process has not yet officially started running + * or has not yet performed mmap/brk to allocate anonymous VMAS. 
+ */ mm_flags_clear(MMF_VM_MERGEABLE, mm); - mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2831,12 +2837,20 @@ static int __ksm_del_vma(struct vm_area_struct *vma) * * Returns: @vm_flags possibly updated to mark mergeable. */ -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && - __ksm_should_add_vma(file, vm_flags)) + __ksm_should_add_vma(file, vm_flags)) { vm_flags |= VM_MERGEABLE; + /* + * Generally, the flags here always include MMF_VM_MERGEABLE. + * However, in rare cases, this flag may be cleared by ksmd who + * scans a cycle without finding any mergeable vma. + */ + if (unlikely(!mm_flags_test(MMF_VM_MERGEABLE, mm))) + __ksm_enter(mm); + } return vm_flags; } From bda7bf06840d4eb133abefa5a2fd75544277bd86 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 7 Oct 2025 18:29:35 +0800 Subject: [PATCH 015/321] selftests: update ksm inheritance tests for prctl fork/exec To reproduce the issue mentioned by [1], this add a setting of pages_to_scan and sleep_millisecs at the start of test_prctl_fork_exec(). The main change is just raise the scanning frequency of ksmd. [1] https://lore.kernel.org/all/202510012256278259zrhgATlLA2C510DMD3qI@zte.com.cn/ Link: https://lkml.kernel.org/r/20251007182935207jm31wCIgLpZg5XbXQY64S@zte.com.cn Signed-off-by: xu xin Cc: David Hildenbrand Cc: Jinjiang Tu Cc: Stefan Roesch Cc: Wang Yaxin Cc: Yang Yang Signed-off-by: Andrew Morton --- .../selftests/mm/ksm_functional_tests.c | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index ac136f04b8d6..95afa5cfc062 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -38,6 +38,8 @@ enum ksm_merge_mode { }; static int mem_fd; +static int pages_to_scan_fd; +static int sleep_millisecs_fd; static int pagemap_fd; static size_t pagesize; @@ -493,6 +495,46 @@ static void test_prctl_fork(void) ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } +static int start_ksmd_and_set_frequency(char *pages_to_scan, char *sleep_ms) +{ + int ksm_fd; + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + return -errno; + + if (write(ksm_fd, "1", 1) != 1) + return -errno; + + if (write(pages_to_scan_fd, pages_to_scan, strlen(pages_to_scan)) <= 0) + return -errno; + + if (write(sleep_millisecs_fd, sleep_ms, strlen(sleep_ms)) <= 0) + return -errno; + + return 0; +} + +static int stop_ksmd_and_restore_frequency(void) +{ + int ksm_fd; + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + return -errno; + + if (write(ksm_fd, "2", 1) != 1) + return -errno; + + if (write(pages_to_scan_fd, "100", 3) <= 0) + return -errno; + + if (write(sleep_millisecs_fd, "20", 2) <= 0) + return -errno; + + return 0; +} + static void test_prctl_fork_exec(void) { int ret, status; @@ -500,6 +542,9 @@ static void test_prctl_fork_exec(void) ksft_print_msg("[RUN] %s\n", __func__); + if (start_ksmd_and_set_frequency("2000", "0")) + ksft_test_result_fail("set ksmd's scanning frequency failed\n"); + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); if (ret < 0 && errno == EINVAL) { ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); @@ -542,6 +587,11 @@ static void test_prctl_fork_exec(void) return; } + if 
(stop_ksmd_and_restore_frequency()) { + ksft_test_result_fail("restore ksmd frequency failed\n"); + return; + } + ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } @@ -656,6 +706,13 @@ static void init_global_file_handles(void) ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); if (ksm_get_self_merging_pages() < 0) ksft_exit_skip("accessing \"/proc/self/ksm_merging_pages\") failed\n"); + + pages_to_scan_fd = open("/sys/kernel/mm/ksm/pages_to_scan", O_RDWR); + if (pages_to_scan_fd < 0) + ksft_exit_fail_msg("opening /sys/kernel/mm/ksm/pages_to_scan failed\n"); + sleep_millisecs_fd = open("/sys/kernel/mm/ksm/sleep_millisecs", O_RDWR); + if (sleep_millisecs_fd < 0) + ksft_exit_fail_msg("opening /sys/kernel/mm/ksm/sleep_millisecs failed\n"); } int main(int argc, char **argv) From c0efdb373c3aaacb32db59cadb0710cac13e44ae Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 7 Oct 2025 07:31:00 +0100 Subject: [PATCH 016/321] mm: replace READ_ONCE() with standard page table accessors Replace all READ_ONCE() with a standard page table accessors i.e pxdp_get() that defaults into READ_ONCE() in cases where platform does not override. Link: https://lkml.kernel.org/r/20251007063100.2396936-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand Reviewed-by: Lance Yang Reviewed-by: Wei Yang Reviewed-by: Dev Jain Signed-off-by: Andrew Morton --- mm/gup.c | 10 +++++----- mm/hmm.c | 2 +- mm/memory.c | 4 ++-- mm/mprotect.c | 2 +- mm/sparse-vmemmap.c | 2 +- mm/vmscan.c | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index d2524fe09338..95d948c8e86c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -950,7 +950,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; pudp = pud_offset(p4dp, address); - pud = READ_ONCE(*pudp); + pud = pudp_get(pudp); if (!pud_present(pud)) return no_page_table(vma, flags, address); if (pud_leaf(pud)) { @@ -975,7 +975,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, p4d_t *p4dp, p4d; p4dp = p4d_offset(pgdp, address); - p4d = READ_ONCE(*p4dp); + p4d = p4dp_get(p4dp); BUILD_BUG_ON(p4d_leaf(p4d)); if (!p4d_present(p4d) || p4d_bad(p4d)) @@ -3060,7 +3060,7 @@ static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, pudp = pud_offset_lockless(p4dp, p4d, addr); do { - pud_t pud = READ_ONCE(*pudp); + pud_t pud = pudp_get(pudp); next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) @@ -3086,7 +3086,7 @@ static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { - p4d_t p4d = READ_ONCE(*p4dp); + p4d_t p4d = p4dp_get(p4dp); next = p4d_addr_end(addr, end); if (!p4d_present(p4d)) @@ -3108,7 +3108,7 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end, pgdp = pgd_offset(current->mm, addr); do { - pgd_t pgd = READ_ONCE(*pgdp); + pgd_t pgd = pgdp_get(pgdp); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) diff --git a/mm/hmm.c b/mm/hmm.c index 87562914670a..a56081d67ad6 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -491,7 +491,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, /* Normally we don't want to split the huge page */ walk->action = ACTION_CONTINUE; - pud = READ_ONCE(*pudp); + pud = pudp_get(pudp); if (!pud_present(pud)) { spin_unlock(ptl); return hmm_vma_walk_hole(start, end, -1, walk); diff --git a/mm/memory.c b/mm/memory.c index 61748b762876..f13b20b702f6 100644 --- a/mm/memory.c 
+++ b/mm/memory.c @@ -6690,12 +6690,12 @@ retry: goto out; p4dp = p4d_offset(pgdp, address); - p4d = READ_ONCE(*p4dp); + p4d = p4dp_get(p4dp); if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; pudp = pud_offset(p4dp, address); - pud = READ_ONCE(*pudp); + pud = pudp_get(pudp); if (pud_none(pud)) goto out; if (pud_leaf(pud)) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 113b48985834..988c366137d5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -599,7 +599,7 @@ again: break; } - pud = READ_ONCE(*pudp); + pud = pudp_get(pudp); if (pud_none(pud)) continue; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index dbd8daccade2..37522d6cb398 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -439,7 +439,7 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, return -ENOMEM; pmd = pmd_offset(pud, addr); - if (pmd_none(READ_ONCE(*pmd))) { + if (pmd_none(pmdp_get(pmd))) { void *p; p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); diff --git a/mm/vmscan.c b/mm/vmscan.c index b2fc8b626d3d..2239de111fa6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3773,7 +3773,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, pud = pud_offset(p4d, start & P4D_MASK); restart: for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { - pud_t val = READ_ONCE(pud[i]); + pud_t val = pudp_get(pud + i); next = pud_addr_end(addr, end); From 9f1edf1aedac1b287355f63f768ba4275de72dca Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 6 Oct 2025 10:51:06 -0700 Subject: [PATCH 017/321] mm: readahead: make thp readahead conditional to mmap_miss logic Commit 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings") introduced a special handling for VM_HUGEPAGE mappings: even if the readahead is disabled, 1 or 2 HPAGE_PMD_ORDER pages are allocated. This change causes a significant regression for containers with a tight memory.max limit, if VM_HUGEPAGE is widely used. Prior to this commit, mmap_miss logic would eventually lead to the readahead disablement, effectively reducing the memory pressure in the cgroup. With this change the kernel is trying to allocate 1-2 huge pages for each fault, no matter if these pages are used or not before being evicted, increasing the memory pressure multi-fold. To fix the regression, let's make the new VM_HUGEPAGE conditional to the mmap_miss check, but keep independent from the ra->ra_pages. This way the main intention of commit 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings") stays intact, but the regression is resolved. The logic behind this changes is simple: even if a user explicitly requests using huge pages to back the file mapping (using VM_HUGEPAGE flag), under a very strong memory pressure it's better to fall back to ordinary pages. 
Link: https://lkml.kernel.org/r/20251006175106.377411-1-roman.gushchin@linux.dev Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings") Signed-off-by: Roman Gushchin Reviewed-by: Dev Jain Reviewed-by: Jan Kara Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 68 +++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 2f1e7e283a51..526ad8c92250 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3253,11 +3253,47 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; vm_flags_t vm_flags = vmf->vma->vm_flags; + bool force_thp_readahead = false; unsigned short mmap_miss; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ - if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) + force_thp_readahead = true; + + if (!force_thp_readahead) { + /* + * If we don't want any read-ahead, don't bother. + * VM_EXEC case below is already intended for random access. + */ + if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ) + return fpin; + + if (!ra->ra_pages) + return fpin; + + if (vm_flags & VM_SEQ_READ) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); + page_cache_sync_ra(&ractl, ra->ra_pages); + return fpin; + } + } + + if (!(vm_flags & VM_SEQ_READ)) { + /* Avoid banging the cache line if not needed */ + mmap_miss = READ_ONCE(ra->mmap_miss); + if (mmap_miss < MMAP_LOTSAMISS * 10) + WRITE_ONCE(ra->mmap_miss, ++mmap_miss); + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (mmap_miss > MMAP_LOTSAMISS) + return fpin; + } + + if (force_thp_readahead) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; @@ -3272,34 +3308,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) page_cache_ra_order(&ractl, ra); return fpin; } -#endif - - /* - * If we don't want any read-ahead, don't bother. VM_EXEC case below is - * already intended for random access. - */ - if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ) - return fpin; - if (!ra->ra_pages) - return fpin; - - if (vm_flags & VM_SEQ_READ) { - fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_sync_ra(&ractl, ra->ra_pages); - return fpin; - } - - /* Avoid banging the cache line if not needed */ - mmap_miss = READ_ONCE(ra->mmap_miss); - if (mmap_miss < MMAP_LOTSAMISS * 10) - WRITE_ONCE(ra->mmap_miss, ++mmap_miss); - - /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. - */ - if (mmap_miss > MMAP_LOTSAMISS) - return fpin; if (vm_flags & VM_EXEC) { /* From bd63d0fde2a2c328fe30456b8aa2521222c6f3fe Mon Sep 17 00:00:00 2001 From: Fushuai Wang Date: Mon, 6 Oct 2025 09:49:48 +0800 Subject: [PATCH 018/321] mm/vmscan: remove redundant __GFP_NOWARN The __GFP_NOWARN flag was included in GFP_NOWAIT since commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT"). So remove the redundant __GFP_NOWARN flag. 
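For reference, the redundancy can be checked at build time. The following is
only a hedged sketch (not part of this patch), relying on GFP_NOWAIT carrying
__GFP_NOWARN since the commit cited above:

/* Sketch: confirms that __GFP_NOWARN is already part of GFP_NOWAIT,
 * so OR-ing it in again is a no-op. */
#include <linux/build_bug.h>
#include <linux/gfp.h>

static_assert((GFP_NOWAIT & __GFP_NOWARN) == __GFP_NOWARN,
	      "GFP_NOWAIT is expected to include __GFP_NOWARN");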
Link: https://lkml.kernel.org/r/20251006014948.44695-1-wangfushuai@baidu.com Signed-off-by: Fushuai Wang Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Cc: Axel Rasmussen Cc: Johannes Weiner Cc: Michal Hocko Cc: Qi Zheng Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2239de111fa6..bba0d075b2bb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1054,7 +1054,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, * When this happens, 'page' will likely just be discarded * instead of migrated. */ - .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOMEMALLOC | GFP_NOWAIT, .nid = target_nid, .nmask = &allowed_mask, From 138336d674d2e51f1e5699d2a30af1e9aa1352b4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Oct 2025 13:38:48 -0700 Subject: [PATCH 019/321] mm/zswap: remove unnecessary dlen writes for incompressible pages Patch series "mm/zswap: misc cleanup of code and documentations". Clean up an unnecessary local variable write in incompressible pages handling, typos (s/zwap/zswap/) and outdated comments/documentations about the zswap's red-black tree, which is replaced by xarray. This patch (of 4): Incompressible pages handling logic in zswap_compress() is setting 'dlen' as PAGE_SIZE twice. Once before deciding whether to save the content as is, and once again after it is decided to save it as is. But the value of 'dlen' is used only if it is decided to save the content as is, so the first write is unnecessary. It is not causing real user issues, but making code confusing to read. Remove the unnecessary write operation. Link: https://lkml.kernel.org/r/20251003203851.43128-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251003203851.43128-2-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: Yosry Ahmed Acked-by: Nhat Pham Reviewed-by: Chengming Zhou Cc: David Hildenbrand Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Chris Li Signed-off-by: Andrew Morton --- mm/zswap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index c1af782e54ec..80619c8589a7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -894,7 +894,6 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, * to the active LRU list in the case. */ if (comp_ret || !dlen || dlen >= PAGE_SIZE) { - dlen = PAGE_SIZE; if (!mem_cgroup_zswap_writeback_enabled( folio_memcg(page_folio(page)))) { comp_ret = comp_ret ? comp_ret : -EINVAL; From f7ed6bf2372df0123a43cb680d5e63063a9f2d49 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Oct 2025 13:38:49 -0700 Subject: [PATCH 020/321] mm/zswap: fix typos: s/zwap/zswap/ As the subject says. 
Link: https://lkml.kernel.org/r/20251003203851.43128-3-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: Yosry Ahmed Acked-by: Nhat Pham Reviewed-by: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- mm/zswap.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4deda33625f4..3ae5cbcaed75 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5443,7 +5443,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) * @size: size of compressed object * * This forces the charge after obj_cgroup_may_zswap() allowed - * compression and storage in zwap for this cgroup to go ahead. + * compression and storage in zswap for this cgroup to go ahead. */ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) { diff --git a/mm/zswap.c b/mm/zswap.c index 80619c8589a7..f6b1c8832a4f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -879,7 +879,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, * acomp instance, then get those requests done simultaneously. but in this * case, zswap actually does store and load page by page, there is no * existing method to send the second page before the first page is done - * in one thread doing zwap. + * in one thread doing zswap. * but in different threads running on different cpu, we have different * acomp instance, so multiple threads can do (de)compression in parallel. */ @@ -1128,7 +1128,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * * 1. We extract the swp_entry_t to the stack, allowing * zswap_writeback_entry() to pin the swap entry and - * then validate the zwap entry against that swap entry's + * then validate the zswap entry against that swap entry's * tree using pointer value comparison. Only when that * is successful can the entry be dereferenced. * From 1f52f3de4bf8adc15732d2eb7d4e2be222245d5e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Oct 2025 13:38:50 -0700 Subject: [PATCH 021/321] mm/zswap: s/red-black tree/xarray/ Changes made by commit 796c2c23e14e ("zswap: replace RB tree with xarray") are not reflected on a comment. Update the comment. Link: https://lkml.kernel.org/r/20251003203851.43128-4-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: Yosry Ahmed Acked-by: Nhat Pham Reviewed-by: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index f6b1c8832a4f..5d0f8b13a958 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -175,7 +175,7 @@ static struct shrinker *zswap_shrinker; * This structure contains the metadata for tracking a single compressed * page within zswap. * - * swpentry - associated swap entry, the offset indexes into the red-black tree + * swpentry - associated swap entry, the offset indexes into the xarray * length - the length in bytes of the compressed page data. Needed during * decompression. * referenced - true if the entry recently entered the zswap pool. 
Unset by the From 0fdaa13ee93a068251d32ea7f60fd439b445adbe Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Oct 2025 13:38:51 -0700 Subject: [PATCH 022/321] Docs/admin-guide/mm/zswap: s/red-black tree/xarray/ The change from commit 796c2c23e14e ("zswap: replace RB tree with xarray") is not reflected on the document. Update the document. Link: https://lkml.kernel.org/r/20251003203851.43128-5-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: Yosry Ahmed Acked-by: Nhat Pham Reviewed-by: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/zswap.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 283d77217c6f..2464425c783d 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -59,11 +59,11 @@ returned by the allocation routine and that handle must be mapped before being accessed. The compressed memory pool grows on demand and shrinks as compressed pages are freed. The pool is not preallocated. -When a swap page is passed from swapout to zswap, zswap maintains a mapping -of the swap entry, a combination of the swap type and swap offset, to the -zsmalloc handle that references that compressed swap page. This mapping is -achieved with a red-black tree per swap type. The swap offset is the search -key for the tree nodes. +When a swap page is passed from swapout to zswap, zswap maintains a mapping of +the swap entry, a combination of the swap type and swap offset, to the zsmalloc +handle that references that compressed swap page. This mapping is achieved +with an xarray per swap type. The swap offset is the search key for the xarray +nodes. During a page fault on a PTE that is a swap entry, the swapin code calls the zswap load function to decompress the page into the page allocated by the page From 9ac09bb9feaccc2f45e5606dc48a3f748d478dc4 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 3 Oct 2025 16:53:04 +0100 Subject: [PATCH 023/321] mm: consistently use current->mm in mm_get_unmapped_area() mm_get_unmapped_area() is a wrapper around arch_get_unmapped_area() / arch_get_unmapped_area_topdown(), both of which search current->mm for some free space. Neither take an mm_struct - they implicitly operate on current->mm. But the wrapper takes an mm_struct and uses it to decide whether to search bottom up or top down. All callers pass in current->mm for this, so everything is working consistently. But it feels like an accident waiting to happen; eventually someone will call that function with a different mm, expecting to find free space in it, but what gets returned is free space in the current mm. So let's simplify by removing the parameter and have the wrapper use current->mm to decide which end to start at. Now everything is consistent and self-documenting. 
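To make the interface change concrete before the full diff, here is a
call-site view (illustration only, mirroring the shmem hunk below):

	/* before: the mm argument only selected top-down vs bottom-up;
	 * the search itself always ran on current->mm */
	addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, flags);

	/* after: the mm argument is gone and the wrapper reads current->mm */
	addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags);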
Link: https://lkml.kernel.org/r/20251003155306.2147572-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Dev Jain Reviewed-by: Anshuman Khandual Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/sparc/kernel/sys_sparc_64.c | 6 +++--- arch/x86/kernel/cpu/sgx/driver.c | 2 +- drivers/char/mem.c | 2 +- drivers/dax/device.c | 5 ++--- fs/hugetlbfs/inode.c | 3 +-- fs/proc/inode.c | 2 +- fs/ramfs/file-mmu.c | 2 +- include/linux/sched/mm.h | 9 ++++----- io_uring/memmap.c | 2 +- kernel/bpf/arena.c | 2 +- kernel/bpf/syscall.c | 2 +- mm/huge_memory.c | 4 ++-- mm/mmap.c | 17 +++++++---------- mm/shmem.c | 8 +++----- 14 files changed, 29 insertions(+), 37 deletions(-) diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 55faf2effa46..dbf118b40601 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -241,7 +241,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u if (flags & MAP_FIXED) { /* Ok, don't mess with it. */ - return mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags); } flags &= ~MAP_SHARED; @@ -254,7 +254,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u align_goal = (64UL * 1024); do { - addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, + addr = mm_get_unmapped_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags); if (!(addr & ~PAGE_MASK)) { addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL); @@ -273,7 +273,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u * be obtained. 
*/ if (addr & ~PAGE_MASK) - addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); + addr = mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags); return addr; } diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 7f8d1e11dbee..3b3efadb8cae 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file, if (flags & MAP_FIXED) return addr; - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #ifdef CONFIG_COMPAT diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 34b815901b20..db1ca53a6d01 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -542,7 +542,7 @@ static unsigned long get_unmapped_area_zero(struct file *file, #ifdef CONFIG_TRANSPARENT_HUGEPAGE return thp_get_unmapped_area(file, addr, len, pgoff, flags); #else - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); #endif } #endif /* CONFIG_MMU */ diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 2bb40a6060af..7f1ed0db8337 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -330,14 +330,13 @@ static unsigned long dax_get_unmapped_area(struct file *filp, if ((off + len_align) < off) goto out; - addr_align = mm_get_unmapped_area(current->mm, filp, addr, len_align, - pgoff, flags); + addr_align = mm_get_unmapped_area(filp, addr, len_align, pgoff, flags); if (!IS_ERR_VALUE(addr_align)) { addr_align += (off - addr_align) & (align - 1); return addr_align; } out: - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); } static const struct address_space_operations dev_dax_aops = { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f42548ee9083..ce8e40d35032 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -184,8 +184,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) addr0 = ALIGN(addr, huge_page_size(h)); - return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff, - flags, 0); + return mm_get_unmapped_area_vmflags(file, addr0, len, pgoff, flags, 0); } /* diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d9b7ef122343..2d3425cfa94b 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -443,7 +443,7 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); #ifdef CONFIG_MMU - return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags); #endif return orig_addr; diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index b11f5b20b78b..c3ed1c5117b2 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } const struct file_operations ramfs_file_operations = { diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index a74582aed747..0e1d73955fa5 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -189,12 
+189,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t); -unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags); +unsigned long mm_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, - struct file *filp, +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, diff --git a/io_uring/memmap.c b/io_uring/memmap.c index add03ca75cb9..63fcfa757bb8 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -387,7 +387,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, #else addr = 0UL; #endif - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 1074ac4459f2..872dc0e41c65 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -334,7 +334,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad return -EINVAL; } - ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags); + ret = mm_get_unmapped_area(filp, addr, len * 2, 0, flags); if (IS_ERR_VALUE(ret)) return ret; if ((ret >> 32) == ((ret + len - 1) >> 32)) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a129746bd6c..d77685f2c6cb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1162,7 +1162,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); #else return addr; #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f2a521e5d68..32479ae27400 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1127,7 +1127,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (len_pad < len || (off + len_pad) < off) return 0; - ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad, + ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad, off >> PAGE_SHIFT, flags, vm_flags); /* @@ -1164,7 +1164,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add if (ret) return ret; - return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags, + return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags); } diff --git a/mm/mmap.c b/mm/mmap.c index 5fd3b80fda1d..644f02071a41 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -797,12 +797,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, } #endif -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags, - vm_flags_t vm_flags) +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { - if (mm_flags_test(MMF_TOPDOWN, mm)) + if (mm_flags_test(MMF_TOPDOWN, current->mm)) return arch_get_unmapped_area_topdown(filp, addr, 
len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); @@ -848,7 +847,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } else { - addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, + addr = mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } if (IS_ERR_VALUE(addr)) @@ -864,12 +863,10 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, } unsigned long -mm_get_unmapped_area(struct mm_struct *mm, struct file *file, - unsigned long addr, unsigned long len, +mm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area_vmflags(mm, file, addr, len, - pgoff, flags, 0); + return mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); diff --git a/mm/shmem.c b/mm/shmem.c index 58701d14dd96..0eecb486a0cb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2759,8 +2759,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (len > TASK_SIZE) return -ENOMEM; - addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, - flags); + addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2838,8 +2837,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_len < len) return addr; - inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, - inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) @@ -5775,7 +5773,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #endif From b4e53984f24082005a92ec76f91348a73653dadb Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 6 Oct 2025 06:52:14 +0100 Subject: [PATCH 024/321] mm/dirty: replace READ_ONCE() with pudp_get() Replace READ_ONCE() with a standard page table accessor i.e pudp_get() that anyways defaults into READ_ONCE() in cases where platform does not override Link: https://lkml.kernel.org/r/20251006055214.1845342-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand Reviewed-by: Dev Jain Reviewed-by: Oscar Salvador Cc: Lance Yang Signed-off-by: Andrew Morton --- mm/mapping_dirty_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index c193de6cb23a..737c407f4081 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -149,7 +149,7 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD - pud_t pudval = READ_ONCE(*pud); + pud_t pudval = pudp_get(pud); /* Do not split a huge pud */ if (pud_trans_huge(pudval)) { From 156c0c5d1463c26348316864f7f0dc8bf809f454 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Wed, 1 Oct 2025 14:56:07 -0300 Subject: [PATCH 025/321] mm/page_owner: introduce struct stack_print_ctx Patch series "mm/page_owner: add debugfs files 'show_handles' and 'show_stacks_handles'", v2. 
Context:

The page_owner debug feature can help understand a particular situation at a
point in time (e.g., identify the biggest memory consumers; verify memory
counters that do not add up).

Another useful use case is to collect data repeatedly over time, and use it
for profiling, monitoring, and even comparing different kernel versions, at
the stack trace level (e.g., watch for trends, leaks, correlations, and
regressions). For this use case, userspace periodically collects the data
from page_owner and organizes it in data structures appropriate for
per-stack-trace access.

Problem:

The use case of tracking memory usage per stack trace (or tracking it for a
particular stack trace) requires uniquely identifying each stack trace
(i.e., keys to store their memory usage over periodic data collections).
This has to be done for every stack trace in every sample/data collection,
even if tracking only one stack trace (to identify it among all others).

Therefore, an approach like hashing the stack traces in userspace to create
unique keys/identifiers for them during post-processing can quickly become
expensive, considering the repetition and a growing number of stack traces.

Solution:

Fortunately, the kernel can provide a unique identifier for stack traces in
page_owner, which is the handle number in stackdepot. This eliminates the
need for creating keys (hashing) in userspace during post-processing.

Additionally, with that information, the stack traces themselves are not
needed until the memory usage needs to be resolved from a handle to a stack
trace (say, to look at the stack traces of a few top consumers). This can
reduce the amount of text emitted/copied by the kernel to userspace, and
save userspace from matching and discarding stack traces when not needed.

Changes:

This patchset adds 2 files that provide information, similar to 'show_stacks':

- show_handles: print handle number and number of pages (no stack traces)
- show_stacks_handles: print handle numbers and stack traces (no pages)

Now, it's possible to periodically collect data with handle numbers (keys)
and without stack traces (lower overhead) from 'show_handles', and later do
a final collection with handles and stack traces from 'show_stacks_handles'
to resolve the handles to their stack traces.

The output format follows the existing 'show_stacks' file, for simplicity,
but it can certainly be changed if a different format is more convenient.

Example:

The number of base pages collected can be stored per handle number over the
periodic data collections, and finally resolved to stack traces per handle
number as well with a final collection. Later, one can, for example,
identify the biggest consumers and watch their trends or correlate
increases/decreases with other events in the system, or watch particular
stack trace(s) of interest during development.

Testing:

Tested on next-20250929.
- show_stacks: register_dummy_stack+0x32/0x70 init_page_owner+0x29/0x2f0 page_ext_init+0x27c/0x2b0 mm_core_init+0xdc/0x110 nr_base_pages: 47 - show_handles: handle: 1 nr_base_pages: 47 - show_stacks_handles: register_dummy_stack+0x32/0x70 init_page_owner+0x29/0x2f0 page_ext_init+0x27c/0x2b0 mm_core_init+0xdc/0x110 handle: 1 - count_threshold: # echo 100 >/sys/kernel/debug/page_owner_stacks/count_threshold # grep register_dummy_stack show_stacks # not present # grep -B4 '^handle: 1$' show_handles # not present # grep -B4 '^handle: 1$' show_stacks_handles # present register_dummy_stack+0x32/0x70 init_page_owner+0x29/0x2f0 page_ext_init+0x27c/0x2b0 mm_core_init+0xdc/0x110 handle: 1 This patch (of 5): Currently, struct seq_file.private is used as an iterator in stack_list by stack_start|next(), for stack_print(). Create a context struct for this, in order to add another field next. No behavior change intended. P.S.: page_owner_stack_open() is expanded with separate statements for variable definition and return just in preparation for the next patch. Link: https://lkml.kernel.org/r/20251001175611.575861-1-mfo@igalia.com Link: https://lkml.kernel.org/r/20251001175611.575861-2-mfo@igalia.com Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: Oscar Salvador Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_owner.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 589ec37c94aa..05e26c9d43ef 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -45,6 +45,10 @@ static struct stack failure_stack; static struct stack *stack_list; static DEFINE_SPINLOCK(stack_list_lock); +struct stack_print_ctx { + struct stack *stack; +}; + static bool page_owner_enabled __initdata; DEFINE_STATIC_KEY_FALSE(page_owner_inited); @@ -859,6 +863,7 @@ static const struct file_operations proc_page_owner_operations = { static void *stack_start(struct seq_file *m, loff_t *ppos) { struct stack *stack; + struct stack_print_ctx *ctx = m->private; if (*ppos == -1UL) return NULL; @@ -870,9 +875,9 @@ static void *stack_start(struct seq_file *m, loff_t *ppos) * value of stack_list. */ stack = smp_load_acquire(&stack_list); - m->private = stack; + ctx->stack = stack; } else { - stack = m->private; + stack = ctx->stack; } return stack; @@ -881,10 +886,11 @@ static void *stack_start(struct seq_file *m, loff_t *ppos) static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) { struct stack *stack = v; + struct stack_print_ctx *ctx = m->private; stack = stack->next; *ppos = stack ? *ppos + 1 : -1UL; - m->private = stack; + ctx->stack = stack; return stack; } @@ -929,7 +935,10 @@ static const struct seq_operations page_owner_stack_op = { static int page_owner_stack_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &page_owner_stack_op, 0); + int ret = seq_open_private(file, &page_owner_stack_op, + sizeof(struct stack_print_ctx)); + + return ret; } static const struct file_operations page_owner_stack_operations = { From 5c8ca473d5cb600f906820933cb0a8df44105045 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Wed, 1 Oct 2025 14:56:08 -0300 Subject: [PATCH 026/321] mm/page_owner: add struct stack_print_ctx.flags Add the flags field to stack_print_ctx, and define two flags for current behavior (printing stack traces and their number of base pages). 
The plumbing of flags is debugfs_create_file(data) -> inode.i_private -> page_owner_stack_open() -> stack_print_ctx.flags -> stack_print(). No behavior change intended. Link: https://lkml.kernel.org/r/20251001175611.575861-3-mfo@igalia.com Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: Oscar Salvador Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_owner.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 05e26c9d43ef..2f6f99d60ba7 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -45,8 +45,12 @@ static struct stack failure_stack; static struct stack *stack_list; static DEFINE_SPINLOCK(stack_list_lock); +#define STACK_PRINT_FLAG_STACK 0x1 +#define STACK_PRINT_FLAG_PAGES 0x2 + struct stack_print_ctx { struct stack *stack; + u8 flags; }; static bool page_owner_enabled __initdata; @@ -904,20 +908,24 @@ static int stack_print(struct seq_file *m, void *v) unsigned long *entries; unsigned long nr_entries; struct stack_record *stack_record = stack->stack_record; + struct stack_print_ctx *ctx = m->private; if (!stack->stack_record) return 0; - nr_entries = stack_record->size; - entries = stack_record->entries; nr_base_pages = refcount_read(&stack_record->count) - 1; if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold) return 0; - for (i = 0; i < nr_entries; i++) - seq_printf(m, " %pS\n", (void *)entries[i]); - seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); + if (ctx->flags & STACK_PRINT_FLAG_STACK) { + nr_entries = stack_record->size; + entries = stack_record->entries; + for (i = 0; i < nr_entries; i++) + seq_printf(m, " %pS\n", (void *)entries[i]); + } + if (ctx->flags & STACK_PRINT_FLAG_PAGES) + seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); return 0; } @@ -938,6 +946,13 @@ static int page_owner_stack_open(struct inode *inode, struct file *file) int ret = seq_open_private(file, &page_owner_stack_op, sizeof(struct stack_print_ctx)); + if (!ret) { + struct seq_file *m = file->private_data; + struct stack_print_ctx *ctx = m->private; + + ctx->flags = (uintptr_t) inode->i_private; + } + return ret; } @@ -976,7 +991,9 @@ static int __init pageowner_init(void) debugfs_create_file("page_owner", 0400, NULL, NULL, &proc_page_owner_operations); dir = debugfs_create_dir("page_owner_stacks", NULL); - debugfs_create_file("show_stacks", 0400, dir, NULL, + debugfs_create_file("show_stacks", 0400, dir, + (void *)(STACK_PRINT_FLAG_STACK | + STACK_PRINT_FLAG_PAGES), &page_owner_stack_operations); debugfs_create_file("count_threshold", 0600, dir, NULL, &proc_page_owner_threshold); From 3b52b9e31a860df97bb46e9f3bfdab9f36d5d893 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Wed, 1 Oct 2025 14:56:09 -0300 Subject: [PATCH 027/321] mm/page_owner: add debugfs file 'show_handles' Add the flag STACK_PRINT_FLAG_HANDLE to print a stack's handle number from stackdepot, and add the file 'show_handles' to show just handles and their number of pages. This is similar to 'show_stacks', with handles instead of stack traces. 
Link: https://lkml.kernel.org/r/20251001175611.575861-4-mfo@igalia.com Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: Oscar Salvador Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_owner.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/page_owner.c b/mm/page_owner.c index 2f6f99d60ba7..fe788f07b578 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -47,6 +47,7 @@ static DEFINE_SPINLOCK(stack_list_lock); #define STACK_PRINT_FLAG_STACK 0x1 #define STACK_PRINT_FLAG_PAGES 0x2 +#define STACK_PRINT_FLAG_HANDLE 0x4 struct stack_print_ctx { struct stack *stack; @@ -924,6 +925,8 @@ static int stack_print(struct seq_file *m, void *v) for (i = 0; i < nr_entries; i++) seq_printf(m, " %pS\n", (void *)entries[i]); } + if (ctx->flags & STACK_PRINT_FLAG_HANDLE) + seq_printf(m, "handle: %d\n", stack_record->handle.handle); if (ctx->flags & STACK_PRINT_FLAG_PAGES) seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); @@ -995,6 +998,10 @@ static int __init pageowner_init(void) (void *)(STACK_PRINT_FLAG_STACK | STACK_PRINT_FLAG_PAGES), &page_owner_stack_operations); + debugfs_create_file("show_handles", 0400, dir, + (void *)(STACK_PRINT_FLAG_HANDLE | + STACK_PRINT_FLAG_PAGES), + &page_owner_stack_operations); debugfs_create_file("count_threshold", 0600, dir, NULL, &proc_page_owner_threshold); From 5513cfbcf4da97ba0476d63e7670fdfcde59580f Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Wed, 1 Oct 2025 14:56:10 -0300 Subject: [PATCH 028/321] mm/page_owner: add debugfs file 'show_stacks_handles' Add the file 'show_stacks_handles' to show just stack traces and their handles, in order to resolve stack traces and handles (i.e., to identify the stack traces for handles in previous reads from 'show_handles'). All stacks/handles must show up, regardless of their number of pages, that might have become zero or no longer make 'count_threshold', but made it in previous reads from 'show_handles' -- and need to be resolved later. P.S.: now, print the extra newline independently of the number of pages. 
Link: https://lkml.kernel.org/r/20251001175611.575861-5-mfo@igalia.com Signed-off-by: Mauricio Faria de Oliveira Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_owner.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index fe788f07b578..9c67bb8fb1d9 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -916,7 +916,8 @@ static int stack_print(struct seq_file *m, void *v) nr_base_pages = refcount_read(&stack_record->count) - 1; - if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold) + if (ctx->flags & STACK_PRINT_FLAG_PAGES && + (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)) return 0; if (ctx->flags & STACK_PRINT_FLAG_STACK) { @@ -928,7 +929,8 @@ static int stack_print(struct seq_file *m, void *v) if (ctx->flags & STACK_PRINT_FLAG_HANDLE) seq_printf(m, "handle: %d\n", stack_record->handle.handle); if (ctx->flags & STACK_PRINT_FLAG_PAGES) - seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); + seq_printf(m, "nr_base_pages: %d\n", nr_base_pages); + seq_putc(m, '\n'); return 0; } @@ -1002,6 +1004,10 @@ static int __init pageowner_init(void) (void *)(STACK_PRINT_FLAG_HANDLE | STACK_PRINT_FLAG_PAGES), &page_owner_stack_operations); + debugfs_create_file("show_stacks_handles", 0400, dir, + (void *)(STACK_PRINT_FLAG_STACK | + STACK_PRINT_FLAG_HANDLE), + &page_owner_stack_operations); debugfs_create_file("count_threshold", 0600, dir, NULL, &proc_page_owner_threshold); From 0de9a442eeba4a6435af74120822b10b12ab8449 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Wed, 1 Oct 2025 14:56:11 -0300 Subject: [PATCH 029/321] mm/page_owner: update Documentation with 'show_handles' and 'show_stacks_handles' Describe and provide examples for 'show_handles' and 'show_stacks_handles'. Link: https://lkml.kernel.org/r/20251001175611.575861-6-mfo@igalia.com Signed-off-by: Mauricio Faria de Oliveira Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/mm/page_owner.rst | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index 3a45a20fc05a..6b12f3b007ec 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -27,7 +27,10 @@ enabled. Other usages are more than welcome. It can also be used to show all the stacks and their current number of allocated base pages, which gives us a quick overview of where the memory is going without the need to screen through all the pages and match the -allocation and free operation. +allocation and free operation. It's also possible to show only a numeric +identifier of all the stacks (without stack traces) and their number of +allocated base pages (faster to read and parse, eg, for monitoring) that +can be matched with stacks later (show_handles and show_stacks_handles). page owner is disabled by default. So, if you'd like to use it, you need to add "page_owner=on" to your boot cmdline. If the kernel is built @@ -116,6 +119,33 @@ Usage nr_base_pages: 20824 ... + cat /sys/kernel/debug/page_owner_stacks/show_handles > handles_7000.txt + cat handles_7000.txt + handle: 42 + nr_base_pages: 20824 + ... 
+ + cat /sys/kernel/debug/page_owner_stacks/show_stacks_handles > stacks_handles.txt + cat stacks_handles.txt + post_alloc_hook+0x177/0x1a0 + get_page_from_freelist+0xd01/0xd80 + __alloc_pages+0x39e/0x7e0 + alloc_pages_mpol+0x22e/0x490 + folio_alloc+0xd5/0x110 + filemap_alloc_folio+0x78/0x230 + page_cache_ra_order+0x287/0x6f0 + filemap_get_pages+0x517/0x1160 + filemap_read+0x304/0x9f0 + xfs_file_buffered_read+0xe6/0x1d0 [xfs] + xfs_file_read_iter+0x1f0/0x380 [xfs] + __kernel_read+0x3b9/0x730 + kernel_read_file+0x309/0x4d0 + __do_sys_finit_module+0x381/0x730 + do_syscall_64+0x8d/0x150 + entry_SYSCALL_64_after_hwframe+0x62/0x6a + handle: 42 + ... + cat /sys/kernel/debug/page_owner > page_owner_full.txt ./page_owner_sort page_owner_full.txt sorted_page_owner.txt From 4dcf65bf5be22e32d389628b0e655731f97f525e Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Thu, 9 Oct 2025 12:29:30 -0700 Subject: [PATCH 030/321] mm/page_alloc: clarify batch tuning in zone_batchsize Patch series "mm/page_alloc: pcp->batch cleanups", v2. Two small cleanups for mm/page_alloc. Patch 1 cleans up a misleading comment about how pcp->batch is calculated, and folds in the calculation to increase clarity. No functional change intended. Patch 2 corrects zones from reporting that their pcp->batch is 0 when it is actually 1. Namely, corrects ZONE_DMA from reporting that its batch size is 0. This patch (of 2): Recently while working on another patch about batching free_pcppages_bulk [1], I was curious why pcp->batch was always 63 on my machine. This led me to zone_batchsize(), where I found this set of lines to determine what the batch size should be for the host: batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; All of this is good, except the comment above which says "We effectively *= 4 below". Nowhere else in the function zone_batchsize(), is there a corresponding multipliation by 4. Looking into the history of this, it seems like Dave Hansen had also noticed this back in 2013 [1]. Turns out there *used* to be a corresponding *= 4, which was turned into a *= 6 later on to be used in pageset_setup_from_batch_size(), which no longer exists. Despite this mismatch not being corrected in the comments, it seems that getting rid of the /= 4 leads to a performance regression on machines with less than 250G memory and 176 processors. As such, let us preserve the functionality but clean up the comments. Fold the /= 4 into the calculation above: bitshift by 10+2=12, and instead of dividing 1MB, divide 256KB and adjust the comments accordingly. No functional change intended. Link: https://lkml.kernel.org/r/20251009192933.3756712-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20251009192933.3756712-2-joshua.hahnjy@gmail.com Link: https://lore.kernel.org/all/20251002204636.4016712-1-joshua.hahnjy@gmail.com/ [1] Signed-off-by: Joshua Hahn Suggested-by: Dave Hansen Acked-by: Vlastimil Babka Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 600d9e981c23..39368cdc953d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5860,13 +5860,12 @@ static int zone_batchsize(struct zone *zone) int batch; /* - * The number of pages to batch allocate is either ~0.1% - * of the zone or 1MB, whichever is smaller. 
The batch + * The number of pages to batch allocate is either ~0.025% + * of the zone or 256KB, whichever is smaller. The batch * size is striking a balance between allocation latency * and zone lock contention. */ - batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); - batch /= 4; /* We effectively *= 4 below */ + batch = min(zone_managed_pages(zone) >> 12, SZ_256K / PAGE_SIZE); if (batch < 1) batch = 1; From 2783088ef24e32df9d70eb2a24f70de28b476a05 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Thu, 9 Oct 2025 12:29:31 -0700 Subject: [PATCH 031/321] mm/page_alloc: prevent reporting pcp->batch = 0 zone_batchsize returns the appropriate value that should be used for pcp->batch. If it finds a zone with less than 4096 pages or PAGE_SIZE > 1M, however, it leads to some incorrect math. In the above case, we will get an intermediary value of 1, which is then rounded down to the nearest power of two, and 1 is subtracted from it. Since 1 is already a power of two, we will get batch = 1-1 = 0: batch = rounddown_pow_of_two(batch + batch/2) - 1; A pcp->batch value of 0 is nonsensical. If this were actually set, then functions like drain_zone_pages would become no-ops, since they could only free 0 pages at a time. Of the two callers of zone_batchsize, the one that is actually used to set pcp->batch works around this by setting pcp->batch to the maximum of 1 and zone_batchsize. However, the other caller, zone_pcp_init, incorrectly prints out the batch size of the zone to be 0. This is probably rare in a typical zone, but the DMA zone can often have less than 4096 pages, which means it will print out "LIFO batch:0". Before: [ 0.001216] DMA zone: 3998 pages, LIFO batch:0 After: [ 0.001210] DMA zone: 3998 pages, LIFO batch:1 Instead of dealing with the error handling and the mismatch between the reported and actual zone batchsize, just return 1 if the zone_batchsize is 1 page or less before the rounding. Link: https://lkml.kernel.org/r/20251009192933.3756712-3-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Acked-by: Vlastimil Babka Cc: Brendan Jackman Cc: Dave Hansen Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 39368cdc953d..10a908793b4c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5866,8 +5866,8 @@ static int zone_batchsize(struct zone *zone) * and zone lock contention. */ batch = min(zone_managed_pages(zone) >> 12, SZ_256K / PAGE_SIZE); - if (batch < 1) - batch = 1; + if (batch <= 1) + return 1; /* * Clamp the batch to a 2^n - 1 value. Having a power @@ -6018,7 +6018,7 @@ static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) { int new_high_min, new_high_max, new_batch; - new_batch = max(1, zone_batchsize(zone)); + new_batch = zone_batchsize(zone); if (percpu_pagelist_high_fraction) { new_high_min = zone_highsize(zone, new_batch, cpu_online, percpu_pagelist_high_fraction); From a743e0af503a633e4ca68a100d9b2a1a071fe8ae Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 9 Oct 2025 18:24:30 +0100 Subject: [PATCH 032/321] mm/hugetlb: create hstate_is_gigantic_no_runtime helper This is a common condition used to skip operations that cannot be performed on gigantic pages when runtime support is disabled. This helper is introduced as the condition will exist even more when allowing "overcommit" of gigantic hugepages. No functional change intended with this patch. 
Link: https://lkml.kernel.org/r/20251009172433.4158118-1-usamaarif642@gmail.com Signed-off-by: Usama Arif Suggested-by: Andrew Morton Reviewed-by: Shakeel Butt Reviewed-by: Kefeng Wang Acked-by: David Hildenbrand Acked-by: Oscar Salvador Cc: Johannes Weiner Cc: Muchun Song Cc: Rik van Riel Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/hugetlb.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4e016433e32e..bd53d367b18b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -135,6 +135,17 @@ static void hugetlb_free_folio(struct folio *folio) folio_put(folio); } +/* + * Check if the hstate represents gigantic pages but gigantic page + * runtime support is not available. This is a common condition used to + * skip operations that cannot be performed on gigantic pages when runtime + * support is disabled. + */ +static inline bool hstate_is_gigantic_no_runtime(struct hstate *h) +{ + return hstate_is_gigantic(h) && !gigantic_page_runtime_supported(); +} + static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) @@ -1535,7 +1546,7 @@ static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); - if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + if (hstate_is_gigantic_no_runtime(h)) return; list_del(&folio->lru); @@ -1597,7 +1608,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, { bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio); - if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + if (hstate_is_gigantic_no_runtime(h)) return; /* @@ -2491,7 +2502,7 @@ static void return_unused_surplus_pages(struct hstate *h, /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; - if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + if (hstate_is_gigantic_no_runtime(h)) goto out; /* @@ -3705,7 +3716,7 @@ static void __init hugetlb_init_hstates(void) * - If CMA allocation is possible, we can not demote * HUGETLB_PAGE_ORDER or smaller size pages. */ - if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + if (hstate_is_gigantic_no_runtime(h)) continue; if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER) continue; @@ -4182,7 +4193,7 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, int err; nodemask_t nodes_allowed, *n_mask; - if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + if (hstate_is_gigantic_no_runtime(h)) return -EINVAL; if (nid == NUMA_NO_NODE) { From eb02f14c4a2bf4c242d91c4a5d7fb57c3c0ad1b1 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 9 Oct 2025 18:24:31 +0100 Subject: [PATCH 033/321] mm/hugetlb: allow overcommitting gigantic hugepages Currently, gigantic hugepages cannot use the overcommit mechanism (nr_overcommit_hugepages), forcing users to permanently reserve memory via nr_hugepages even when pages might not be actively used. The restriction was added in 2011 [1], which was before there was support for reserving 1G hugepages at runtime. Remove this blanket restriction on gigantic hugepage overcommit. 
This will bring the same benefits to gigantic pages as hugepages: - Memory is only taken out of regular use when actually needed - Unused surplus pages can be returned to the system - Better memory utilization, especially with CMA backing which can significantly increase the changes of hugepage allocation Without this patch: echo 3 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_overcommit_hugepages bash: echo: write error: Invalid argument With this patch: echo 3 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_overcommit_hugepages ./mmap_hugetlb_test Successfully allocated huge pages at address: 0x7f9d40000000 cat mmap_hugetlb_test.c ... unsigned long ALLOC_SIZE = 3 * (unsigned long) HUGE_PAGE_SIZE; addr = mmap(NULL, ALLOC_SIZE, // 3GB PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_1GB, -1, 0); if (addr == MAP_FAILED) { fprintf(stderr, "mmap failed: %s\n", strerror(errno)); return 1; } printf("Successfully allocated huge pages at address: %p\n", addr); ... Link: https://lkml.kernel.org/r/20251009172433.4158118-2-usamaarif642@gmail.com Link: https://git.zx2c4.com/linux-rng/commit/mm/hugetlb.c?id=adbe8726dc2a3805630d517270db17e3af86e526 [1] Signed-off-by: Usama Arif Reviewed-by: Shakeel Butt Reviewed-by: Kefeng Wang Acked-by: Oscar Salvador Cc: David Hildenbrand Cc: Johannes Weiner Cc: Muchun Song Cc: Rik van Riel Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/hugetlb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bd53d367b18b..7774c286b3b7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2223,7 +2223,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, { struct folio *folio = NULL; - if (hstate_is_gigantic(h)) + if (hstate_is_gigantic_no_runtime(h)) return NULL; spin_lock_irq(&hugetlb_lock); @@ -4285,7 +4285,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); - if (hstate_is_gigantic(h)) + if (hstate_is_gigantic_no_runtime(h)) return -EINVAL; err = kstrtoul(buf, 10, &input); @@ -5172,7 +5172,7 @@ static int hugetlb_overcommit_handler(const struct ctl_table *table, int write, tmp = h->nr_overcommit_huge_pages; - if (write && hstate_is_gigantic(h)) + if (write && hstate_is_gigantic_no_runtime(h)) return -EINVAL; ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, From 95b34d66480bbc9bc31e78c26b1d5be47358ffc0 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Tue, 23 Sep 2025 00:10:18 -0700 Subject: [PATCH 034/321] mm: always call rmap_walk() on locked folios Patch series "Improve UFFDIO_MOVE scalability by removing anon_vma lock", v2. Userfaultfd has a scalability issue in its UFFDIO_MOVE ioctl, which is heavily used in Android as its java garbage collector uses it for concurrent heap compaction. The issue arises because UFFDIO_MOVE updates folio->mapping to an anon_vma with a different root, in order to move the folio from a src VMA to dst VMA. It performs the operation with the folio locked, but this is insufficient, because rmap_walk() can be performed on non-KSM anonymous folios without folio lock. This means that UFFDIO_MOVE has to acquire the anon_vma write lock of the root anon_vma belonging to the folio it wishes to move. This causes scalability bottleneck when multiple threads perform UFFDIO_MOVE simultanously on distinct pages of the same src VMA. 
In field traces of arm64 android devices, we have observed janky user
interactions due to long (sometimes over ~50ms) uninterruptible sleeps on
the main UI thread, caused by anon_vma lock contention in UFFDIO_MOVE. This
is particularly severe during the beginning of GC's compaction phase, when
multiple threads are likely to be involved.

This patch resolves the issue by removing the exception in rmap_walk() for
non-KSM anon folios, by ensuring that all folios are locked during rmap
walk. This is less problematic than it might seem, as the only major caller
which utilises this mode is shrink_active_list(), which is covered in detail
in the first patch of this series.

As a result of changing our approach to locking, we can remove all the code
that took steps to acquire an anon_vma write lock instead of a folio lock.
This results in a significant simplification and scalability improvement of
the code (currently only in UFFDIO_MOVE).

Furthermore, as a side-effect, folio_lock_anon_vma_read() gets simpler as we
don't need to worry that folio->mapping may have changed under us.

This patch (of 2):

Guarantee that rmap_walk() is called on locked folios so that threads
changing folio->mapping and folio->index for non-KSM anon folios can
serialize on the fine-grained folio lock rather than the anon_vma lock.
Other folio types are already always locked before rmap_walk(). With this,
we are going from 'not necessarily' locking the non-KSM anon folio to
'definitely' locking it during rmap walks.

This patch is in preparation for removing the anon_vma write lock from
UFFDIO_MOVE.

With this patch, three functions are now expected to be called with a locked
folio. To be careful not to miss any case, here is the exhaustive list of
all their callers.

1) rmap_walk() is called from:
   a) folio_referenced()
   b) damon_folio_mkold()
   c) damon_folio_young()
   d) page_idle_clear_pte_refs()
   e) try_to_unmap()
   f) try_to_migrate()
   g) folio_mkclean()
   h) remove_migration_ptes()

In the above list, the first 4 are changed in this patch to try-lock non-KSM
anon folios, as is already done for other folio types. The remaining
functions in the list already hold the folio lock when calling rmap_walk().

2) folio_lock_anon_vma_read() is called from the following functions:
   a) collect_procs_anon()
   b) page_idle_clear_pte_refs()
   c) damon_folio_mkold()
   d) damon_folio_young()
   e) folio_referenced()
   f) try_to_unmap()
   g) try_to_migrate()

All the functions in the above list, except collect_procs_anon(), are
covered by the rmap_walk() list above. For collect_procs_anon(), changing
kill_procs_now() to take the folio lock in this patch ensures that all
callers of folio_lock_anon_vma_read() now hold the lock.

3) folio_get_anon_vma() is called from the following functions, all of which
   already hold the folio lock:
   a) move_pages_huge_pmd()
   b) __folio_split()
   c) move_pages_ptes()
   d) migrate_folio_unmap()
   e) unmap_and_move_huge_page()

Functionally, this patch doesn't break the logic because rmap walkers
generally do some other check to see if what is expected to be mapped did
happen, so it's fine, or otherwise treat things as best-effort.

Among the 4 functions changed in this patch, folio_referenced() is the only
core-mm function, and it is also frequently accessed. To assess the impact
of locking non-KSM anon folios in the shrink_active_list()->
folio_referenced() path, we performed an app cycle test on an arm64 android
device.
During the whole duration of the test there were over 140k invocations of shrink_active_list(), out of which over 29k had at least one non-KSM anon folio on which folio_referenced() was called. In none of these invocations folio_trylock() failed. Of course, we now take a lock where we wouldn't previously have. In the past it would have had a major impact in causing a CoW write fault to copy a page in do_wp_page(), as commit 09854ba94c6a ("mm: do_wp_page() simplification") caused a failure to obtain folio lock to result in a page copy even if one wasn't necessary. However, since commit 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive"), and the introduction of the folio anon exclusive flag, this issue is significantly mitigated. The only case remaining that we might worry about from this perspective is that of read-only folios immediately after fork where the anon exclusive bit will not have been set yet. We note however in the case of read-only just-forked folios that wp_can_reuse_anon_folio() will notice the raised reference count established by shrink_active_list() via isolate_lru_folios() and refuse to reuse in any case, so this will in fact have no impact - the folio lock is ultimately immaterial here. All-in-all it appears that there is little opportunity for meaningful negative impact from this change. Link: https://lkml.kernel.org/r/20250923071019.775806-1-lokeshgidra@google.com Link: https://lkml.kernel.org/r/20250923071019.775806-2-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Acked-by: David Hildenbrand Acked-by: Peter Xu Reviewed-by: Lorenzo Stoakes Cc: Harry Yoo Cc: Suren Baghdasaryan Cc: Barry Song Cc: SeongJae Park Cc: Jann Horn Cc: Kalesh Singh Cc: Nicolas Geoffray Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 16 ++++------------ mm/memory-failure.c | 3 +++ mm/page_idle.c | 8 ++------ mm/rmap.c | 42 ++++++++++++------------------------------ 4 files changed, 21 insertions(+), 48 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 998c5180a603..f61d6dde13dc 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -162,21 +162,17 @@ void damon_folio_mkold(struct folio *folio) .rmap_one = damon_folio_mkold_one, .anon_lock = folio_lock_anon_vma_read, }; - bool need_lock; if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { folio_set_idle(folio); return; } - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) + if (!folio_trylock(folio)) return; rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); + folio_unlock(folio); } @@ -228,7 +224,6 @@ bool damon_folio_young(struct folio *folio) .rmap_one = damon_folio_young_one, .anon_lock = folio_lock_anon_vma_read, }; - bool need_lock; if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { if (folio_test_idle(folio)) @@ -237,14 +232,11 @@ bool damon_folio_young(struct folio *folio) return true; } - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) + if (!folio_trylock(folio)) return false; rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); + folio_unlock(folio); return accessed; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3edebb0cda30..560884dd6250 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2140,7 +2140,10 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, { LIST_HEAD(tokill); + folio_lock(folio); collect_procs(folio, p, &tokill, flags & 
MF_ACTION_REQUIRED); + folio_unlock(folio); + kill_procs(&tokill, true, pfn, flags); } diff --git a/mm/page_idle.c b/mm/page_idle.c index a82b340dc204..9bf573d22e87 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -101,19 +101,15 @@ static void page_idle_clear_pte_refs(struct folio *folio) .rmap_one = page_idle_clear_pte_refs_one, .anon_lock = folio_lock_anon_vma_read, }; - bool need_lock; if (!folio_mapped(folio) || !folio_raw_mapping(folio)) return; - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) + if (!folio_trylock(folio)) return; rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); + folio_unlock(folio); } static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, diff --git a/mm/rmap.c b/mm/rmap.c index ac4f783d6ec2..3c3cf3efa5f6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -489,17 +489,15 @@ void __init anon_vma_init(void) * if there is a mapcount, we can dereference the anon_vma after observing * those. * - * NOTE: the caller should normally hold folio lock when calling this. If - * not, the caller needs to double check the anon_vma didn't change after - * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it - * concurrently without folio lock protection). See folio_lock_anon_vma_read() - * which has already covered that, and comment above remap_pages(). + * NOTE: the caller should hold folio lock when calling this. */ struct anon_vma *folio_get_anon_vma(const struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) @@ -546,7 +544,8 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct anon_vma *root_anon_vma; unsigned long anon_mapping; -retry: + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) @@ -557,17 +556,6 @@ retry: anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { - /* - * folio_move_anon_rmap() might have changed the anon_vma as we - * might not hold the folio lock here. - */ - if (unlikely((unsigned long)READ_ONCE(folio->mapping) != - anon_mapping)) { - up_read(&root_anon_vma->rwsem); - rcu_read_unlock(); - goto retry; - } - /* * If the folio is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will @@ -602,18 +590,6 @@ retry: rcu_read_unlock(); anon_vma_lock_read(anon_vma); - /* - * folio_move_anon_rmap() might have changed the anon_vma as we might - * not hold the folio lock here. 
- */ - if (unlikely((unsigned long)READ_ONCE(folio->mapping) != - anon_mapping)) { - anon_vma_unlock_read(anon_vma); - put_anon_vma(anon_vma); - anon_vma = NULL; - goto retry; - } - if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock @@ -988,7 +964,7 @@ int folio_referenced(struct folio *folio, int is_locked, if (!folio_raw_mapping(folio)) return 0; - if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) { + if (!is_locked) { we_locked = folio_trylock(folio); if (!we_locked) return 1; @@ -2828,6 +2804,12 @@ static void rmap_walk_anon(struct folio *folio, pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; + /* + * The folio lock ensures that folio->mapping can't be changed under us + * to an anon_vma with different root. + */ + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + if (locked) { anon_vma = folio_anon_vma(folio); /* anon_vma disappear under us? */ From cc22b9978509ec973b08457e72d81fbfe4d91ef2 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Tue, 23 Sep 2025 00:10:19 -0700 Subject: [PATCH 035/321] mm/userfaultfd: don't lock anon_vma when performing UFFDIO_MOVE Now that rmap_walk() is guaranteed to be called with the folio lock held, we can stop serializing on the src VMA anon_vma lock when moving an exclusive folio from a src VMA to a dst VMA in UFFDIO_MOVE ioctl. When moving a folio, we modify folio->mapping through folio_move_anon_rmap() and adjust folio->index accordingly. Doing that while we could have concurrent RMAP walks would be dangerous. Therefore, to avoid that, we had to acquire anon_vma of src VMA in write-mode. That meant that when multiple threads called UFFDIO_MOVE concurrently on distinct pages of the same src VMA, they would serialize on it, hurting scalability. In addition to avoiding the scalability bottleneck, this patch also simplifies the complicated lock dance that UFFDIO_MOVE has to go through between RCU, folio-lock, ptl, and anon_vma. folio_move_anon_rmap() already enforces that the folio is locked. So when we have the folio locked we can no longer race with concurrent rmap_walk() as used by folio_referenced() and others who call it on unlocked non-KSM anon folios, and therefore the anon_vma lock is no longer required. Note that this handling is now the same as for other folio_move_anon_rmap() users that also do not hold the anon_vma lock -- namely COW reuse handling (do_wp_page()->wp_can_reuse_anon_folio(), do_huge_pmd_wp_page(), and hugetlb_wp()). These users never required the anon_vma lock as they are only moving the anon VMA closer to the anon_vma leaf of the VMA, for example, from an anon_vma root to a leaf of that root. rmap walks were always able to tolerate that scenario. 
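For clarity, a simplified sketch (not the exact mm/userfaultfd.c code; the RCU section, the trylock/retry dance and error handling are omitted) of the serialization a present, exclusive small folio now relies on:

	folio_lock(src_folio);			/* blocks concurrent rmap_walk() */
	if (!PageAnonExclusive(&src_folio->page)) {
		folio_unlock(src_folio);
		return -EAGAIN;			/* not exclusively mapped, can't move */
	}
	folio_move_anon_rmap(src_folio, dst_vma);	/* re-point folio->mapping */
	src_folio->index = linear_page_index(dst_vma, dst_addr);
	/* ... move the PTE under the page table locks ... */
	folio_unlock(src_folio);		/* unblock rmap walks */

No anon_vma lock appears anywhere in that sequence any more.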
Link: https://lkml.kernel.org/r/20250923071019.775806-3-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Acked-by: David Hildenbrand Acked-by: Peter Xu Reviewed-by: Lorenzo Stoakes Cc: Suren Baghdasaryan Cc: Barry Song Cc: Jann Horn Cc: Kalesh Singh Cc: Lokesh Gidra Cc: Nicolas Geoffray Cc: Harry Yoo Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/huge_memory.c | 22 +---------------- mm/userfaultfd.c | 62 +++++++++--------------------------------------- 2 files changed, 12 insertions(+), 72 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 32479ae27400..7ad9d2ba50d4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2534,7 +2534,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm pmd_t _dst_pmd, src_pmdval; struct page *src_page; struct folio *src_folio; - struct anon_vma *src_anon_vma; spinlock_t *src_ptl, *dst_ptl; pgtable_t src_pgtable; struct mmu_notifier_range range; @@ -2583,23 +2582,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm src_addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); - if (src_folio) { + if (src_folio) folio_lock(src_folio); - /* - * split_huge_page walks the anon_vma chain without the page - * lock. Serialize against it with the anon_vma lock, the page - * lock is not enough. - */ - src_anon_vma = folio_get_anon_vma(src_folio); - if (!src_anon_vma) { - err = -EAGAIN; - goto unlock_folio; - } - anon_vma_lock_write(src_anon_vma); - } else - src_anon_vma = NULL; - dst_ptl = pmd_lockptr(mm, dst_pmd); double_pt_lock(src_ptl, dst_ptl); if (unlikely(!pmd_same(*src_pmd, src_pmdval) || @@ -2644,11 +2629,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); unlock_ptls: double_pt_unlock(src_ptl, dst_ptl); - if (src_anon_vma) { - anon_vma_unlock_write(src_anon_vma); - put_anon_vma(src_anon_vma); - } -unlock_folio: /* unblock rmap walks */ if (src_folio) folio_unlock(src_folio); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0630f188c847..00122f42718c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1035,8 +1035,7 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, */ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, unsigned long src_addr, - pte_t *src_pte, pte_t *dst_pte, - struct anon_vma *src_anon_vma) + pte_t *src_pte, pte_t *dst_pte) { pte_t orig_dst_pte, orig_src_pte; struct folio *folio; @@ -1052,8 +1051,7 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); if (!folio || !folio_trylock(folio)) return NULL; - if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) || - folio_anon_vma(folio) != src_anon_vma) { + if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) { folio_unlock(folio); return NULL; } @@ -1061,9 +1059,8 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, } /* - * Moves src folios to dst in a batch as long as they share the same - * anon_vma as the first folio, are not large, and can successfully - * take the lock via folio_trylock(). + * Moves src folios to dst in a batch as long as they are not large, and can + * successfully take the lock via folio_trylock(). 
*/ static long move_present_ptes(struct mm_struct *mm, struct vm_area_struct *dst_vma, @@ -1073,8 +1070,7 @@ static long move_present_ptes(struct mm_struct *mm, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, - struct folio **first_src_folio, unsigned long len, - struct anon_vma *src_anon_vma) + struct folio **first_src_folio, unsigned long len) { int err = 0; struct folio *src_folio = *first_src_folio; @@ -1132,8 +1128,8 @@ static long move_present_ptes(struct mm_struct *mm, src_pte++; folio_unlock(src_folio); - src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, - dst_pte, src_anon_vma); + src_folio = check_ptes_for_batched_move(src_vma, src_addr, + src_pte, dst_pte); if (!src_folio) break; } @@ -1263,7 +1259,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd pmd_t dummy_pmdval; pmd_t dst_pmdval; struct folio *src_folio = NULL; - struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; long ret = 0; @@ -1347,9 +1342,9 @@ retry: } /* - * Pin and lock both source folio and anon_vma. Since we are in - * RCU read section, we can't block, so on contention have to - * unmap the ptes, obtain the lock and retry. + * Pin and lock source folio. Since we are in RCU read section, + * we can't block, so on contention have to unmap the ptes, + * obtain the lock and retry. */ if (!src_folio) { struct folio *folio; @@ -1423,33 +1418,11 @@ retry: goto retry; } - if (!src_anon_vma) { - /* - * folio_referenced walks the anon_vma chain - * without the folio lock. Serialize against it with - * the anon_vma lock, the folio lock is not enough. - */ - src_anon_vma = folio_get_anon_vma(src_folio); - if (!src_anon_vma) { - /* page was unmapped from under us */ - ret = -EAGAIN; - goto out; - } - if (!anon_vma_trylock_write(src_anon_vma)) { - pte_unmap(src_pte); - pte_unmap(dst_pte); - src_pte = dst_pte = NULL; - /* now we can block and wait */ - anon_vma_lock_write(src_anon_vma); - goto retry; - } - } - ret = move_present_ptes(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, &src_folio, - len, src_anon_vma); + len); } else { struct folio *folio = NULL; @@ -1515,10 +1488,6 @@ retry: } out: - if (src_anon_vma) { - anon_vma_unlock_write(src_anon_vma); - put_anon_vma(src_anon_vma); - } if (src_folio) { folio_unlock(src_folio); folio_put(src_folio); @@ -1792,15 +1761,6 @@ static void uffd_move_unlock(struct vm_area_struct *dst_vma, * virtual regions without knowing if there are transparent hugepage * in the regions or not, but preventing the risk of having to split * the hugepmd during the remap. - * - * If there's any rmap walk that is taking the anon_vma locks without - * first obtaining the folio lock (the only current instance is - * folio_referenced), they will have to verify if the folio->mapping - * has changed after taking the anon_vma lock. If it changed they - * should release the lock and retry obtaining a new anon_vma, because - * it means the anon_vma was changed by move_pages() before the lock - * could be obtained. This is the only additional complexity added to - * the rmap code to provide this anonymous page remapping functionality. 
*/ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, __u64 mode) From 27109f5703398d87727affdabd408f7802925e67 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Thu, 9 Oct 2025 20:54:02 +0500 Subject: [PATCH 036/321] kasan: remove __kasan_save_free_info wrapper Patch series "kasan: cleanups for kasan_enabled() checks". This patch series is the continuation of [1] the previous discussion related to the KASAN internal refactoring. Here we remove kasan_enabled() checks which are duplicated by higher callers. These checks deduplication are also related to the separate patch series [2]. This patch (of 2): We don't need a kasan_enabled() check in kasan_save_free_info() at all. Both the higher level paths (kasan_slab_free and kasan_mempool_poison_object) already contain this check. Therefore, remove the __wrapper. Link: https://lkml.kernel.org/r/20251009155403.1379150-1-snovitoll@gmail.com Link: https://lkml.kernel.org/r/20251009155403.1379150-2-snovitoll@gmail.com Link: https://lore.kernel.org/all/CA+fCnZce3AR+pUesbDkKMtMJ+iR8eDrcjFTbVpAcwjBoZ=gJnQ@mail.gmail.com/ [1] Link: https://lore.kernel.org/all/aNTfPjS2buXMI46D@MiWiFi-R3L-srv/ [2] Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Baoquan He Cc: Christophe Leroy Cc: Dmitriy Vyukov Cc: "Ritesh Harjani (IBM)" Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 2 +- mm/kasan/kasan.h | 7 +------ mm/kasan/tags.c | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index b413c46b3e04..516b49accc4f 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -573,7 +573,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) kasan_save_track(&alloc_meta->alloc_track, flags); } -void __kasan_save_free_info(struct kmem_cache *cache, void *object) +void kasan_save_free_info(struct kmem_cache *cache, void *object) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 07fa7375a848..fc9169a54766 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -399,12 +399,7 @@ void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack); void kasan_save_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); -void __kasan_save_free_info(struct kmem_cache *cache, void *object); -static inline void kasan_save_free_info(struct kmem_cache *cache, void *object) -{ - if (kasan_enabled()) - __kasan_save_free_info(cache, object); -} +void kasan_save_free_info(struct kmem_cache *cache, void *object); #ifdef CONFIG_KASAN_GENERIC bool kasan_quarantine_put(struct kmem_cache *cache, void *object); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index b9f31293622b..d65d48b85f90 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -142,7 +142,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) save_stack_info(cache, object, flags, false); } -void __kasan_save_free_info(struct kmem_cache *cache, void *object) +void kasan_save_free_info(struct kmem_cache *cache, void *object) { save_stack_info(cache, object, 0, true); } From ada5cbe33a5321f8c896a3362c3aafa0bf262110 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Thu, 9 Oct 2025 20:54:03 +0500 Subject: [PATCH 037/321] kasan: cleanup of kasan_enabled() checks Deduplication of kasan_enabled() checks which are already used 
by callers. * Altered functions: check_page_allocation Delete the check because callers have it already in __wrappers in include/linux/kasan.h: __kasan_kfree_large __kasan_mempool_poison_pages __kasan_mempool_poison_object kasan_populate_vmalloc, kasan_release_vmalloc Add __wrappers in include/linux/kasan.h. They are called externally in mm/vmalloc.c. __kasan_unpoison_vmalloc, __kasan_poison_vmalloc Delete checks because there're already kasan_enabled() checks in respective __wrappers in include/linux/kasan.h. release_free_meta -- Delete the check because the higher caller path has it already. See the stack trace: __kasan_slab_free -- has the check already __kasan_mempool_poison_object -- has the check already poison_slab_object kasan_save_free_info release_free_meta kasan_enabled() -- Delete here Link: https://lkml.kernel.org/r/20251009155403.1379150-3-snovitoll@gmail.com Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Baoquan He Cc: Christophe Leroy Cc: Dmitriy Vyukov Cc: "Ritesh Harjani (IBM)" Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/kasan.h | 20 ++++++++++++++++++-- mm/kasan/common.c | 3 --- mm/kasan/generic.c | 3 --- mm/kasan/shadow.c | 20 ++++---------------- 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d12e1a5f5a9a..f335c1d7b61d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -571,11 +571,27 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); -void kasan_release_vmalloc(unsigned long start, unsigned long end, +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); +static inline int kasan_populate_vmalloc(unsigned long addr, + unsigned long size, gfp_t gfp_mask) +{ + if (kasan_enabled()) + return __kasan_populate_vmalloc(addr, size, gfp_mask); + return 0; +} +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags); +static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end, + unsigned long flags) +{ + if (kasan_enabled()) + return __kasan_release_vmalloc(start, end, free_region_start, + free_region_end, flags); +} #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d4c14359feaf..22e5d67ff064 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -305,9 +305,6 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, static inline bool check_page_allocation(void *ptr, unsigned long ip) { - if (!kasan_enabled()) - return false; - if (ptr != page_address(virt_to_head_page(ptr))) { kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); return true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 516b49accc4f..2b8e73f5f6a7 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -506,9 +506,6 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { - if (!kasan_enabled()) - return; - /* Check if free meta is valid. 
*/ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META) return; diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index a30d84bfdd52..29a751a8a08d 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -354,7 +354,7 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask return 0; } -static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask) +static int __kasan_populate_vmalloc_do(unsigned long start, unsigned long end, gfp_t gfp_mask) { unsigned long nr_pages, nr_total = PFN_UP(end - start); struct vmalloc_populate_data data; @@ -395,14 +395,11 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_ return ret; } -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) { unsigned long shadow_start, shadow_end; int ret; - if (!kasan_enabled()) - return 0; - if (!is_vmalloc_or_module_addr((void *)addr)) return 0; @@ -424,7 +421,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mas shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask); + ret = __kasan_populate_vmalloc_do(shadow_start, shadow_end, gfp_mask); if (ret) return ret; @@ -566,7 +563,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, * pages entirely covered by the free region, we will not run in to any * trouble - any simultaneous allocations will be for disjoint regions. */ -void kasan_release_vmalloc(unsigned long start, unsigned long end, +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags) @@ -575,9 +572,6 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; - if (!kasan_enabled()) - return; - region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -626,9 +620,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. */ - if (!kasan_enabled()) - return (void *)start; - if (!is_vmalloc_or_module_addr(start)) return (void *)start; @@ -651,9 +642,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { - if (!kasan_enabled()) - return; - if (!is_vmalloc_or_module_addr(start)) return; From 9686080d627ae95e9a60de29518c07843aad3f77 Mon Sep 17 00:00:00 2001 From: Song Hu Date: Tue, 30 Sep 2025 17:21:51 +0800 Subject: [PATCH 038/321] mm/page_owner: rename proc-prefixed variables for clarity `proc_page_owner_operations` and related variables were renamed to `page_owner_fops` to better reflect their association with `debugfs` rather than `/proc`. This improves code clarity and aligns with kernel naming conventions. 
Link: https://lkml.kernel.org/r/20250930092153.843109-1-husong@kylinos.cn Signed-off-by: Song Hu Acked-by: Vlastimil Babka Reviewed-by: Ye Liu Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_owner.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 9c67bb8fb1d9..8ebc2de1c110 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -860,7 +860,7 @@ static void init_early_allocated_pages(void) init_zones_in_node(pgdat); } -static const struct file_operations proc_page_owner_operations = { +static const struct file_operations page_owner_fops = { .read = read_page_owner, .llseek = lseek_page_owner, }; @@ -961,7 +961,7 @@ static int page_owner_stack_open(struct inode *inode, struct file *file) return ret; } -static const struct file_operations page_owner_stack_operations = { +static const struct file_operations page_owner_stack_fops = { .open = page_owner_stack_open, .read = seq_read, .llseek = seq_lseek, @@ -980,7 +980,7 @@ static int page_owner_threshold_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get, +DEFINE_SIMPLE_ATTRIBUTE(page_owner_threshold_fops, &page_owner_threshold_get, &page_owner_threshold_set, "%llu"); @@ -993,24 +993,22 @@ static int __init pageowner_init(void) return 0; } - debugfs_create_file("page_owner", 0400, NULL, NULL, - &proc_page_owner_operations); + debugfs_create_file("page_owner", 0400, NULL, NULL, &page_owner_fops); dir = debugfs_create_dir("page_owner_stacks", NULL); debugfs_create_file("show_stacks", 0400, dir, (void *)(STACK_PRINT_FLAG_STACK | STACK_PRINT_FLAG_PAGES), - &page_owner_stack_operations); + &page_owner_stack_fops); debugfs_create_file("show_handles", 0400, dir, (void *)(STACK_PRINT_FLAG_HANDLE | STACK_PRINT_FLAG_PAGES), - &page_owner_stack_operations); + &page_owner_stack_fops); debugfs_create_file("show_stacks_handles", 0400, dir, (void *)(STACK_PRINT_FLAG_STACK | STACK_PRINT_FLAG_HANDLE), - &page_owner_stack_operations); + &page_owner_stack_fops); debugfs_create_file("count_threshold", 0600, dir, NULL, - &proc_page_owner_threshold); - + &page_owner_threshold_fops); return 0; } late_initcall(pageowner_init) From ca30ac479e6cf7a210dcad32fa2ee99ca0357e91 Mon Sep 17 00:00:00 2001 From: Song Hu Date: Tue, 30 Sep 2025 17:21:52 +0800 Subject: [PATCH 039/321] mm/page_owner: simplify zone iteration logic in init_early_allocated_pages() The current implementation uses nested loops: first iterating over all online nodes, then over zones within each node. This can be simplified by using the for_each_populated_zone() macro which directly iterates through all populated zones. This change: 1. Removes the intermediate init_zones_in_node() function 2. Simplifies init_early_allocated_pages() to use direct zone iteration 3. Updates init_pages_in_zone() to take only zone parameter and access node_id via zone->zone_pgdat The functionality remains identical, but the code is cleaner and more maintainable. 
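For reference, a sketch of the two iteration patterns this patch swaps between (arguments shown as before/after the change):

	/* Before: iterate online nodes, then each node's zones. */
	for_each_online_pgdat(pgdat)
		init_zones_in_node(pgdat);	/* skips !populated_zone() entries */

	/* After: one walk over every populated zone in the system. */
	for_each_populated_zone(zone)
		init_pages_in_zone(zone);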
Link: https://lkml.kernel.org/r/20250930092153.843109-2-husong@kylinos.cn Signed-off-by: Song Hu Reviewed-by: Vlastimil Babka Reviewed-by: Ye Liu Acked-by: Vlastimil Babka Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_owner.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 8ebc2de1c110..a70245684206 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -769,7 +769,7 @@ static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig) return file->f_pos; } -static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) +static void init_pages_in_zone(struct zone *zone) { unsigned long pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); @@ -836,28 +836,15 @@ ext_put_continue: } pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", - pgdat->node_id, zone->name, count); -} - -static void init_zones_in_node(pg_data_t *pgdat) -{ - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) - continue; - - init_pages_in_zone(pgdat, zone); - } + zone->zone_pgdat->node_id, zone->name, count); } static void init_early_allocated_pages(void) { - pg_data_t *pgdat; + struct zone *zone; - for_each_online_pgdat(pgdat) - init_zones_in_node(pgdat); + for_each_populated_zone(zone) + init_pages_in_zone(zone); } static const struct file_operations page_owner_fops = { From a739e6b557af0939ed4031419374f2c563e95b68 Mon Sep 17 00:00:00 2001 From: Yadong Qi Date: Fri, 10 Oct 2025 09:43:11 +0800 Subject: [PATCH 040/321] mm: vmalloc: WARN_ON if mapping size is not PAGE_SIZE aligned In mm/vmalloc.c, the function vmap_pte_range() assumes that the mapping size is aligned to PAGE_SIZE. If this assumption is violated, the loop will become infinite because the termination condition (`addr != end`) will never be met. This can lead to overwriting other VA ranges and/or random pages physically follow the page table. It's the caller's responsibility to ensure that the mapping size is aligned to PAGE_SIZE. However, the memory corruption is hard to root cause. To identify the programming error in the caller easier, check whether the mapping size is PAGE_SIZE aligned with WARN_ON_ONCE(). 
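A simplified sketch of the failure mode (the real vmap_pte_range() loop also handles contiguous-PTE sizes): addr advances a whole page per iteration, so an end that is not PAGE_SIZE aligned is simply stepped over and the termination test never fires:

	do {
		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);

	/* e.g. addr = 0x1000, end = 0x1800: addr becomes 0x2000, 0x3000, ...
	 * and never equals end, so PTEs keep being installed past the
	 * intended range. */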
[yadong.qi@linux.alibaba.com: fix uninitialized value issue] Closes: https://lore.kernel.org/r/202510110050.VG9YKMRK-lkp@intel.com/ Link: https://lkml.kernel.org/r/20251010014311.1689-1-yadong.qi@linux.alibaba.com Signed-off-by: Yadong Qi Reviewed-by: Huang Ying Reviewed-by: Dev Jain Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9a63c91c6150..dcc95931ea9d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -100,6 +100,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct page *page; unsigned long size = PAGE_SIZE; + if (WARN_ON_ONCE(!PAGE_ALIGNED(end - addr))) + return -EINVAL; + pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) @@ -167,6 +170,7 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, { pmd_t *pmd; unsigned long next; + int err = 0; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) @@ -180,10 +184,11 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, continue; } - if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask)) - return -ENOMEM; + err = vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask); + if (err) + break; } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); - return 0; + return err; } static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, @@ -217,6 +222,7 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, { pud_t *pud; unsigned long next; + int err = 0; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) @@ -230,11 +236,11 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, continue; } - if (vmap_pmd_range(pud, addr, next, phys_addr, prot, - max_page_shift, mask)) - return -ENOMEM; + err = vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask); + if (err) + break; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); - return 0; + return err; } static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, @@ -268,6 +274,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, { p4d_t *p4d; unsigned long next; + int err = 0; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) @@ -281,11 +288,11 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, continue; } - if (vmap_pud_range(p4d, addr, next, phys_addr, prot, - max_page_shift, mask)) - return -ENOMEM; + err = vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask); + if (err) + break; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); - return 0; + return err; } static int vmap_range_noflush(unsigned long addr, unsigned long end, From 8cb290dd4b82ab8dd773a36b918fcad2439d2147 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 13 Oct 2025 19:42:22 +0200 Subject: [PATCH 041/321] vmalloc: update __vmalloc_node_noprof() documentation The kernel-doc for __vmalloc_node_noprof() incorrectly states that __GFP_NOFAIL reclaim modifier is not supported. In fact it has been supported since commit 9376130c390a ("mm/vmalloc: add support for __GFP_NOFAIL"). To avoid duplication and future drift, point this helper's doc to __vmalloc_node_range_noprof() for details and the full description. 
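A hedged usage sketch of what the corrected kernel-doc now permits (the caller and buffer name are hypothetical):

	/* Retries internally rather than returning NULL; supported since
	 * commit 9376130c390a ("mm/vmalloc: add support for __GFP_NOFAIL"). */
	buf = __vmalloc(size, GFP_KERNEL | __GFP_NOFAIL);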
Link: https://lkml.kernel.org/r/20251013174222.90123-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index dcc95931ea9d..e207ca64a688 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4034,11 +4034,8 @@ fail: * Allocate enough pages to cover @size from the page level allocator with * @gfp_mask flags. Map them into contiguous kernel virtual space. * - * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL - * and __GFP_NOFAIL are not supported - * - * Any use of gfp flags outside of GFP_KERNEL should be consulted - * with mm people. + * Semantics of @gfp_mask(including reclaim/retry modifiers such as + * __GFP_NOFAIL) are the same as in __vmalloc_node_range_noprof(). * * Return: pointer to the allocated memory or %NULL on error */ From 900fcf00e16844423aa2b08c4999c5773a7a29ec Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Mon, 13 Oct 2025 17:56:20 +0800 Subject: [PATCH 042/321] mm: remove the BOUNCE config option Commit eeadd68e2a5f ("block: remove bounce buffering support") remove block/bounce.c but left the BOUNCE config option. Now this option has no users, so remove it. Link: https://lkml.kernel.org/r/20251013095620.1111061-1-chenhuacai@loongson.cn Signed-off-by: Huacai Chen Acked-by: David Hildenbrand Acked-by: Jens Axboe Reviewed-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: John Garry Signed-off-by: Andrew Morton --- mm/Kconfig | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index ca3f146bc705..4971436c8697 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -695,15 +695,6 @@ config PCP_BATCH_SCALE_MAX config PHYS_ADDR_T_64BIT def_bool 64BIT -config BOUNCE - bool "Enable bounce buffers" - default y - depends on BLOCK && MMU && HIGHMEM - help - Enable bounce buffers for devices that cannot access the full range of - memory available to the CPU. Enabled by default when HIGHMEM is - selected, but you may say n to override this. - config MMU_NOTIFIER bool select INTERVAL_TREE From eb8762dc220c0b0573100a941bfc68df34ece74f Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 14 Oct 2025 21:09:16 +0530 Subject: [PATCH 043/321] drivers/base/node: fold register_node() into register_one_node() Patch series "drivers/base/node: fold node register and unregister functions", v2. The first patch merges register_one_node() and register_node(), leaving a single register_node() function. The second patch merges unregister_one_node() and unregister_node(), leaving a single unregister_node() function. There are no functional changes in these patches. This patch (of 2): register_node() is only called from register_one_node(). This patch folds register_node() into its only caller and renames register_one_node() to register_node(). This reduces unnecessary indirection and simplifies the code structure. No functional changes are introduced. 
[akpm@linux-foundation.org: fix kerneldoc, per David] Link: https://lkml.kernel.org/r/cover.1760097207.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/910853c9dd61f7a2190a56cba101e73e9c6859be.1760097207.git.donettom@linux.ibm.com Signed-off-by: Donet Tom Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Acked-by: David Hildenbrand Cc: Aboorva Devarajan Cc: Christophe Leroy Cc: Danilo Krummrich Cc: Dave Jiang Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Madhavan Srinivasan Cc: Oscar Salvador Cc: Peter Zijlstra Cc: "Ritesh Harjani (IBM)" Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/pci_dlpar.c | 2 +- arch/x86/mm/numa.c | 4 +- drivers/base/node.c | 52 +++++++++------------- include/linux/node.h | 4 +- mm/memory_hotplug.c | 4 +- mm/mm_init.c | 2 +- 6 files changed, 28 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index aeb8633a3d00..8c77ec7980de 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -29,7 +29,7 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) nid = of_node_to_nid(dn); if (likely((nid) >= 0)) { if (!node_online(nid)) { - if (register_one_node(nid)) { + if (register_node(nid)) { pr_err("PCI: Failed to register node %d\n", nid); } else { update_numa_distance(dn); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c24890c40138..7a97327140df 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -262,7 +262,7 @@ void __init init_gi_nodes(void) * bringup_nonboot_cpus * cpu_up * __try_online_node - * register_one_node + * register_node * because node_subsys is not initialized yet. * TODO remove dependency on node_online */ @@ -303,7 +303,7 @@ void __init init_cpu_to_node(void) * bringup_nonboot_cpus * cpu_up * __try_online_node - * register_one_node + * register_node * because node_subsys is not initialized yet. * TODO remove dependency on node_online */ diff --git a/drivers/base/node.c b/drivers/base/node.c index 83aeb0518e1d..17d7b90403ff 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -676,33 +676,6 @@ static void node_device_release(struct device *dev) kfree(to_node(dev)); } -/* - * register_node - Setup a sysfs device for a node. - * @num - Node number to use when creating the device. - * - * Initialize and register the node device. - */ -static int register_node(struct node *node, int num) -{ - int error; - - node->dev.id = num; - node->dev.bus = &node_subsys; - node->dev.release = node_device_release; - node->dev.groups = node_dev_groups; - error = device_register(&node->dev); - - if (error) { - put_device(&node->dev); - } else { - hugetlb_register_node(node); - compaction_register_node(node); - reclaim_register_node(node); - } - - return error; -} - /** * unregister_node - unregister a node device * @node: node going away @@ -907,7 +880,13 @@ void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn, } #endif /* CONFIG_MEMORY_HOTPLUG */ -int register_one_node(int nid) +/** + * register_node - Initialize and register the node device. + * @nid: Node number to use when creating the device. 
+ * + * Return: 0 on success, -errno otherwise + */ +int register_node(int nid) { int error; int cpu; @@ -918,14 +897,23 @@ int register_one_node(int nid) return -ENOMEM; INIT_LIST_HEAD(&node->access_list); - node_devices[nid] = node; - error = register_node(node_devices[nid], nid); + node->dev.id = nid; + node->dev.bus = &node_subsys; + node->dev.release = node_device_release; + node->dev.groups = node_dev_groups; + + error = device_register(&node->dev); if (error) { - node_devices[nid] = NULL; + put_device(&node->dev); return error; } + node_devices[nid] = node; + hugetlb_register_node(node); + compaction_register_node(node); + reclaim_register_node(node); + /* link cpu under this node */ for_each_present_cpu(cpu) { if (cpu_to_node(cpu) == nid) @@ -1018,7 +1006,7 @@ void __init node_dev_init(void) * to already created cpu devices. */ for_each_online_node(i) { - ret = register_one_node(i); + ret = register_node(i); if (ret) panic("%s() failed to add node: %d\n", __func__, ret); } diff --git a/include/linux/node.h b/include/linux/node.h index 866e3323f1fd..b7028d3ec3b4 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -176,7 +176,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) #ifdef CONFIG_NUMA extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ -extern int register_one_node(int nid); +int register_node(int nid); extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); @@ -189,7 +189,7 @@ extern int register_memory_node_under_compute_node(unsigned int mem_nid, static inline void node_dev_init(void) { } -static inline int register_one_node(int nid) +static inline int register_node(int nid) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0be83039c3b5..6c050d867031 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1311,7 +1311,7 @@ static int __try_online_node(int nid, bool set_node_online) if (set_node_online) { node_set_online(nid); - ret = register_one_node(nid); + ret = register_node(nid); BUG_ON(ret); } out: @@ -1542,7 +1542,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error_memblock_remove; if (ret) { node_set_online(nid); - ret = register_one_node(nid); + ret = register_node(nid); if (WARN_ON(ret)) { node_set_offline(nid); goto error_memblock_remove; diff --git a/mm/mm_init.c b/mm/mm_init.c index 7712d887b696..c6812b4dbb2e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1909,7 +1909,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) free_area_init_node(nid); /* - * No sysfs hierarchy will be created via register_one_node() + * No sysfs hierarchy will be created via register_node() *for memory-less node because here it's not marked as N_MEMORY *and won't be set online later. The benefit is userspace *program won't be confused by sysfs files/directories of From d945667dcb1996ddf00ffa8408b579e4ce573652 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 14 Oct 2025 21:09:17 +0530 Subject: [PATCH 044/321] drivers/base/node: fold unregister_node() into unregister_one_node() unregister_node() is only called from unregister_one_node(). This patch folds unregister_node() into its only caller and renames unregister_one_node() to unregister_node(). This reduces unnecessary indirection and simplifies the code structure. No functional changes are introduced. 
[donettom@linux.ibm.com: remove extra spaces before @nid and "All"] Link: https://lkml.kernel.org/r/cff01514-9074-4c97-bcf1-d4e3594e48b0@linux.ibm.com Link: https://lkml.kernel.org/r/32b7d5d8f0f30d313c3e1d8798f591459c8746f9.1760097208.git.donettom@linux.ibm.com Signed-off-by: Donet Tom Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Cc: Aboorva Devarajan Cc: Christophe Leroy Cc: Danilo Krummrich Cc: Dave Jiang Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Madhavan Srinivasan Cc: Oscar Salvador Cc: Peter Zijlstra Cc: "Ritesh Harjani (IBM)" Signed-off-by: Andrew Morton --- drivers/base/node.c | 38 +++++++++++++++++--------------------- include/linux/node.h | 6 ++---- mm/memory_hotplug.c | 4 ++-- 3 files changed, 21 insertions(+), 27 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 17d7b90403ff..00cf4532f121 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -676,23 +676,6 @@ static void node_device_release(struct device *dev) kfree(to_node(dev)); } -/** - * unregister_node - unregister a node device - * @node: node going away - * - * Unregisters a node device @node. All the devices on the node must be - * unregistered before calling this function. - */ -void unregister_node(struct node *node) -{ - hugetlb_unregister_node(node); - compaction_unregister_node(node); - reclaim_unregister_node(node); - node_remove_accesses(node); - node_remove_caches(node); - device_unregister(&node->dev); -} - struct node *node_devices[MAX_NUMNODES]; /* @@ -924,13 +907,26 @@ int register_node(int nid) return error; } - -void unregister_one_node(int nid) +/** + * unregister_node - unregister a node device + * @nid: nid of the node going away + * + * Unregisters the node device at node id @nid. All the devices on the + * node must be unregistered before calling this function. 
+ */ +void unregister_node(int nid) { - if (!node_devices[nid]) + struct node *node = node_devices[nid]; + + if (!node) return; - unregister_node(node_devices[nid]); + hugetlb_unregister_node(node); + compaction_unregister_node(node); + reclaim_unregister_node(node); + node_remove_accesses(node); + node_remove_caches(node); + device_unregister(&node->dev); node_devices[nid] = NULL; } diff --git a/include/linux/node.h b/include/linux/node.h index b7028d3ec3b4..0269b064ba65 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -132,8 +132,6 @@ static inline void register_memory_blocks_under_nodes(void) } #endif -extern void unregister_node(struct node *node); - struct node_notify { int nid; }; @@ -177,7 +175,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ int register_node(int nid); -extern void unregister_one_node(int nid); +void unregister_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); @@ -193,7 +191,7 @@ static inline int register_node(int nid) { return 0; } -static inline int unregister_one_node(int nid) +static inline int unregister_node(int nid) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6c050d867031..94a8f6e8811a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1596,7 +1596,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) error: if (new_node) { node_set_offline(nid); - unregister_one_node(nid); + unregister_node(nid); } error_memblock_remove: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) @@ -2201,7 +2201,7 @@ void try_offline_node(int nid) * node now. */ node_set_offline(nid); - unregister_one_node(nid); + unregister_node(nid); } EXPORT_SYMBOL(try_offline_node); From 03aa8e4f273284a6abf28c0d86529cf3947328b2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 23 Oct 2025 19:37:34 +0800 Subject: [PATCH 045/321] mm: mprotect: always skip dma pinned folio in prot_numa_skip() Patch series "mm: some optimizations for prot numa", v5. This patch (of 4): If the folio (even not CoW folio) is dma pinned, it can't be migrated due to the elevated reference count. So always skip a pinned folio to avoid wasting cycles when folios are migrated. 
Link: https://lkml.kernel.org/r/20251023113737.3572790-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20251023113737.3572790-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Barry Song Reviewed-by: Dev Jain Reviewed-by: Lance Yang Reviewed-by: Sidhartha Kumar Reviewed-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Liam Howlett Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/mprotect.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 988c366137d5..056986d9076a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -136,9 +136,12 @@ static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr, if (folio_is_zone_device(folio) || folio_test_ksm(folio)) goto skip; - /* Also skip shared copy-on-write pages */ - if (is_cow_mapping(vma->vm_flags) && - (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio))) + /* Also skip shared copy-on-write folios */ + if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio)) + goto skip; + + /* Folios are pinned and can't be migrated */ + if (folio_maybe_dma_pinned(folio)) goto skip; /* From 6e97624dacc1a3599bae3724c79f1942e11c2912 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 23 Oct 2025 19:37:35 +0800 Subject: [PATCH 046/321] mm: mprotect: avoid unnecessary struct page accessing if pte_protnone() If the pte_protnone() is true, we could avoid unnecessary struct page accessing and reduce cache footprint when scanning page tables for prot numa, there was a similar change before, see more commit a818f5363a0e ("autonuma: reduce cache footprint when scanning page tables"). Link: https://lkml.kernel.org/r/20251023113737.3572790-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Dev Jain Reviewed-by: Sidhartha Kumar Reviewed-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Barry Song Cc: Lance Yang Cc: Liam Howlett Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/mprotect.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 056986d9076a..6236d120c8e6 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -118,18 +118,13 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep, return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr_ptes, flags); } -static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr, - pte_t oldpte, pte_t *pte, int target_node, - struct folio *folio) +static bool prot_numa_skip(struct vm_area_struct *vma, int target_node, + struct folio *folio) { bool ret = true; bool toptier; int nid; - /* Avoid TLB flush if possible */ - if (pte_protnone(oldpte)) - goto skip; - if (!folio) goto skip; @@ -307,23 +302,25 @@ static long change_pte_range(struct mmu_gather *tlb, struct page *page; pte_t ptent; + /* Already in the desired state. */ + if (prot_numa && pte_protnone(oldpte)) + continue; + page = vm_normal_page(vma, addr, oldpte); if (page) folio = page_folio(page); + /* * Avoid trapping faults against the zero or KSM * pages. See similar comment in change_huge_pmd. 
*/ - if (prot_numa) { - int ret = prot_numa_skip(vma, addr, oldpte, pte, - target_node, folio); - if (ret) { + if (prot_numa && + prot_numa_skip(vma, target_node, folio)) { - /* determine batch to skip */ - nr_ptes = mprotect_folio_pte_batch(folio, - pte, oldpte, max_nr_ptes, /* flags = */ 0); - continue; - } + /* determine batch to skip */ + nr_ptes = mprotect_folio_pte_batch(folio, + pte, oldpte, max_nr_ptes, /* flags = */ 0); + continue; } nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags); From ca43034cdb224131f2ff70a914f3dc43eaa2f516 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 23 Oct 2025 19:37:36 +0800 Subject: [PATCH 047/321] mm: mprotect: convert to folio_can_map_prot_numa() The prot_numa_skip() naming is not good since it updates the folio access time except checking whether to skip prot NUMA, so rename it to folio_can_map_prot_numa(), and cleanup it a bit, remove ret by directly return value instead of goto style. Adding a new helper vma_is_single_threaded_private() to check whether it's a single threaded private VMA, and make folio_can_map_prot_numa() a non-static function so that they could be reused in change_huge_pmd(), since folio_can_map_prot_numa() will be shared in different paths, let's move it near change_prot_numa() in mempolicy.c. Link: https://lkml.kernel.org/r/20251023113737.3572790-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Ryan Roberts Cc: Sidhartha Kumar Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/internal.h | 20 +++++++++++++++ mm/mempolicy.c | 61 +++++++++++++++++++++++++++++++++++++++++++++ mm/mprotect.c | 67 ++++---------------------------------------------- 3 files changed, 86 insertions(+), 62 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index e623c8103358..56a9a714709a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1378,6 +1378,26 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); +static inline bool vma_is_single_threaded_private(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) + return false; + + return atomic_read(&vma->vm_mm->mm_users) == 1; +} + +#ifdef CONFIG_NUMA_BALANCING +bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma, + bool is_private_single_threaded); + +#else +static inline bool folio_can_map_prot_numa(struct folio *folio, + struct vm_area_struct *vma, bool is_private_single_threaded) +{ + return false; +} +#endif + int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eb83cff7db8c..7ae3f5e2dee6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -85,6 +85,7 @@ #include #include #include +#include #include #include #include @@ -99,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -803,6 +805,65 @@ unlock: } #ifdef CONFIG_NUMA_BALANCING +/** + * folio_can_map_prot_numa() - check whether the folio can map prot numa + * @folio: The folio whose mapping considered for being made NUMA hintable + * @vma: The VMA that the folio belongs to. 
+ * @is_private_single_threaded: Is this a single-threaded private VMA or not + * + * This function checks to see if the folio actually indicates that + * we need to make the mapping one which causes a NUMA hinting fault, + * as there are cases where it's simply unnecessary, and the folio's + * access time is adjusted for memory tiering if prot numa needed. + * + * Return: True if the mapping of the folio needs to be changed, false otherwise. + */ +bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma, + bool is_private_single_threaded) +{ + int nid; + + if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio)) + return false; + + /* Also skip shared copy-on-write folios */ + if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio)) + return false; + + /* Folios are pinned and can't be migrated */ + if (folio_maybe_dma_pinned(folio)) + return false; + + /* + * While migration can move some dirty folios, + * it cannot move them all from MIGRATE_ASYNC + * context. + */ + if (folio_is_file_lru(folio) && folio_test_dirty(folio)) + return false; + + /* + * Don't mess with PTEs if folio is already on the node + * a single-threaded process is running on. + */ + nid = folio_nid(folio); + if (is_private_single_threaded && (nid == numa_node_id())) + return false; + + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + node_is_toptier(nid)) + return false; + + if (folio_use_access_time(folio)) + folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); + + return true; +} + /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these diff --git a/mm/mprotect.c b/mm/mprotect.c index 6236d120c8e6..ab4e06cd9a69 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,9 +29,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -118,60 +116,6 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep, return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr_ptes, flags); } -static bool prot_numa_skip(struct vm_area_struct *vma, int target_node, - struct folio *folio) -{ - bool ret = true; - bool toptier; - int nid; - - if (!folio) - goto skip; - - if (folio_is_zone_device(folio) || folio_test_ksm(folio)) - goto skip; - - /* Also skip shared copy-on-write folios */ - if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio)) - goto skip; - - /* Folios are pinned and can't be migrated */ - if (folio_maybe_dma_pinned(folio)) - goto skip; - - /* - * While migration can move some dirty pages, - * it cannot move them all from MIGRATE_ASYNC - * context. - */ - if (folio_is_file_lru(folio) && folio_test_dirty(folio)) - goto skip; - - /* - * Don't mess with PTEs if page is already on the node - * a single-threaded process is running on. 
- */ - nid = folio_nid(folio); - if (target_node == nid) - goto skip; - - toptier = node_is_toptier(nid); - - /* - * Skip scanning top tier node if normal numa - * balancing is disabled - */ - if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier) - goto skip; - - ret = false; - if (folio_use_access_time(folio)) - folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); - -skip: - return ret; -} - /* Set nr_ptes number of ptes, starting from idx */ static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, @@ -274,7 +218,7 @@ static long change_pte_range(struct mmu_gather *tlb, pte_t *pte, oldpte; spinlock_t *ptl; long pages = 0; - int target_node = NUMA_NO_NODE; + bool is_private_single_threaded; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; @@ -285,10 +229,8 @@ static long change_pte_range(struct mmu_gather *tlb, if (!pte) return -EAGAIN; - /* Get target node for single threaded private VMAs */ - if (prot_numa && !(vma->vm_flags & VM_SHARED) && - atomic_read(&vma->vm_mm->mm_users) == 1) - target_node = numa_node_id(); + if (prot_numa) + is_private_single_threaded = vma_is_single_threaded_private(vma); flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); @@ -315,7 +257,8 @@ static long change_pte_range(struct mmu_gather *tlb, * pages. See similar comment in change_huge_pmd. */ if (prot_numa && - prot_numa_skip(vma, target_node, folio)) { + !folio_can_map_prot_numa(folio, vma, + is_private_single_threaded)) { /* determine batch to skip */ nr_ptes = mprotect_folio_pte_batch(folio, From f66e2727ddfcbbe3dbb459e809824f721a914464 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 23 Oct 2025 19:37:37 +0800 Subject: [PATCH 048/321] mm: huge_memory: use folio_can_map_prot_numa() for pmd folio The folio_can_map_prot_numa() checks whether the folio can map prot numa, which skips unsuitable folio, i.e. zone device, shared folios (KSM, CoW), non-movable dma pinned, dirty file folio and folios that already have the expected node affinity. Although the ksm only applies to small folios, an extra test was added for large folios, but the other policies should be applied to pmd folio, which helps to avoid unnecessary pmd change and folio migration attempts. Link: https://lkml.kernel.org/r/20251023113737.3572790-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Sidhartha Kumar Reviewed-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7ad9d2ba50d4..ffdf2ccf8269 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2396,8 +2396,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, #endif if (prot_numa) { - struct folio *folio; - bool toptier; + /* * Avoid trapping faults against the zero page. 
The read-only * data is likely to be read-cached on the local CPU and @@ -2409,19 +2408,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_protnone(*pmd)) goto unlock; - folio = pmd_folio(*pmd); - toptier = node_is_toptier(folio_nid(folio)); - /* - * Skip scanning top tier node if normal numa - * balancing is disabled - */ - if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - toptier) + if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma, + vma_is_single_threaded_private(vma))) goto unlock; - - if (folio_use_access_time(folio)) - folio_xchg_access_time(folio, - jiffies_to_msecs(jiffies)); } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical From 0acc67c4030c39f39ac90413cc5d0abddd3a9527 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Tue, 14 Oct 2025 07:50:08 -0700 Subject: [PATCH 049/321] mm/page_alloc/vmstat: simplify refresh_cpu_vm_stats change detection Patch series "mm/page_alloc: Batch callers of free_pcppages_bulk", v5. Motivation & Approach ===================== While testing workloads with high sustained memory pressure on large machines in the Meta fleet (1Tb memory, 316 CPUs), we saw an unexpectedly high number of softlockups. Further investigation showed that the zone lock in free_pcppages_bulk was being held for a long time, and was called to free 2k+ pages over 100 times just during boot. This causes starvation in other processes for the zone lock, which can lead to the system stalling as multiple threads cannot make progress without the locks. We can see these issues manifesting as warnings: [ 4512.591979] rcu: INFO: rcu_sched self-detected stall on CPU [ 4512.604370] rcu: 20-....: (9312 ticks this GP) idle=a654/1/0x4000000000000000 softirq=309340/309344 fqs=5426 [ 4512.626401] rcu: hardirqs softirqs csw/system [ 4512.638793] rcu: number: 0 145 0 [ 4512.651177] rcu: cputime: 30 10410 174 ==> 10558(ms) [ 4512.666657] rcu: (t=21077 jiffies g=783665 q=1242213 ncpus=316) While these warnings don't indicate a crash or a kernel panic, they do point to the underlying issue of lock contention. To prevent starvation in both locks, batch the freeing of pages using pcp->batch. Because free_pcppages_bulk is called with the pcp lock and acquires the zone lock, relinquishing and reacquiring the locks are only effective when both of them are broken together (unless the system was built with queued spinlocks). Thus, instead of modifying free_pcppages_bulk to break both locks, batch the freeing from its callers instead. A similar fix has been implemented in the Meta fleet, and we have seen significantly less softlockups. Testing ======= The following are a few synthetic benchmarks, made on three machines. The first is a large machine with 754GiB memory and 316 processors. The second is a relatively smaller machine with 251GiB memory and 176 processors. The third and final is the smallest of the three, which has 62GiB memory and 36 processors. On all machines, I kick off a kernel build with -j$(nproc). Negative delta is better (faster compilation). 
Large machine (754GiB memory, 316 processors) make -j$(nproc) +------------+---------------+-----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+-----------+ | real | 0.8070 | - 1.4865 | | user | 0.2823 | + 0.4081 | | sys | 5.0267 | -11.8737 | +------------+---------------+-----------+ Medium machine (251GiB memory, 176 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.2806 | +0.0351 | | user | 0.0994 | +0.3170 | | sys | 0.6229 | -0.6277 | +------------+---------------+----------+ Small machine (62GiB memory, 36 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.1503 | -2.6585 | | user | 0.0431 | -2.2984 | | sys | 0.1870 | -3.2013 | +------------+---------------+----------+ Here, variation is the coefficient of variation, i.e. standard deviation / mean. Based on these results, it seems like there are varying degrees to how much lock contention this reduces. For the largest and smallest machines that I ran the tests on, it seems like there is quite some significant reduction. There is also some performance increases visible from userspace. Interestingly, the performance gains don't scale with the size of the machine, but rather there seems to be a dip in the gain there is for the medium-sized machine. One possible theory is that because the high watermark depends on both memory and the number of local CPUs, what impacts zone contention the most is not these individual values, but rather the ratio of mem:processors. This patch (of 5): Currently, refresh_cpu_vm_stats returns an int, indicating how many changes were made during its updates. Using this information, callers like vmstat_update can heuristically determine if more work will be done in the future. However, all of refresh_cpu_vm_stats's callers either (a) ignore the result, only caring about performing the updates, or (b) only care about whether changes were made, but not *how many* changes were made. Simplify the code by returning a bool instead to indicate if updates were made. In addition, simplify fold_diff and decay_pcp_high to return a bool for the same reason. Link: https://lkml.kernel.org/r/20251014145011.3427205-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20251014145011.3427205-2-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Reviewed-by: Vlastimil Babka Reviewed-by: SeongJae Park Cc: Brendan Jackman Cc: Chris Mason Cc: Johannes Weiner Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- mm/page_alloc.c | 8 ++++---- mm/vmstat.c | 28 +++++++++++++++------------- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 623bee335383..b155929af5b1 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -387,7 +387,7 @@ extern void free_pages(unsigned long addr, unsigned int order); #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 10a908793b4c..f057ce5ea7da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2557,10 +2557,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * Called from the vmstat counter updater to decay the PCP high. * Return whether there are addition works to do. */ -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) { int high_min, to_drain, batch; - int todo = 0; + bool todo = false; high_min = READ_ONCE(pcp->high_min); batch = READ_ONCE(pcp->batch); @@ -2573,7 +2573,7 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX), pcp->high - (pcp->high >> 3), high_min); if (pcp->high > high_min) - todo++; + todo = true; } to_drain = pcp->count - pcp->high; @@ -2581,7 +2581,7 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) spin_lock(&pcp->lock); free_pcppages_bulk(zone, to_drain, pcp, 0); spin_unlock(&pcp->lock); - todo++; + todo = true; } return todo; diff --git a/mm/vmstat.c b/mm/vmstat.c index bb09c032eecf..98855f31294d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -771,25 +771,25 @@ EXPORT_SYMBOL(dec_node_page_state); /* * Fold a differential into the global counters. - * Returns the number of counters updated. + * Returns whether counters were updated. */ static int fold_diff(int *zone_diff, int *node_diff) { int i; - int changes = 0; + bool changed = false; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (zone_diff[i]) { atomic_long_add(zone_diff[i], &vm_zone_stat[i]); - changes++; + changed = true; } for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) if (node_diff[i]) { atomic_long_add(node_diff[i], &vm_node_stat[i]); - changes++; + changed = true; } - return changes; + return changed; } /* @@ -806,16 +806,16 @@ static int fold_diff(int *zone_diff, int *node_diff) * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. * - * The function returns the number of global counters updated. + * The function returns whether global counters were updated. 
*/ -static int refresh_cpu_vm_stats(bool do_pagesets) +static bool refresh_cpu_vm_stats(bool do_pagesets) { struct pglist_data *pgdat; struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; - int changes = 0; + bool changed = false; for_each_populated_zone(zone) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; @@ -839,7 +839,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets) if (do_pagesets) { cond_resched(); - changes += decay_pcp_high(zone, this_cpu_ptr(pcp)); + if (decay_pcp_high(zone, this_cpu_ptr(pcp))) + changed = true; #ifdef CONFIG_NUMA /* * Deal with draining the remote pageset of this @@ -861,13 +862,13 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } if (__this_cpu_dec_return(pcp->expire)) { - changes++; + changed = true; continue; } if (__this_cpu_read(pcp->count)) { drain_zone_pages(zone, this_cpu_ptr(pcp)); - changes++; + changed = true; } #endif } @@ -887,8 +888,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } - changes += fold_diff(global_zone_diff, global_node_diff); - return changes; + if (fold_diff(global_zone_diff, global_node_diff)) + changed = true; + return changed; } /* From fc4b909c368f3a7b08c895dd5926476b58e85312 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Tue, 14 Oct 2025 07:50:09 -0700 Subject: [PATCH 050/321] mm/page_alloc: batch page freeing in decay_pcp_high It is possible for pcp->count - pcp->high to exceed pcp->batch by a lot. When this happens, we should perform batching to ensure that free_pcppages_bulk isn't called with too many pages to free at once and starve out other threads that need the pcp or zone lock. Since we are still only freeing the difference between the initial pcp->count and pcp->high values, there should be no change to how many pages are freed. Link: https://lkml.kernel.org/r/20251014145011.3427205-3-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Suggested-by: Chris Mason Suggested-by: Andrew Morton Co-developed-by: Johannes Weiner Reviewed-by: Vlastimil Babka Cc: Brendan Jackman Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f057ce5ea7da..52480f513ba2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2559,7 +2559,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, */ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) { - int high_min, to_drain, batch; + int high_min, to_drain, to_drain_batched, batch; bool todo = false; high_min = READ_ONCE(pcp->high_min); @@ -2577,11 +2577,14 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) } to_drain = pcp->count - pcp->high; - if (to_drain > 0) { + while (to_drain > 0) { + to_drain_batched = min(to_drain, batch); spin_lock(&pcp->lock); - free_pcppages_bulk(zone, to_drain, pcp, 0); + free_pcppages_bulk(zone, to_drain_batched, pcp, 0); spin_unlock(&pcp->lock); todo = true; + + to_drain -= to_drain_batched; } return todo; From 91e691296646e68d4116b8303aa11d6de0376aef Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Tue, 14 Oct 2025 07:50:10 -0700 Subject: [PATCH 051/321] mm/page_alloc: batch page freeing in free_frozen_page_commit Before returning, free_frozen_page_commit calls free_pcppages_bulk using nr_pcp_free to determine how many pages can appropritately be freed, based on the tunable parameters stored in pcp. While this number is an accurate representation of how many pages should be freed in total, it is not an appropriate number of pages to free at once using free_pcppages_bulk, since we have seen the value consistently go above 2000 in the Meta fleet on larger machines. As such, perform batched page freeing in free_pcppages_bulk by using pcp->batch. In order to ensure that other processes are not starved of the zone lock, free both the zone lock and pcp lock to yield to other threads. Note that because free_frozen_page_commit now performs a spinlock inside the function (and can fail), the function may now return with a freed pcp. To handle this, return true if the pcp is locked on exit and false otherwise. In addition, since free_frozen_page_commit must now be aware of what UP flags were stored at the time of the spin lock, and because we must be able to report new UP flags to the callers, add a new unsigned long* parameter UP_flags to keep track of this. The following are a few synthetic benchmarks, made on three machines. The first is a large machine with 754GiB memory and 316 processors. The second is a relatively smaller machine with 251GiB memory and 176 processors. The third and final is the smallest of the three, which has 62GiB memory and 36 processors. On all machines, I kick off a kernel build with -j$(nproc). 
Negative delta is better (faster compilation) Large machine (754GiB memory, 316 processors) make -j$(nproc) +------------+---------------+-----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+-----------+ | real | 0.8070 | - 1.4865 | | user | 0.2823 | + 0.4081 | | sys | 5.0267 | -11.8737 | +------------+---------------+-----------+ Medium machine (251GiB memory, 176 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.2806 | +0.0351 | | user | 0.0994 | +0.3170 | | sys | 0.6229 | -0.6277 | +------------+---------------+----------+ Small machine (62GiB memory, 36 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.1503 | -2.6585 | | user | 0.0431 | -2.2984 | | sys | 0.1870 | -3.2013 | +------------+---------------+----------+ Here, variation is the coefficient of variation, i.e. standard deviation / mean. [joshua.hahnjy@gmail.com: simplify checks] Link: https://lkml.kernel.org/r/20251014192827.851389-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20251014145011.3427205-4-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Suggested-by: Chris Mason Co-developed-by: Johannes Weiner Reviewed-by: Vlastimil Babka Cc: Brendan Jackman Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 65 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 52480f513ba2..0155a66d7367 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2818,12 +2818,22 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return high; } -static void free_frozen_page_commit(struct zone *zone, +/* + * Tune pcp alloc factor and adjust count & free_count. Free pages to bring the + * pcp's watermarks below high. + * + * May return a freed pcp, if during page freeing the pcp spinlock cannot be + * reacquired. Return true if pcp is locked, false otherwise. + */ +static bool free_frozen_page_commit(struct zone *zone, struct per_cpu_pages *pcp, struct page *page, int migratetype, - unsigned int order, fpi_t fpi_flags) + unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags) { int high, batch; + int to_free, to_free_batched; int pindex; + int cpu = smp_processor_id(); + int ret = true; bool free_high = false; /* @@ -2861,15 +2871,46 @@ static void free_frozen_page_commit(struct zone *zone, * Do not attempt to take a zone lock. Let pcp->count get * over high mark temporarily. */ - return; + return true; } high = nr_pcp_high(pcp, zone, batch, free_high); if (pcp->count < high) - return; + return true; + + to_free = nr_pcp_free(pcp, batch, high, free_high); + while (to_free > 0 && pcp->count > 0) { + to_free_batched = min(to_free, batch); + free_pcppages_bulk(zone, to_free_batched, pcp, pindex); + to_free -= to_free_batched; + + if (to_free == 0 || pcp->count == 0) + break; + + pcp_spin_unlock(pcp); + pcp_trylock_finish(*UP_flags); + + pcp_trylock_prepare(*UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (!pcp) { + pcp_trylock_finish(*UP_flags); + ret = false; + break; + } + + /* + * Check if this thread has been migrated to a different CPU. + * If that is the case, give up and indicate that the pcp is + * returned in an unlocked state. 
+ */ + if (smp_processor_id() != cpu) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(*UP_flags); + ret = false; + break; + } + } - free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), - pcp, pindex); if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && zone_watermark_ok(zone, 0, high_wmark_pages(zone), ZONE_MOVABLE, 0)) { @@ -2887,6 +2928,7 @@ static void free_frozen_page_commit(struct zone *zone, next_memory_node(pgdat->node_id) < MAX_NUMNODES) atomic_set(&pgdat->kswapd_failures, 0); } + return ret; } /* @@ -2934,7 +2976,9 @@ static void __free_frozen_pages(struct page *page, unsigned int order, pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags); + if (!free_frozen_page_commit(zone, pcp, page, migratetype, + order, fpi_flags, &UP_flags)) + return; pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, fpi_flags); @@ -3034,8 +3078,11 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_frozen_page_commit(zone, pcp, &folio->page, migratetype, - order, FPI_NONE); + if (!free_frozen_page_commit(zone, pcp, &folio->page, + migratetype, order, FPI_NONE, &UP_flags)) { + pcp = NULL; + locked_zone = NULL; + } } if (pcp) { From 0f21b911011f9fcdc8fab584d6bd5a284e2119eb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 15 Oct 2025 19:50:38 +0200 Subject: [PATCH 052/321] mm/page_alloc: simplify and cleanup pcp locking The pcp locking relies on pcp_spin_trylock() which has to be used together with pcp_trylock_prepare()/pcp_trylock_finish() to work properly on !SMP !RT configs. This is tedious and error-prone. We can remove pcp_spin_lock() and underlying pcpu_spin_lock() because we don't use it. Afterwards pcp_spin_unlock() is only used together with pcp_spin_trylock(). Therefore we can add the UP_flags parameter to them both and handle pcp_trylock_prepare()/finish() within. Additionally for the configs where pcp_trylock_prepare()/finish() are no-op (SMP || RT) make them pass &UP_flags to a no-op inline function. This ensures typechecking and makes the local variable "used" so we can remove the __maybe_unused attributes. In my compile testing, bloat-o-meter reported no change on SMP config, so the compiler is capable of optimizing away the no-ops same as before, and we have simplified the code using pcp_spin_trylock(). Link: https://lkml.kernel.org/r/20251015-b4-pcp-lock-cleanup-v2-1-740d999595d5@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Joshua Hahn Reviewed-by: Suren Baghdasaryan Cc: Brendan Jackman Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 99 ++++++++++++++++++++----------------------------- 1 file changed, 40 insertions(+), 59 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0155a66d7367..fb91c566327c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -99,9 +99,12 @@ static DEFINE_MUTEX(pcp_batch_high_lock); /* * On SMP, spin_trylock is sufficient protection. * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. + * Pass flags to a no-op inline function to typecheck and silence the unused + * variable warning. 
*/ -#define pcp_trylock_prepare(flags) do { } while (0) -#define pcp_trylock_finish(flag) do { } while (0) +static inline void __pcp_trylock_noop(unsigned long *flags) { } +#define pcp_trylock_prepare(flags) __pcp_trylock_noop(&(flags)) +#define pcp_trylock_finish(flags) __pcp_trylock_noop(&(flags)) #else /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */ @@ -129,15 +132,6 @@ static DEFINE_MUTEX(pcp_batch_high_lock); * Generic helper to lookup and a per-cpu variable with an embedded spinlock. * Return value should be used with equivalent unlock helper. */ -#define pcpu_spin_lock(type, member, ptr) \ -({ \ - type *_ret; \ - pcpu_task_pin(); \ - _ret = this_cpu_ptr(ptr); \ - spin_lock(&_ret->member); \ - _ret; \ -}) - #define pcpu_spin_trylock(type, member, ptr) \ ({ \ type *_ret; \ @@ -157,14 +151,21 @@ static DEFINE_MUTEX(pcp_batch_high_lock); }) /* struct per_cpu_pages specific helpers. */ -#define pcp_spin_lock(ptr) \ - pcpu_spin_lock(struct per_cpu_pages, lock, ptr) +#define pcp_spin_trylock(ptr, UP_flags) \ +({ \ + struct per_cpu_pages *__ret; \ + pcp_trylock_prepare(UP_flags); \ + __ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \ + if (!__ret) \ + pcp_trylock_finish(UP_flags); \ + __ret; \ +}) -#define pcp_spin_trylock(ptr) \ - pcpu_spin_trylock(struct per_cpu_pages, lock, ptr) - -#define pcp_spin_unlock(ptr) \ - pcpu_spin_unlock(lock, ptr) +#define pcp_spin_unlock(ptr, UP_flags) \ +({ \ + pcpu_spin_unlock(lock, ptr); \ + pcp_trylock_finish(UP_flags); \ +}) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -2887,13 +2888,10 @@ static bool free_frozen_page_commit(struct zone *zone, if (to_free == 0 || pcp->count == 0) break; - pcp_spin_unlock(pcp); - pcp_trylock_finish(*UP_flags); + pcp_spin_unlock(pcp, *UP_flags); - pcp_trylock_prepare(*UP_flags); - pcp = pcp_spin_trylock(zone->per_cpu_pageset); + pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags); if (!pcp) { - pcp_trylock_finish(*UP_flags); ret = false; break; } @@ -2904,8 +2902,7 @@ static bool free_frozen_page_commit(struct zone *zone, * returned in an unlocked state. 
*/ if (smp_processor_id() != cpu) { - pcp_spin_unlock(pcp); - pcp_trylock_finish(*UP_flags); + pcp_spin_unlock(pcp, *UP_flags); ret = false; break; } @@ -2937,7 +2934,7 @@ static bool free_frozen_page_commit(struct zone *zone, static void __free_frozen_pages(struct page *page, unsigned int order, fpi_t fpi_flags) { - unsigned long __maybe_unused UP_flags; + unsigned long UP_flags; struct per_cpu_pages *pcp; struct zone *zone; unsigned long pfn = page_to_pfn(page); @@ -2973,17 +2970,15 @@ static void __free_frozen_pages(struct page *page, unsigned int order, add_page_to_zone_llist(zone, page, order); return; } - pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock(zone->per_cpu_pageset); + pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); if (pcp) { if (!free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags, &UP_flags)) return; - pcp_spin_unlock(pcp); + pcp_spin_unlock(pcp, UP_flags); } else { free_one_page(zone, page, pfn, order, fpi_flags); } - pcp_trylock_finish(UP_flags); } void free_frozen_pages(struct page *page, unsigned int order) @@ -2996,7 +2991,7 @@ void free_frozen_pages(struct page *page, unsigned int order) */ void free_unref_folios(struct folio_batch *folios) { - unsigned long __maybe_unused UP_flags; + unsigned long UP_flags; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; int i, j; @@ -3039,8 +3034,7 @@ void free_unref_folios(struct folio_batch *folios) if (zone != locked_zone || is_migrate_isolate(migratetype)) { if (pcp) { - pcp_spin_unlock(pcp); - pcp_trylock_finish(UP_flags); + pcp_spin_unlock(pcp, UP_flags); locked_zone = NULL; pcp = NULL; } @@ -3059,10 +3053,8 @@ void free_unref_folios(struct folio_batch *folios) * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. */ - pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock(zone->per_cpu_pageset); + pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); if (unlikely(!pcp)) { - pcp_trylock_finish(UP_flags); free_one_page(zone, &folio->page, pfn, order, FPI_NONE); continue; @@ -3085,10 +3077,8 @@ void free_unref_folios(struct folio_batch *folios) } } - if (pcp) { - pcp_spin_unlock(pcp); - pcp_trylock_finish(UP_flags); - } + if (pcp) + pcp_spin_unlock(pcp, UP_flags); folio_batch_reinit(folios); } @@ -3339,15 +3329,12 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct per_cpu_pages *pcp; struct list_head *list; struct page *page; - unsigned long __maybe_unused UP_flags; + unsigned long UP_flags; /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ - pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock(zone->per_cpu_pageset); - if (!pcp) { - pcp_trylock_finish(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); + if (!pcp) return NULL; - } /* * On allocation, reduce the number of pages that are batch freed. 
@@ -3357,8 +3344,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp->free_count >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); - pcp_spin_unlock(pcp); - pcp_trylock_finish(UP_flags); + pcp_spin_unlock(pcp, UP_flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); @@ -5045,7 +5031,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct page **page_array) { struct page *page; - unsigned long __maybe_unused UP_flags; + unsigned long UP_flags; struct zone *zone; struct zoneref *z; struct per_cpu_pages *pcp; @@ -5139,10 +5125,9 @@ retry_this_zone: goto failed; /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ - pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock(zone->per_cpu_pageset); + pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); if (!pcp) - goto failed_irq; + goto failed; /* Attempt the batch allocation */ pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; @@ -5159,8 +5144,8 @@ retry_this_zone: if (unlikely(!page)) { /* Try and allocate at least one page */ if (!nr_account) { - pcp_spin_unlock(pcp); - goto failed_irq; + pcp_spin_unlock(pcp, UP_flags); + goto failed; } break; } @@ -5171,8 +5156,7 @@ retry_this_zone: page_array[nr_populated++] = page; } - pcp_spin_unlock(pcp); - pcp_trylock_finish(UP_flags); + pcp_spin_unlock(pcp, UP_flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); @@ -5180,9 +5164,6 @@ retry_this_zone: out: return nr_populated; -failed_irq: - pcp_trylock_finish(UP_flags); - failed: page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); if (page) From 2f79ddb64b472e50482f6728fec3b44156e5d844 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Wed, 15 Oct 2025 17:38:50 +0800 Subject: [PATCH 053/321] tools/mm: use <stdbool.h> in page_owner_sort.c Use standard <stdbool.h> instead of manually defining bool, true and false. Link: https://lkml.kernel.org/r/20251015093851.109663-1-ye.liu@linux.dev Signed-off-by: Ye Liu Reviewed-by: Dev Jain Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/mm/page_owner_sort.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index 880e36df0c11..894a765652ac 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -13,6 +13,7 @@ #include #include +#include <stdbool.h> #include #include #include @@ -23,9 +24,6 @@ #include #include -#define bool int -#define true 1 -#define false 0 #define TASK_COMM_LEN 16 struct block_list { From a059ad48b453ad9ebdefaba6248229e83e5ad4d9 Mon Sep 17 00:00:00 2001 From: wang lian Date: Wed, 15 Oct 2025 17:29:57 +0800 Subject: [PATCH 054/321] mm/khugepaged: fix comment for default scan sleep duration The comment for khugepaged_scan_sleep_millisecs incorrectly states the default scan period is 30 seconds. The actual default value in the code is 10000ms (10 seconds). This patch corrects the comment to match the code, preventing potential confusion. The incorrect comment has existed since the feature was first introduced. While at it, replace the magic value 512 by HPAGE_PMD_NR and use 'ptes'.
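For a rough sense of the scan rate the corrected comment describes, the defaults work out as below. This is an illustrative sketch only, not kernel code, and it assumes the common x86-64 configuration of 4 KiB base pages with 2 MiB PMD-sized THPs, so HPAGE_PMD_NR == 512:

/*
 * Illustrative arithmetic only (assumed configuration, not part of the
 * patch): khugepaged's defaults of 8 * HPAGE_PMD_NR ptes per wakeup and
 * a 10000 ms scan sleep.
 */
#include <stdio.h>

#define ASSUMED_HPAGE_PMD_NR	512	/* 2 MiB PMD / 4 KiB base page */

int main(void)
{
	unsigned int pages_to_scan = 8 * ASSUMED_HPAGE_PMD_NR;	/* 4096 ptes */
	unsigned int scan_sleep_ms = 10000;			/* 10 seconds */

	printf("khugepaged default: scan up to %u ptes every %u ms\n",
	       pages_to_scan, scan_sleep_ms);
	return 0;
}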
Link: https://lkml.kernel.org/r/20251015092957.37432-1-lianux.mm@gmail.com Signed-off-by: wang lian Suggested-by: David Hildenbrand Reviewed-by: Dev Jain Reviewed-by: Wei Yang Reviewed-by: Lance Yang Acked-by: David Hildenbrand Reviewed-by: SeongJae Park Reviewed-by: Vishal Moola (Oracle) Acked-by: Nico Pache Cc: Andrea Arcangeli Cc: Baolin Wang Cc: Barry Song Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Rik van Riel Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index abe54f0043c7..1be61cd5440d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -67,7 +67,7 @@ enum scan_result { static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); -/* default scan 8*512 pte (or vmas) every 30 second */ +/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; From d929525c2e30abee621bf71f143ba6104c81ff2b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 16 Oct 2025 09:10:35 -0700 Subject: [PATCH 055/321] memcg: net: track network throttling due to memcg memory pressure The kernel can throttle network sockets if the memory cgroup associated with the corresponding socket is under memory pressure. The throttling actions include clamping the transmit window, failing to expand receive or send buffers, aggressively prune out-of-order receive queue, FIN deferred to a retransmitted packet and more. Let's add memcg metric to track such throttling actions. At the moment memcg memory pressure is defined through vmpressure and in future it may be defined using PSI or we may add more flexible way for the users to define memory pressure, maybe through ebpf. However the potential throttling actions will remain the same, so this newly introduced metric will continue to track throttling actions irrespective of how memcg memory pressure is defined. Link: https://lkml.kernel.org/r/20251016161035.86161-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Daniel Sedlak Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kacinski Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Neal Cardwell Cc: Paolo Abeni Cc: Simon Horman Cc: Tejun Heo Cc: Willem de Bruijn Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 4 ++++ include/linux/memcontrol.h | 1 + include/net/sock.h | 6 +++++- kernel/cgroup/cgroup.c | 1 + mm/memcontrol.c | 3 +++ 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0e6c67ac585a..3345961c30ac 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1515,6 +1515,10 @@ The following nested keys are defined. oom_group_kill The number of times a group OOM has occurred. + sock_throttled + The number of times network sockets associated with + this cgroup are throttled. + memory.events.local Similar to memory.events but the fields in the file are local to the cgroup i.e. not hierarchical. 
The file modified event diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 873e510d6f8d..5fe254813123 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,6 +52,7 @@ enum memcg_memory_event { MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, + MEMCG_SOCK_THROTTLED, MEMCG_NR_MEMORY_EVENTS, }; diff --git a/include/net/sock.h b/include/net/sock.h index 60bcb13f045c..ff7d49af1619 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2635,8 +2635,12 @@ static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) #endif /* CONFIG_MEMCG_V1 */ do { - if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg))) + if (time_before64(get_jiffies_64(), + mem_cgroup_get_socket_pressure(memcg))) { + memcg_memory_event(mem_cgroup_from_sk(sk), + MEMCG_SOCK_THROTTLED); return true; + } } while ((memcg = parent_mem_cgroup(memcg))); return false; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index fdee387f0d6b..8df671c59987 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4704,6 +4704,7 @@ void cgroup_file_notify(struct cgroup_file *cfile) } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +EXPORT_SYMBOL_GPL(cgroup_file_notify); /** * cgroup_file_show - show or hide a hidden cgroup file diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ae5cbcaed75..976412c8196e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -81,6 +81,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); struct mem_cgroup *root_mem_cgroup __read_mostly; +EXPORT_SYMBOL(root_mem_cgroup); /* Active memory cgroup to use from an interrupt context */ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); @@ -4463,6 +4464,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) atomic_long_read(&events[MEMCG_OOM_KILL])); seq_printf(m, "oom_group_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); + seq_printf(m, "sock_throttled %lu\n", + atomic_long_read(&events[MEMCG_SOCK_THROTTLED])); } static int memory_events_show(struct seq_file *m, void *v) From 5bf65d4a8dbe354999596ef6d14bd70839573b16 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Thu, 16 Oct 2025 13:49:25 +0800 Subject: [PATCH 056/321] tools/mm/page_owner_sort: add help option support Add -h/--help option to display usage information and improve code style. 
Link: https://lkml.kernel.org/r/20251016054927.138510-1-ye.liu@linux.dev Signed-off-by: Ye Liu Cc: SeongJae Park Signed-off-by: Andrew Morton --- tools/mm/page_owner_sort.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index 894a765652ac..14c67e9e84c4 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -667,14 +667,15 @@ int main(int argc, char **argv) { "pid", required_argument, NULL, 1 }, { "tgid", required_argument, NULL, 2 }, { "name", required_argument, NULL, 3 }, - { "cull", required_argument, NULL, 4 }, - { "sort", required_argument, NULL, 5 }, + { "cull", required_argument, NULL, 4 }, + { "sort", required_argument, NULL, 5 }, + { "help", no_argument, NULL, 'h' }, { 0, 0, 0, 0}, }; compare_flag = COMP_NO_FLAG; - while ((opt = getopt_long(argc, argv, "admnpstP", longopts, NULL)) != -1) + while ((opt = getopt_long(argc, argv, "admnpstPh", longopts, NULL)) != -1) switch (opt) { case 'a': compare_flag |= COMP_ALLOC; @@ -700,6 +701,9 @@ int main(int argc, char **argv) case 'n': compare_flag |= COMP_COMM; break; + case 'h': + usage(); + exit(0); case 1: filter = filter | FILTER_PID; fc.pids = parse_nums_list(optarg, &fc.pids_size); From b34619af9c3fa181624babcdd319d63f5f4d92e7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 17 Oct 2025 15:53:06 +0800 Subject: [PATCH 057/321] mm: vmscan: filter out the dirty file folios for node_reclaim() Patch series "optimize the logic for handling dirty file folios during reclaim", v2. Since we no longer attempt to write back filesystem folios during reclaim, some logic for handling dirty file folios in the reclaim process also needs to be updated. Please check the details in each patch. This patch (of 2): After commit 6b0dfabb3555 ("fs: Remove aops->writepage"), we no longer attempt to write back filesystem folios in pageout(), and only tmpfs/shmem folios and anonymous swapcache folios can be written back. Therefore, we should also filter out the dirty filesystem folios for node_reclaim() to avoid unnecessary LRU scans. Link: https://lkml.kernel.org/r/cover.1760687075.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/c91f5ecc5152b647904c7503618a01885d913928.1760687075.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/vmscan.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index bba0d075b2bb..e53ac12cc802 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7623,9 +7623,11 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) else nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); - /* If we can't clean pages, remove dirty pages from consideration */ - if (!(node_reclaim_mode & RECLAIM_WRITE)) - delta += node_page_state(pgdat, NR_FILE_DIRTY); + /* + * Since we can't clean folios through reclaim, remove dirty file + * folios from consideration. 
+ */ + delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) From 2f05435df9320e70f7a98149eb4b043ff361a120 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 17 Oct 2025 15:53:07 +0800 Subject: [PATCH 058/321] mm: vmscan: simplify the logic for activating dirty file folios After commit 6b0dfabb3555 ("fs: Remove aops->writepage"), we no longer attempt to write back filesystem folios through reclaim. However, in the shrink_folio_list() function, there still remains some logic related to writeback control of dirty file folios. The original logic was that, for direct reclaim, or when folio_test_reclaim() is false, or the PGDAT_DIRTY flag is not set, the dirty file folios would be directly activated to avoid being scanned again; otherwise, it will try to writeback the dirty file folios. However, since we can no longer perform writeback on dirty folios, the dirty file folios will still be activated. Additionally, under the original logic, if we continue to try writeback dirty file folios, we will also check the references flag, sc->may_writepage, and may_enter_fs(), which may result in dirty file folios being left in the inactive list. This is unreasonable. Even if these dirty folios are scanned again, we still cannot clean them. Therefore, the checks on these dirty file folios appear to be redundant and can be removed. Dirty file folios should be directly moved to the active list to avoid being scanned again. Since we set the PG_reclaim flag for the dirty folios, once the writeback is completed, they will be moved back to the tail of the inactive list to be retried for quick reclaim. Link: https://lkml.kernel.org/r/ba5c49955fd93c6850bcc19abf0e02e1573768aa.1760687075.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ---- mm/vmscan.c | 25 +++---------------------- 2 files changed, 3 insertions(+), 26 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7fb7331c5725..4398e027f450 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1060,10 +1060,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_DIRTY, /* reclaim scanning has recently found - * many dirty file pages at the tail - * of the LRU. - */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ diff --git a/mm/vmscan.c b/mm/vmscan.c index e53ac12cc802..ecc90517b791 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1409,21 +1409,7 @@ retry: mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { - /* - * Only kswapd can writeback filesystem folios - * to avoid risk of stack overflow. But avoid - * injecting inefficient single-folio I/O into - * flusher writeback as much as possible: only - * write folios when we've encountered many - * dirty folios, and when we've already scanned - * the rest of the LRU for clean folios and see - * the same dirty folios again (with the reclaim - * flag set). - */ - if (folio_is_file_lru(folio) && - (!current_is_kswapd() || - !folio_test_reclaim(folio) || - !test_bit(PGDAT_DIRTY, &pgdat->flags))) { + if (folio_is_file_lru(folio)) { /* * Immediately reclaim when written back. 
* Similar in principle to folio_deactivate() @@ -1432,7 +1418,8 @@ retry: */ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, nr_pages); - folio_set_reclaim(folio); + if (!folio_test_reclaim(folio)) + folio_set_reclaim(folio); goto activate_locked; } @@ -6127,11 +6114,6 @@ again: if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); - /* Allow kswapd to start writing pages during reclaim.*/ - if (sc->nr.unqueued_dirty && - sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - /* * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it @@ -6872,7 +6854,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat) clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); - clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } From d3946c5f4c1c5db63532eb433a55c7d881de1389 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:53 -0700 Subject: [PATCH 059/321] mm/damon: document damos_quota_goal->nid use case Patch series "mm/damon: allow DAMOS auto-tuned for per-memcg per-node memory usage". Introduce two new DAMOS quota auto-tuning target metrics for per-cgroup per-NUMA node memory utilization. Expected use cases are cgroup level access-aware NUMA memory managements, such as memory tiering or proactive reclamation on cgroup-based multi-tenant NUMA systems. Background ========== The aim-oriented aggressiveness auto-tuning feature of DAMOS is a highly recommended way for modern DAMOS use cases. Using it, users can specify what system status they want to achieve with what access-aware system operations. For example, reclaim cold memory aiming for 0.5 percent of memory pressure (proactive reclaim), or migrate hot and cold memory between NUMA nodes having different speed (memory tiering). Then DAMOS automatically adjusts the aggressiveness of the system operation (e.g., increase/decrease reclaim target coldness threshold) based on current status of the system. The use case is limited by the supported system status metrics for specifying the target system status. Two new system metrics for per-node memory usage ratio, namely DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, were recently added to extend the use cases for access-aware NUMA nodes management, such as memory tiering. Those are expected to be useful for not only memory tiering but also general access-aware inter-NUMA node page migration, though. Limitation ---------- The per-node memory usage based auto-tuning can be applied only system-wide. For cgroups-based multi-tenant systems, it could arguably harm the fairness. For example, a cgroup may use faster NUMA node memory more than other cgroup, depending on their access pattern. If the user of each cgroup are promised to get the same quality and amount of the system resource, this can arguably be an unfair situation. DAMOS supports cgroup level system operations via DAMOS filter. But the quota auto-tuning system is not aware of cgroups. New DAMOS Quota Tuning Metrics for Per-Cgroup Per-NUMA Memory Usage =================================================================== To overcome the limitation, introduce two new DAMOS quota auto-tuning goal metrics, namely DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP. Those can be thought of as a variant of DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP that extended for cgroups. 
The two metrics specifies per-cgroup, per-node amount of used and unused memory in ratio to the total memory of the node. For example, let's assume a system has two NUMA nodes of size 100 GiB and 50 GiB. And two cgroups are using 40 GiB and 60 GiB of node 0, 20 GiB and 10 GiB of node 1, respectively, as illustrated by the below table. node-0 node-1 Total memory 100 GiB 50 GiB Cgroup A usage 40 GiB 20 GiB Cgroup B usage 60 GiB 10 GiB Then, DAMOS_QUOTA_NODE_MEMCG_USED_BP for the cgroups for the first node are, 40 GiB / 100 GiB = 4,000 bp (40 percent) and 60 GiB / 100 GiB = 6,000 bp (60 percent), respectively. Those for the second node are, 20 GiB / 50 GiB = 4000 bp (40 percent) and 10 GiB / 50 GiB = 2000 bp (20 percent), respectively. DAMOS_QUOTA_NODE_MEMCG_FREE_BP for the four cases are, 60 GiB /100 GiB = 6000 bp, 40 GiB / 100 GiB = 4000 bp, 30 GiB / 50 GiB = 6000 bp, and 40 GiB / 50 GiB = 8000 bp, respectively. DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-0: 4000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-0: 6000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-1: 4000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-1: 2000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-0: 6000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-0: 4000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-1: 6000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-1: 8000 bp Using these, users can specify how much [un]used amount of memory for per-cgroup and per-node DAMOS should make as a result of the auto-tuning. Example Usecase: Cgroup Level Memory Tiering ============================================ Let's suppose a typical and simple tiered memory system. The system equips two NUMA nodes. The first node (node 0) is CPU-attached and fast. The second node (node 1) is CPU-unattached and slow. It runs two cgroups that desire to use about 30 percent and 70 percent of the faster node as much as possible for their hot data, respectively. Then, the user can implement DAMOS-based memory tiering for the system using the DAMON user-space tool (damo), like below. 
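To make the tuning target concrete: the node_memcg_used_bp goal used by the promotion schemes above is the cgroup's memory on the node expressed in basis points (1/100 of a percent) of the node's total memory, so the 29.7% and 69.7% targets correspond to goal values of 2970 bp and 6970 bp. The sketch below mirrors the formula this series later implements in damos_get_node_memcg_used_bp(); it is simplified, standalone C rather than the exact kernel code:

#include <stdio.h>

/*
 * Simplified sketch of the DAMOS_QUOTA_NODE_MEMCG_USED_BP metric: a
 * cgroup's pages on a given node, in basis points of that node's total
 * pages. Not the exact kernel implementation.
 */
static unsigned long node_memcg_used_bp(unsigned long memcg_pages_on_node,
					unsigned long node_total_pages)
{
	return memcg_pages_on_node * 10000 / node_total_pages;
}

int main(void)
{
	/* The earlier example: cgroup A uses 40 GiB of the 100 GiB node 0. */
	printf("%lu bp\n", node_memcg_used_bp(40, 100));	/* prints 4000 */
	return 0;
}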
# ./damo start \ `# kdamond for node 1 (slow)` \ --numa_node 1 --monitoring_intervals_goal 4% 3 5ms 10s \ `# promotion scheme for cgroup a` \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter allow young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_used_bp 29.7% 0 /workloads/a \ \ `# promotion scheme for cgroup b` \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/b \ --damos_filter allow young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_used_bp 69.7% 0 workloads/b \ \ `# kdamond for node 0 (fast)` \ --numa_node 0 --monitoring_intervals_goal 4% 3 5ms 10s \ `# demotion scheme for cgroup a` \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter reject young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_free_bp 70.5% 0 \ \ `# demotion scheme for cgroup b` \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter reject young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_free_bp 30.5% 0 \ \ --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 1 1 1 1 \ --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1 With the command, the user-space tool will ask DAMON to spawn two kernel threads, each for monitoring accesses to node 1 (slow) and node 0 (fast), respectively. It installs two DAMOS schemes on each thread. Let's call them "promotion scheme for cgroup a/b", and "demotion scheme for cgroup a/b" in the order. The promotion schemes are installed on the DAMON thread for node 1 (slow), and demotion schemes are installed on the DAMON thread for node 0 (fast). Cgroup Level Hot Pages Migration (Promotion) -------------------------------------------- Promotion schemes will find memory regions on node 1 (slow), that some access was detected. The schemes will then migrate the found memory to node 0 (fast), hottest pages first. For accurate and effective migration, these schemes use two page level filters. First, the migration will be filtered for only cgroup A and cgroup B. That is, "promotion scheme for cgroup B" will not do the migration if the page is for cgroup A. Secondly, the schemes will ignore pages that having their page table's Accessed bits unset. The per-page Accessed bit check logic will also unset the bit if it was set, for the next check. For controlled amounts of system resource consumption and aiming on the target memory usage, the schemes use quotas setup. The migration is limited to be done only up to 200 MiB per second, to limit the peak system resource usage. And DAMOS_QUOTA_NODE_MEMCG_USED_BP target is set for 29.7% and 69.7% of node 0 (fast), respectively. The target value is lower than the high level goal (30% and 70% system memory), to give headroom on node 0 (fast). DAMOS will adjust the speed of the pages migration based on the target and current per-cgroup node 0 memory usage. For example, if cgroup A is utilizing only 10% of node 0, DAMOS will try to migrate more of cgroup A hot pages from node 1 to node 0, up to 200 MiB per second. If cgroup A utilizes more than 29.7% of node 0 memory, the cgroup A hot pages migration from node 1 to node 0 will be slowed and eventually stopped. 
Cgroup Level Cold Pages Migration (Demotion) -------------------------------------------- Demotion schemes are similar to promotion schemes, but differ in filtering setup and quota tuning setup. Those filter out pages having their page table Accessed bits set. And set 70.5% and 30.5% of node 0 memory free rate for the cgroup A and B, respectively. Hence, if promotion schemes or something made cgroup A and/or B uses more than 29.5% and 69.5% of node 0, demotion schemes will start migrating cold pages of appropriate cgroups in node 0 to node 1, under the 200 MiB per second speed cap, while adjusting the speed based on how much more than wanted memory is being used. The quota target values are set to overlap with promotion targets, to keep a minimum level of page exchanges between the nodes. This is to avoid a case that the target memory utilization is met, and then access pattern changes (pages in node 1 become hotter than pages in node 0) while the memory utilization is unchanged. Without the overlap, neither promotion of hotter pages in node 1, nor demotion of colder pages in node 0 will happen since both goals are met. As a result, the faster and slower node will unexpectedly serve cold and hot data. Test: Per-cgroup Memory Tiering =============================== I ran a simplified cgroup level memory tiering using the feature, and confirmed it works as intended. Setup ----- I configured a QEMU virtual machine representing a simplified version of the system that described on the above cgroup level memory tiering example use case. The system equips 40 CPU cores and two NUMA nodes each having 30 GiB physical memory. The first node (node 0) represents the faster NUMA node, and the second node (node 1) represents the slower NUMA node. In specific, below qemu command line options are used. [...] -object memory-backend-ram,size=30G,id=m0 \ -object memory-backend-ram,size=30G,id=m1 \ -numa node,cpus=0-39,memdev=m0 \ -numa node,memdev=m1 \ [...] I booted the virtual machine with a kernel that this patch series is applied. On the virtual machine, I created two cgroups, namely workload_a and workload_b. And ran a test program in each cgroup, resulting in one process per cgroup. The test program allocates 10 GiB memory and evenly split it into 10 regions. After the allocation, it repeatedly access the first region for one minute, than the second one for one minute, and so on. After the one minute repeated access for the 10-th region is done, it repeats the access from the first region. So the process has 10 GiB of data in total, but only 1 GiB of it is hot at a given moment, and the hot data is gradually changed. While the processes are running, run DAMON for a simple access-aware memory tiering using below script. It migrates hot and cold data of the cgroups into node 0 and node 1, aiming the first and the second cgroups (workload_a and workload_b, respectively) utilizing about 9.7 percent and 19.7 percent of node 0, respectively. Note that this setup is a simplified version of the above example use case, for ease of test. Also note that we assigned 30 GiB physical memory to node 0, but DAMON in this setup works for only 27 GiB of the memory. It is due to an internal implementation detail of DAMON user-space tool that not really important for this test. 
#!/bin/bash damo start \ --numa_node 1 \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_a \ --damos_filter allow young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_used_bp 9.7% 0 /workload_a \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_b \ --damos_filter allow young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_used_bp 19.7% 0 /workload_b \ --numa_node 0 \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_a \ --damos_filter reject young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_free_bp 90.5% 0 /workload_a \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_b \ --damos_filter reject young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_free_bp 80.5% 0 /workload_b \ --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 2 2 2 2 \ --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1 After starting DAMON, the pages continuously be migrated across nodes. A few minutes later, the memory usage of the cgroups converges into the aimed amounts, and keeps the level, as expected. To confirm the status is kept in the target level as expected, I collected the memory usage stat of the cgroups using memory.numa_stat file, after the stats are converged. I repeat the stat collection 42 times with 5 seconds delay between each of the collections. The results are as below: node0_memory_usage average stdev workload_a 2.79GiB 522.06MiB workload_b 5.15GiB 739.10MiB The average values are quite close to the targeted values: 27 GiB * 9.7% = 2.619 GiB for workload_a, and 27 GiB * 19.7% = 5.319 GiB. A level of variances are expected, given the overlap of the promotion/demotion targets, and dynamic data access pattern of the workloads. Give that, the measured variances are at a reasonable level. Patches Sequence ================ The first patch (patch 1) updates the kernel-doc comment of damos_quota_goal struct to clarify usage of optional fields of the struct, since later patches will add such optional fields. Following four patches (patches 2-5) implement a new DAMOS quota goal metric for per-cgroup per-node memory usage. Those extends the core layer interface for the new metric (patch 2), implement the metric value calculation on the core layer (patch 3), add DAMON sysfs interface file for the target cgroup specification (patch 4), and implement support of the new metric on DAMON sysfs interface (patch 5). Next two patches implment the second new DAMOS quota goal metric for per-cgroup per-node free (or, unused) memory. Those implement it in the core layer (patch 6) and DAMON sysfs interface (patch 7), extending the existing implementation for memory usage metric. Final three patches update the design (patch 8), the usage (patch 9), and the ABI (patch 10) documents for the changes that are introduced by this patch series. This patch (of 10): damos_quota_goal kerneldoc comment is not explaining when @metric is used. Update the comment for that. 
Link: https://lkml.kernel.org/r/20251017212706.183502-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251017212706.183502-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index cae8c613c5fc..dc9c310e0e75 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -176,6 +176,9 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually * entered by the user, probably inside the kdamond callbacks. Otherwise, * DAMON sets @current_value with self-measured value of @metric. + * + * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node + * id of the target node to account the used/free memory. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; From 6a18bbe48361acad1eae8d86aa47d353b1cfe619 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:54 -0700 Subject: [PATCH 060/321] mm/damon: add DAMOS quota goal type for per-memcg per-node memory usage Define a new DAMOS quota auto-tuning target metric for per-cgroup per-node memory usage. For specifying the cgroup of the interest, add a field, namely memcg_id, to damos_quota_goal struct. Note that this commit is only implementing the interface. The handling of the interface (the metric value calculation) will be implemented in the following commit. Link: https://lkml.kernel.org/r/20251017212706.183502-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index dc9c310e0e75..0d63ceb7e6ef 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -147,6 +147,7 @@ enum damos_action { * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. + * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -156,6 +157,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_SOME_MEM_PSI_US, DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, + DAMOS_QUOTA_NODE_MEMCG_USED_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -166,6 +168,7 @@ enum damos_quota_goal_metric { * @current_value: Current value of @metric. * @last_psi_total: Last measured total PSI * @nid: Node id. + * @memcg_id: Memcg id. * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The @@ -179,6 +182,9 @@ enum damos_quota_goal_metric { * * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node * id of the target node to account the used/free memory. + * + * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents + * the node id and the cgroup to account the used memory for. 
*/ struct damos_quota_goal { enum damos_quota_goal_metric metric; @@ -187,7 +193,10 @@ struct damos_quota_goal { /* metric-dependent fields */ union { u64 last_psi_total; - int nid; + struct { + int nid; + unsigned short memcg_id; + }; }; struct list_head list; }; From b74a120bcf50787e5b9a2c3dcff999f9836ce1db Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:55 -0700 Subject: [PATCH 061/321] mm/damon/core: implement DAMOS_QUOTA_NODE_MEMCG_USED_BP Implement the handling of the new DAMOS quota goal metric for per-memcg per-node memory usage, namely DAMOS_QUOTA_NODE_MEMCG_USED_BP. The metric value is calculated as the sum of active/inactive anon/file pages of the given cgroup for a given NUMA node. Link: https://lkml.kernel.org/r/20251017212706.183502-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 109b050c795a..8aa8d269df90 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -788,6 +789,10 @@ static void damos_commit_quota_goal_union( case DAMOS_QUOTA_NODE_MEM_FREE_BP: dst->nid = src->nid; break; + case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + dst->nid = src->nid; + dst->memcg_id = src->memcg_id; + break; default: break; } @@ -2035,12 +2040,46 @@ static __kernel_ulong_t damos_get_node_mem_bp( numerator = i.freeram; return numerator * 10000 / i.totalram; } + +static unsigned long damos_get_node_memcg_used_bp( + struct damos_quota_goal *goal) +{ + struct mem_cgroup *memcg; + struct lruvec *lruvec; + unsigned long used_pages; + struct sysinfo i; + + rcu_read_lock(); + memcg = mem_cgroup_from_id(goal->memcg_id); + rcu_read_unlock(); + if (!memcg) { + if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) + return 0; + else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ + return 10000; + } + mem_cgroup_flush_stats(memcg); + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(goal->nid)); + used_pages = lruvec_page_state(lruvec, NR_ACTIVE_ANON); + used_pages += lruvec_page_state(lruvec, NR_INACTIVE_ANON); + used_pages += lruvec_page_state(lruvec, NR_ACTIVE_FILE); + used_pages += lruvec_page_state(lruvec, NR_INACTIVE_FILE); + + si_meminfo_node(&i, goal->nid); + return used_pages * 10000 / i.totalram; +} #else static __kernel_ulong_t damos_get_node_mem_bp( struct damos_quota_goal *goal) { return 0; } + +static unsigned long damos_get_node_memcg_used_bp( + struct damos_quota_goal *goal) +{ + return 0; +} #endif @@ -2061,6 +2100,9 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) case DAMOS_QUOTA_NODE_MEM_FREE_BP: goal->current_value = damos_get_node_mem_bp(goal); break; + case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + goal->current_value = damos_get_node_memcg_used_bp(goal); + break; default: break; } From c41e253a411eb73a5ac651c14f40c2ea2f274ebd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:56 -0700 Subject: [PATCH 062/321] mm/damon/sysfs-schemes: implement path file under quota goal directory Add a DAMOS sysfs file for specifying the cgroup of the interest for DAMOS_QUOTA_NODE_MEMCG_USED_BP. 
Link: https://lkml.kernel.org/r/20251017212706.183502-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 6536f16006c9..2c440a2b80e6 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -999,6 +999,7 @@ struct damos_sysfs_quota_goal { unsigned long target_value; unsigned long current_value; int nid; + char *path; }; static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void) @@ -1128,10 +1129,39 @@ static ssize_t nid_store(struct kobject *kobj, return err ? err : count; } +static ssize_t path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, + struct damos_sysfs_quota_goal, kobj); + + return sysfs_emit(buf, "%s\n", goal->path ? goal->path : ""); +} + +static ssize_t path_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, + struct damos_sysfs_quota_goal, kobj); + char *path = kmalloc_array(size_add(count, 1), sizeof(*path), + GFP_KERNEL); + + if (!path) + return -ENOMEM; + + strscpy(path, buf, count + 1); + kfree(goal->path); + goal->path = path; + return count; +} + static void damos_sysfs_quota_goal_release(struct kobject *kobj) { - /* or, notify this release to the feed callback */ - kfree(container_of(kobj, struct damos_sysfs_quota_goal, kobj)); + struct damos_sysfs_quota_goal *goal = container_of(kobj, + struct damos_sysfs_quota_goal, kobj); + + kfree(goal->path); + kfree(goal); } static struct kobj_attribute damos_sysfs_quota_goal_target_metric_attr = @@ -1146,11 +1176,15 @@ static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr = static struct kobj_attribute damos_sysfs_quota_goal_nid_attr = __ATTR_RW_MODE(nid, 0600); +static struct kobj_attribute damos_sysfs_quota_goal_path_attr = + __ATTR_RW_MODE(path, 0600); + static struct attribute *damos_sysfs_quota_goal_attrs[] = { &damos_sysfs_quota_goal_target_metric_attr.attr, &damos_sysfs_quota_goal_target_value_attr.attr, &damos_sysfs_quota_goal_current_value_attr.attr, &damos_sysfs_quota_goal_nid_attr.attr, + &damos_sysfs_quota_goal_path_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damos_sysfs_quota_goal); From a1d1df78acb3b038d72f97a66b8f5e01af021b7c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:57 -0700 Subject: [PATCH 063/321] mm/damon/sysfs-schemes: support DAMOS_QUOTA_NODE_MEMCG_USED_BP Add support of DAMOS_QUOTA_NODE_MEMCG_USED_BP. For this, extend quota goal metric inputs for the new metric, and update DAMOS core layer request construction logic to set the target cgroup, which is specified by the user, via the 'path' file. 
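For reference, a goal of this kind could be configured directly via the DAMON sysfs interface roughly as below; the kdamond/context/scheme indices, the 970 bp (9.7%) target value, and the /workload_a cgroup path are illustrative assumptions rather than part of this patch:

    # assuming kdamond 0, context 0 and scheme 0 already exist
    goals=/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/quotas/goals
    echo 1 > $goals/nr_goals
    echo node_memcg_used_bp > $goals/0/target_metric
    echo 970 > $goals/0/target_value        # 9.7% in bp (1/10,000)
    echo 0 > $goals/0/nid                   # NUMA node to account the usage for
    echo /workload_a > $goals/0/path        # cgroup path from the cgroup mount point
    echo commit_schemes_quota_goals > /sys/kernel/mm/damon/admin/kdamonds/0/state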
Link: https://lkml.kernel.org/r/20251017212706.183502-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 2c440a2b80e6..c679e62fd4b9 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1030,6 +1030,10 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = { .metric = DAMOS_QUOTA_NODE_MEM_FREE_BP, .name = "node_mem_free_bp", }, + { + .metric = DAMOS_QUOTA_NODE_MEMCG_USED_BP, + .name = "node_memcg_used_bp", + }, }; static ssize_t target_metric_show(struct kobject *kobj, @@ -2526,7 +2530,7 @@ static int damos_sysfs_add_quota_score( struct damos_quota *quota) { struct damos_quota_goal *goal; - int i; + int i, err; for (i = 0; i < sysfs_goals->nr; i++) { struct damos_sysfs_quota_goal *sysfs_goal = @@ -2547,6 +2551,15 @@ static int damos_sysfs_add_quota_score( case DAMOS_QUOTA_NODE_MEM_FREE_BP: goal->nid = sysfs_goal->nid; break; + case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + err = damon_sysfs_memcg_path_to_id( + sysfs_goal->path, &goal->memcg_id); + if (err) { + damos_destroy_quota_goal(goal); + return err; + } + goal->nid = sysfs_goal->nid; + break; default: break; } From 98fdce76fb7ed7070df21afbee46a4b36cb6a7c6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:58 -0700 Subject: [PATCH 064/321] mm/damon/core: add DAMOS quota gaol metric for per-memcg per-numa free memory Add a variant of DAMOS_QUOTA_NODE_MEMCG_USED_BP, for the free memory portion. The value of the metric is implemented as the entire memory of the given NUMA node subtracted by the given cgroup's usage. So from a perspective, "unused" could be a better term than "free". But arguably it is not very clear what is better, so use the term "free". Link: https://lkml.kernel.org/r/20251017212706.183502-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++-- mm/damon/core.c | 10 ++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 0d63ceb7e6ef..0edf41d36ea1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -148,6 +148,7 @@ enum damos_action { * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. + * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -158,6 +159,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, DAMOS_QUOTA_NODE_MEMCG_USED_BP, + DAMOS_QUOTA_NODE_MEMCG_FREE_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -183,8 +185,8 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node * id of the target node to account the used/free memory. * - * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents - * the node id and the cgroup to account the used memory for. + * If @metric is DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP, @nid and @memcg_id + * represents the node id and the cgroup to account the used memory for. 
*/ struct damos_quota_goal { enum damos_quota_goal_metric metric; diff --git a/mm/damon/core.c b/mm/damon/core.c index 8aa8d269df90..a9c11d2d37b0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -790,6 +790,7 @@ static void damos_commit_quota_goal_union( dst->nid = src->nid; break; case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: dst->nid = src->nid; dst->memcg_id = src->memcg_id; break; @@ -2046,7 +2047,7 @@ static unsigned long damos_get_node_memcg_used_bp( { struct mem_cgroup *memcg; struct lruvec *lruvec; - unsigned long used_pages; + unsigned long used_pages, numerator; struct sysinfo i; rcu_read_lock(); @@ -2066,7 +2067,11 @@ static unsigned long damos_get_node_memcg_used_bp( used_pages += lruvec_page_state(lruvec, NR_INACTIVE_FILE); si_meminfo_node(&i, goal->nid); - return used_pages * 10000 / i.totalram; + if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) + numerator = used_pages; + else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ + numerator = i.totalram - used_pages; + return numerator * 10000 / i.totalram; } #else static __kernel_ulong_t damos_get_node_mem_bp( @@ -2101,6 +2106,7 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) goal->current_value = damos_get_node_mem_bp(goal); break; case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: goal->current_value = damos_get_node_memcg_used_bp(goal); break; default: From c2fbf2da4cd94035cd885af5d91eed827509e421 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:59 -0700 Subject: [PATCH 065/321] mm/damon/sysfs-schemes: support DAMOS_QUOTA_NODE_MEMCG_FREE_BP Extend DAMON sysfs to support DAMOS_QUOTA_NODE_MEMCG_FREE_BP. Link: https://lkml.kernel.org/r/20251017212706.183502-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index c679e62fd4b9..c98cf4bd2fbb 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1034,6 +1034,10 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = { .metric = DAMOS_QUOTA_NODE_MEMCG_USED_BP, .name = "node_memcg_used_bp", }, + { + .metric = DAMOS_QUOTA_NODE_MEMCG_FREE_BP, + .name = "node_memcg_free_bp", + }, }; static ssize_t target_metric_show(struct kobject *kobj, @@ -2552,6 +2556,7 @@ static int damos_sysfs_add_quota_score( goal->nid = sysfs_goal->nid; break; case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: err = damon_sysfs_memcg_path_to_id( sysfs_goal->path, &goal->memcg_id); if (err) { From 4cc00d41c6c9f8dfb8b6db831e9fca77582112b2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:27:00 -0700 Subject: [PATCH 066/321] Docs/mm/damon/design: document DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP Update design doc for the newly added two DAMOS quota auto-tuning target goal metrics, DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP. Link: https://lkml.kernel.org/r/20251017212706.183502-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 80354f4f42ba..b54925ea78e9 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -564,9 +564,9 @@ aggressiveness (the quota) of the corresponding scheme. 
For example, if DAMOS is under achieving the goal, DAMOS automatically increases the quota. If DAMOS is over achieving the goal, it decreases the quota. -The goal can be specified with four parameters, namely ``target_metric``, -``target_value``, ``current_value`` and ``nid``. The auto-tuning mechanism -tries to make ``current_value`` of ``target_metric`` be same to +The goal can be specified with five parameters, namely ``target_metric``, +``target_value``, ``current_value``, ``nid`` and ``path``. The auto-tuning +mechanism tries to make ``current_value`` of ``target_metric`` be same to ``target_value``. - ``user_input``: User-provided value. Users could use any metric that they @@ -581,9 +581,18 @@ tries to make ``current_value`` of ``target_metric`` be same to set by users at the initial time. In other words, DAMOS does self-feedback. - ``node_mem_used_bp``: Specific NUMA node's used memory ratio in bp (1/10,000). - ``node_mem_free_bp``: Specific NUMA node's free memory ratio in bp (1/10,000). +- ``node_memcg_used_bp``: Specific cgroup's node used memory ratio for a + specific NUMA node, in bp (1/10,000). +- ``node_memcg_free_bp``: Specific cgroup's node unused memory ratio for a + specific NUMA node, in bp (1/10,000). -``nid`` is optionally required for only ``node_mem_used_bp`` and -``node_mem_free_bp`` to point the specific NUMA node. +``nid`` is optionally required for only ``node_mem_used_bp``, +``node_mem_free_bp``, ``node_memcg_used_bp`` and ``node_memcg_free_bp`` to +point the specific NUMA node. + +``path`` is optionally required for only ``node_memcg_used_bp`` and +``node_memcg_free_bp`` to point the path to the cgroup. The value should be +the path of the memory cgroup from the cgroups mount point. To know how user-space can set the tuning goal metric, the target value, and/or the current value via :ref:`DAMON sysfs interface `, refer to From 87b83515801fbc5fde52b5d97921e434bcc6c889 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:27:01 -0700 Subject: [PATCH 067/321] Docs/admin-guide/mm/damon/usage: document DAMOS quota goal path file A new DAMON sysfs interface file, namely 'path' has been added under DAMOS quota goal directory, for specifying the cgroup for DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP metrics. Document it on the usage document. Link: https://lkml.kernel.org/r/20251017212706.183502-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index eae534bc1bee..98958975604d 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -81,7 +81,7 @@ comma (","). │ │ │ │ │ │ │ :ref:`quotas `/ms,bytes,reset_interval_ms,effective_bytes │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ │ :ref:`goals `/nr_goals - │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value,nid + │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value,nid,path │ │ │ │ │ │ │ :ref:`watermarks `/metric,interval_us,high,mid,low │ │ │ │ │ │ │ :ref:`{core_,ops_,}filters `/nr_filters │ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx,min,max @@ -402,9 +402,9 @@ number (``N``) to the file creates the number of child directories named ``0`` to ``N-1``. Each directory represents each goal and current achievement. 
Among the multiple feedback, the best one is used. -Each goal directory contains four files, namely ``target_metric``, -``target_value``, ``current_value`` and ``nid``. Users can set and get the -four parameters for the quota auto-tuning goals that specified on the +Each goal directory contains five files, namely ``target_metric``, +``target_value``, ``current_value`` ``nid`` and ``path``. Users can set and +get the five parameters for the quota auto-tuning goals that specified on the :ref:`design doc ` by writing to and reading from each of the files. Note that users should further write ``commit_schemes_quota_goals`` to the ``state`` file of the :ref:`kdamond From 40d923acfa83d514718d1aee96b2af91e4e1fff2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:27:02 -0700 Subject: [PATCH 068/321] Docs/ABI/damon: document DAMOS quota goal path file A DAMON sysfs interface file for DAMOS quota goal's optional path argument has been added. Document it on the ABI doc. Link: https://lkml.kernel.org/r/20251017212706.183502-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index b6b71db36ca7..dce6c2cda4e8 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -303,6 +303,12 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the nid parameter of the goal. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/goals//path +Date: Oct 2025 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the path + parameter of the goal. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/weights/sz_permil Date: Mar 2022 Contact: SeongJae Park From 074f027d15c10cb376b3ad88405b8e512fa5b3a8 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 20 Oct 2025 23:11:11 +0800 Subject: [PATCH 069/321] mm/khugepaged: guard is_zero_pfn() calls with pte_present() A non-present entry, like a swap PTE, contains completely different data (swap type and offset). pte_pfn() doesn't know this, so if we feed it a non-present entry, it will spit out a junk PFN. What if that junk PFN happens to match the zeropage's PFN by sheer chance? While really unlikely, this would be really bad if it did. So, let's fix this potential bug by ensuring all calls to is_zero_pfn() in khugepaged.c are properly guarded by a pte_present() check. 
Link: https://lkml.kernel.org/r/20251020151111.53561-1-lance.yang@linux.dev Signed-off-by: Lance Yang Suggested-by: Lorenzo Stoakes Reviewed-by: Nico Pache Reviewed-by: Dev Jain Reviewed-by: Baolin Wang Reviewed-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Liam Howlett Cc: Ryan Roberts Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1be61cd5440d..68e487d53772 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -337,6 +337,13 @@ struct attribute_group khugepaged_attr_group = { }; #endif /* CONFIG_SYSFS */ +static bool pte_none_or_zero(pte_t pte) +{ + if (pte_none(pte)) + return true; + return pte_present(pte) && is_zero_pfn(pte_pfn(pte)); +} + int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags, int advice) { @@ -518,6 +525,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, if (pte_none(pteval)) continue; + VM_WARN_ON_ONCE(!pte_present(pteval)); pfn = pte_pfn(pteval); if (is_zero_pfn(pfn)) continue; @@ -548,8 +556,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); - if (pte_none(pteval) || (pte_present(pteval) && - is_zero_pfn(pte_pfn(pteval)))) { + if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || @@ -690,17 +697,17 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, address += nr_ptes * PAGE_SIZE) { nr_ptes = 1; pteval = ptep_get(_pte); - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (pte_none_or_zero(pteval)) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); - if (is_zero_pfn(pte_pfn(pteval))) { - /* - * ptl mostly unnecessary. - */ - spin_lock(ptl); - ptep_clear(vma->vm_mm, address, _pte); - spin_unlock(ptl); - ksm_might_unmap_zero_page(vma->vm_mm, pteval); - } + if (pte_none(pteval)) + continue; + /* + * ptl mostly unnecessary. + */ + spin_lock(ptl); + ptep_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + ksm_might_unmap_zero_page(vma->vm_mm, pteval); } else { struct page *src_page = pte_page(pteval); @@ -794,7 +801,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, unsigned long src_addr = address + i * PAGE_SIZE; struct page *src_page; - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (pte_none_or_zero(pteval)) { clear_user_highpage(page, src_addr); continue; } @@ -1301,7 +1308,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, goto out_unmap; } } - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || From e859a224fad65cb4848fe202aea9896a14fdb7f4 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Mon, 20 Oct 2025 21:01:24 +0800 Subject: [PATCH 070/321] mm/damon: add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() Patch series "mm/damon: fixes for address alignment issues in DAMON_LRU_SORT and DAMON_RECLAIM", v2. In DAMON_LRU_SORT and DAMON_RECLAIM, damon_set_regions() will apply DAMON_MIN_REGION as the core address alignment, and the monitoring target address ranges would be aligned on DAMON_MIN_REGION * addr_unit. 
When users 1) set addr_unit to a value larger than 1, and 2) set the monitoring target address range as not aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT and DAMON_RECLAIM to operate on unexpectedly large physical address ranges. For example, if the user sets the monitoring target address range to [4, 8) and addr_unit as 1024, the aimed monitoring target address range is [4 KiB, 8 KiB). Assuming DAMON_MIN_REGION is 4096, so resulting target address range will be [0, 4096) in the DAMON core layer address system, and [0, 4 MiB) in the physical address space, which is an unexpected range. To fix the issue, add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() and use it when calling damon_set_regions(), replacing the direct use of DAMON_MIN_REGION. This patch (of 2): In DAMON_LRU_SORT, damon_set_regions() will apply DAMON_MIN_REGION as the core address alignment, and the monitoring target address ranges would be aligned on DAMON_MIN_REGION * addr_unit. When users 1) set addr_unit to a value larger than 1, and 2) set the monitoring target address range as not aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT to operate on unexpectedly large physical address ranges. For example, if the user sets the monitoring target address range to [4, 8) and addr_unit as 1024, the aimed monitoring target address range is [4 KiB, 8 KiB). Assuming DAMON_MIN_REGION is 4096, so resulting target address range will be [0, 4096) in the DAMON core layer address system, and [0, 4 MiB) in the physical address space, which is an unexpected range. To fix the issue, add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() and use it when calling damon_set_regions(), replacing the direct use of DAMON_MIN_REGION. Link: https://lkml.kernel.org/r/20251020130125.2875164-1-yanquanmin1@huawei.com Link: https://lkml.kernel.org/r/20251020130125.2875164-2-yanquanmin1@huawei.com Fixes: 2e0fe9245d6b ("mm/damon/lru_sort: support addr_unit for DAMON_LRU_SORT") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- mm/damon/core.c | 6 ++++-- mm/damon/lru_sort.c | 3 ++- mm/damon/reclaim.c | 3 ++- mm/damon/stat.c | 3 ++- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 0edf41d36ea1..9ee026c2db53 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -961,7 +961,8 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end); + unsigned long *start, unsigned long *end, + unsigned long min_sz_region); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index a9c11d2d37b0..82546d138a5a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2818,6 +2818,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. + * @min_sz_region: Minimum region size. * * This function sets the region of @t as requested by @start and @end. 
If the * values of @start and @end are zero, however, this function finds the biggest @@ -2828,7 +2829,8 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * Return: 0 on success, negative error code otherwise. */ int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end) + unsigned long *start, unsigned long *end, + unsigned long min_sz_region) { struct damon_addr_range addr_range; @@ -2841,7 +2843,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION); + return damon_set_regions(t, &addr_range, 1, min_sz_region); } /* diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 42b9a656f9de..49b4bc294f4e 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -242,7 +242,8 @@ static int damon_lru_sort_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, - &monitor_region_end); + &monitor_region_end, + param_ctx->min_sz_region); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 7ba3d0f9a19a..e30811cafe90 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -250,7 +250,8 @@ static int damon_reclaim_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, - &monitor_region_end); + &monitor_region_end, + DAMON_MIN_REGION); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/stat.c b/mm/damon/stat.c index bf8626859902..ed8e3629d31a 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -188,7 +188,8 @@ static struct damon_ctx *damon_stat_build_ctx(void) if (!target) goto free_out; damon_add_target(ctx, target); - if (damon_set_region_biggest_system_ram_default(target, &start, &end)) + if (damon_set_region_biggest_system_ram_default(target, &start, &end, + ctx->min_sz_region)) goto free_out; return ctx; free_out: From dfc02531f413bf18f5e0ac79a52d2af6f69e99c3 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Mon, 20 Oct 2025 21:01:25 +0800 Subject: [PATCH 071/321] mm/damon/reclaim: use min_sz_region for core address alignment when setting regions When setting regions in DAMON_RECLAIM, DAMON_MIN_REGION will be applied as the core address alignment, and the monitoring target address ranges would be aligned on DAMON_MIN_REGION * addr_unit. When users 1) set addr_unit to a value larger than 1, and 2) set the monitoring target address range as not aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_RECLAIM to operate on unexpectedly large physical address ranges. For example, if the user sets the monitoring target address range to [4, 8) and addr_unit as 1024, the aimed monitoring target address range is [4 KiB, 8 KiB). Assuming DAMON_MIN_REGION is 4096, so resulting target address range will be [0, 4096) in the DAMON core layer address system, and [0, 4 MiB) in the physical address space, which is an unexpected range. To fix the issue, use min_sz_region for core address alignment when setting regions. 
Link: https://lkml.kernel.org/r/20251020130125.2875164-3-yanquanmin1@huawei.com Fixes: 7db551fcfb2a ("mm/damon/reclaim: support addr_unit for DAMON_RECLAIM") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e30811cafe90..36a582e09eae 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -251,7 +251,7 @@ static int damon_reclaim_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, - DAMON_MIN_REGION); + param_ctx->min_sz_region); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); From ab3c8e7b8687a26eacb95ba343de5dad4fb95880 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:18 +0100 Subject: [PATCH 072/321] mm/shmem: update shmem to use mmap_prepare

Patch series "expand mmap_prepare functionality, port more users", v5.

Since commit c84bf6dd2b83 ("mm: introduce new .mmap_prepare() file callback"), the f_op->mmap hook has been deprecated in favour of f_op->mmap_prepare.

This was introduced in order to make it possible for us to eventually eliminate the f_op->mmap hook, which is highly problematic as it allows drivers and filesystems raw access to a VMA which is not yet correctly initialised. This hook also introduced complexity for the memory mapping operation, as we must correctly unwind what we do should an error arise.

Overall, this interface being so open has caused significant problems for us, including security issues; it is important for us to simply eliminate it as a source of problems.

Therefore this series continues what was established by extending the functionality further to permit more drivers and filesystems to use mmap_prepare.

We start by updating some existing users who can use the mmap_prepare functionality as-is.

We then introduce the concept of an mmap 'action', which a user, on mmap_prepare, can request to be performed upon the VMA:

* Nothing - default, we're done
* Remap PFN - perform PFN remap with specified parameters
* I/O remap PFN - perform I/O PFN remap with specified parameters

By setting the action in mmap_prepare, this allows us to dynamically decide what to do next, so if a driver/filesystem needs to determine whether to e.g. remap or use a mixed map, it can do so and then change which is done. This significantly expands the capabilities of the mmap_prepare hook, while maintaining as much control as possible in the mm logic.

We split the [io_]remap_pfn_range*() functions, which perform PFN remap (a typical mapping prepopulation operation), between a prepare/complete step, as well as io_remap_pfn_range_prepare, complete for a similar purpose.

From there we update various mm-adjacent logic to use this functionality as a first set of changes.

We also add success and error hooks for post-action processing, e.g. to output a debug log on success and to filter error codes.

This patch (of 15): This simply assigns the vm_ops, so it is easily updated - do so.
Link: https://lkml.kernel.org/r/cover.1760959441.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/7b93b1e89028e39507dac5ca01991e1374d5bbe8.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Reviewed-by: David Hildenbrand Reviewed-by: Jan Kara Reviewed-by: Jason Gunthorpe Reviewed-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/shmem.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 0eecb486a0cb..8b9fcdd144c8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2925,16 +2925,17 @@ out_nomem: return retval; } -static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +static int shmem_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ if (inode->i_nlink) - vma->vm_ops = &shmem_vm_ops; + desc->vm_ops = &shmem_vm_ops; else - vma->vm_ops = &shmem_anon_vm_ops; + desc->vm_ops = &shmem_anon_vm_ops; return 0; } @@ -5204,7 +5205,7 @@ static const struct address_space_operations shmem_aops = { }; static const struct file_operations shmem_file_operations = { - .mmap = shmem_mmap, + .mmap_prepare = shmem_mmap_prepare, .open = shmem_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS From 8e18a7f43557f5caea18dfd82a24b6839ab0a956 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:19 +0100 Subject: [PATCH 073/321] device/dax: update devdax to use mmap_prepare The devdax driver does nothing special in its f_op->mmap hook, so straightforwardly update it to use the mmap_prepare hook instead. Link: https://lkml.kernel.org/r/1e8665d052ac8cf2f7ff92b6c7862614f7fd306c.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Jan Kara Reviewed-by: Jason Gunthorpe Acked-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David S. 
Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/dax/device.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 7f1ed0db8337..22999a402e02 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -13,8 +13,9 @@ #include "dax-private.h" #include "bus.h" -static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, - const char *func) +static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags, + unsigned long start, unsigned long end, struct file *file, + const char *func) { struct device *dev = &dev_dax->dev; unsigned long mask; @@ -23,7 +24,7 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, return -ENXIO; /* prevent private mappings from being established */ - if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { + if ((vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { dev_info_ratelimited(dev, "%s: %s: fail, attempted private mapping\n", current->comm, func); @@ -31,15 +32,15 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, } mask = dev_dax->align - 1; - if (vma->vm_start & mask || vma->vm_end & mask) { + if (start & mask || end & mask) { dev_info_ratelimited(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", - current->comm, func, vma->vm_start, vma->vm_end, + current->comm, func, start, end, mask); return -EINVAL; } - if (!vma_is_dax(vma)) { + if (!file_is_dax(file)) { dev_info_ratelimited(dev, "%s: %s: fail, vma is not DAX capable\n", current->comm, func); @@ -49,6 +50,13 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, return 0; } +static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, + const char *func) +{ + return __check_vma(dev_dax, vma->vm_flags, vma->vm_start, vma->vm_end, + vma->vm_file, func); +} + /* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size) @@ -285,8 +293,9 @@ static const struct vm_operations_struct dax_vm_ops = { .pagesize = dev_dax_pagesize, }; -static int dax_mmap(struct file *filp, struct vm_area_struct *vma) +static int dax_mmap_prepare(struct vm_area_desc *desc) { + struct file *filp = desc->file; struct dev_dax *dev_dax = filp->private_data; int rc, id; @@ -297,13 +306,14 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma) * fault time. 
*/ id = dax_read_lock(); - rc = check_vma(dev_dax, vma, __func__); + rc = __check_vma(dev_dax, desc->vm_flags, desc->start, desc->end, filp, + __func__); dax_read_unlock(id); if (rc) return rc; - vma->vm_ops = &dax_vm_ops; - vm_flags_set(vma, VM_HUGEPAGE); + desc->vm_ops = &dax_vm_ops; + desc->vm_flags |= VM_HUGEPAGE; return 0; } @@ -376,7 +386,7 @@ static const struct file_operations dax_fops = { .open = dax_open, .release = dax_release, .get_unmapped_area = dax_get_unmapped_area, - .mmap = dax_mmap, + .mmap_prepare = dax_mmap_prepare, .fop_flags = FOP_MMAP_SYNC, }; From cf1d98f44d056d7114554beb78665c20d1ed244a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:20 +0100 Subject: [PATCH 074/321] mm/vma: remove unused function, make internal functions static unlink_file_vma() is not used by anything, so remove it. vma_link() and vma_link_file() are only used within mm/vma.c, so make them static. Link: https://lkml.kernel.org/r/f2ab9ea051225a02e6d1d45a7608f4e149220117.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/vma.c | 21 ++------------------- mm/vma.h | 6 ------ 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/mm/vma.c b/mm/vma.c index 9127eaeea93f..004958a085cb 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1754,24 +1754,7 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) unlink_file_vma_batch_process(vb); } -/* - * Unlink a file-based vm structure from its interval tree, to hide - * vma from rmap and vmtruncate before freeing its page tables. 
- */ -void unlink_file_vma(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - - if (file) { - struct address_space *mapping = file->f_mapping; - - i_mmap_lock_write(mapping); - __remove_shared_vm_struct(vma, mapping); - i_mmap_unlock_write(mapping); - } -} - -void vma_link_file(struct vm_area_struct *vma) +static void vma_link_file(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct address_space *mapping; @@ -1784,7 +1767,7 @@ void vma_link_file(struct vm_area_struct *vma) } } -int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) +static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); diff --git a/mm/vma.h b/mm/vma.h index 9183fe549009..e912d42c428a 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -312,12 +312,6 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma); -void unlink_file_vma(struct vm_area_struct *vma); - -void vma_link_file(struct vm_area_struct *vma); - -int vma_link(struct mm_struct *mm, struct vm_area_struct *vma); - struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); From 54c58a2f5fa191839cf192fa4ebab39395272a3e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:21 +0100 Subject: [PATCH 075/321] mm: add vma_desc_size(), vma_desc_pages() helpers It's useful to be able to determine the size of a VMA descriptor range used on f_op->mmap_prepare, expressed both in bytes and pages, so add helpers for both and update code that could make use of it to do so. Link: https://lkml.kernel.org/r/74ef338203c9ff08a9ace73a8f1f6116a79112a0.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/ntfs3/file.c | 2 +- include/linux/mm.h | 10 ++++++++++ mm/secretmem.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 4c90ec2fa2ea..2f344e1ed756 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -332,7 +332,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) if (rw) { u64 to = min_t(loff_t, i_size_read(inode), - from + desc->end - desc->start); + from + vma_desc_size(desc)); if (is_sparsed(ni)) { /* Allocate clusters for rw map. 
*/ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c79b3369b82..5752b0c516f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3583,6 +3583,16 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) +{ + return desc->end - desc->start; +} + +static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) +{ + return vma_desc_size(desc) >> PAGE_SHIFT; +} + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) diff --git a/mm/secretmem.c b/mm/secretmem.c index 9b0f5d9ec6f4..37f6d1097853 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -120,7 +120,7 @@ static int secretmem_release(struct inode *inode, struct file *file) static int secretmem_mmap_prepare(struct vm_area_desc *desc) { - const unsigned long len = desc->end - desc->start; + const unsigned long len = vma_desc_size(desc); if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; From 651fdda8406d42a5004c5c79f269540a85d6a1ab Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:22 +0100 Subject: [PATCH 076/321] relay: update relay to use mmap_prepare It is relatively trivial to update this code to use the f_op->mmap_prepare hook in favour of the deprecated f_op->mmap hook, so do so. Link: https://lkml.kernel.org/r/7c9e82cdddf8b573ea3edb8cdb697363e3ccb5d7.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/relay.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/relay.c b/kernel/relay.c index 8d915fe98198..e36f6b926f7f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -72,17 +72,18 @@ static void relay_free_page_array(struct page **array) } /** - * relay_mmap_buf: - mmap channel buffer to process address space - * @buf: relay channel buffer - * @vma: vm_area_struct describing memory to be mapped + * relay_mmap_prepare_buf: - mmap channel buffer to process address space + * @buf: the relay channel buffer + * @desc: describing what to map * * Returns 0 if ok, negative on error * * Caller should already have grabbed mmap_lock. 
*/ -static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) +static int relay_mmap_prepare_buf(struct rchan_buf *buf, + struct vm_area_desc *desc) { - unsigned long length = vma->vm_end - vma->vm_start; + unsigned long length = vma_desc_size(desc); if (!buf) return -EBADF; @@ -90,9 +91,9 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) if (length != (unsigned long)buf->chan->alloc_size) return -EINVAL; - vma->vm_ops = &relay_file_mmap_ops; - vm_flags_set(vma, VM_DONTEXPAND); - vma->vm_private_data = buf; + desc->vm_ops = &relay_file_mmap_ops; + desc->vm_flags |= VM_DONTEXPAND; + desc->private_data = buf; return 0; } @@ -749,16 +750,16 @@ static int relay_file_open(struct inode *inode, struct file *filp) } /** - * relay_file_mmap - mmap file op for relay files - * @filp: the file - * @vma: the vma describing what to map + * relay_file_mmap_prepare - mmap file op for relay files + * @desc: describing what to map * - * Calls upon relay_mmap_buf() to map the file into user space. + * Calls upon relay_mmap_prepare_buf() to map the file into user space. */ -static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) +static int relay_file_mmap_prepare(struct vm_area_desc *desc) { - struct rchan_buf *buf = filp->private_data; - return relay_mmap_buf(buf, vma); + struct rchan_buf *buf = desc->file->private_data; + + return relay_mmap_prepare_buf(buf, desc); } /** @@ -1006,7 +1007,7 @@ static ssize_t relay_file_read(struct file *filp, const struct file_operations relay_file_operations = { .open = relay_file_open, .poll = relay_file_poll, - .mmap = relay_file_mmap, + .mmap_prepare = relay_file_mmap_prepare, .read = relay_file_read, .release = relay_file_release, }; From 2bcd9207dedc585bae33ad7b337f2232a8a11da8 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:23 +0100 Subject: [PATCH 077/321] mm/vma: rename __mmap_prepare() function to avoid confusion Now we have the f_op->mmap_prepare() hook, having a static function called __mmap_prepare() that has nothing to do with it is confusing, so rename the function to __mmap_setup(). Link: https://lkml.kernel.org/r/d25a22c60ca0f04091697ef9cda0d72ce0cf8af3.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David S. 
Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/vma.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vma.c b/mm/vma.c index 004958a085cb..eb2f711c03a1 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2312,7 +2312,7 @@ static void update_ksm_flags(struct mmap_state *map) } /* - * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be + * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * @@ -2321,7 +2321,7 @@ static void update_ksm_flags(struct mmap_state *map) * * Returns: 0 on success, error code otherwise. */ -static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) +static int __mmap_setup(struct mmap_state *map, struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; @@ -2632,7 +2632,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, map.check_ksm_early = can_set_ksm_flags_early(&map); - error = __mmap_prepare(&map, uf); + error = __mmap_setup(&map, uf); if (!error && have_mmap_prepare) error = call_mmap_prepare(&map); if (error) @@ -2662,7 +2662,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, return addr; - /* Accounting was done by __mmap_prepare(). */ + /* Accounting was done by __mmap_setup(). */ unacct_error: if (map.charged) vm_unacct_memory(map.charged); From 51e38e7d40d617965504f4dcba569ecf9302f245 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:24 +0100 Subject: [PATCH 078/321] mm: add remap_pfn_range_prepare(), remap_pfn_range_complete() We need the ability to split PFN remap between updating the VMA and performing the actual remap, in order to do away with the legacy f_op->mmap hook. To do so, update the PFN remap code to provide shared logic, and also make remap_pfn_range_notrack() static, as its one user, io_mapping_map_user() was removed in commit 9a4f90e24661 ("mm: remove mm/io-mapping.c"). Then, introduce remap_pfn_range_prepare(), which accepts VMA descriptor and PFN parameters, and remap_pfn_range_complete() which accepts the same parameters as remap_pfn_rangte(). remap_pfn_range_prepare() will set the cow vma->vm_pgoff if necessary, so it must be supplied with a correct PFN to do so. While we're here, also clean up the duplicated #ifdef __HAVE_PFNMAP_TRACKING check and put into a single #ifdef/#else block. We keep these internal to mm as they should only be used by internal helpers. 
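To make the intended split concrete, below is a rough sketch of how an mm-internal helper might use the two halves. The example_* wrappers, the explicit VM_REMAP_FLAGS assignment, and the use of vma->vm_page_prot are illustrative assumptions; only the remap_pfn_range_prepare()/remap_pfn_range_complete() signatures come from this patch:

/*
 * Hypothetical mm-internal caller, for illustration only.
 *
 * At mmap_prepare time there is no VMA yet, only the descriptor, so only
 * the preparation step (e.g. fixing up the CoW vm_pgoff) can happen here.
 */
static void example_pfnmap_prepare(struct vm_area_desc *desc, unsigned long pfn)
{
	/* Assumption: the caller sets the usual PFN-remap VMA flags. */
	desc->vm_flags |= VM_REMAP_FLAGS;
	remap_pfn_range_prepare(desc, pfn);
}

/*
 * Later, once the VMA has actually been created and inserted, the remap
 * itself is performed with the same parameters remap_pfn_range() takes.
 */
static int example_pfnmap_complete(struct vm_area_struct *vma, unsigned long pfn)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	return remap_pfn_range_complete(vma, vma->vm_start, pfn, size,
					vma->vm_page_prot);
}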
Link: https://lkml.kernel.org/r/75b55de63249b3aa0fd5b3b08ed1d3ff19255d0d.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Acked-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++++-- mm/internal.h | 4 ++ mm/memory.c | 132 ++++++++++++++++++++++++++++++--------------- 3 files changed, 110 insertions(+), 48 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5752b0c516f2..ca5565f4fac4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -489,6 +489,21 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. 
+ */ +#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) + /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) @@ -3634,10 +3649,9 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); -int remap_pfn_range(struct vm_area_struct *, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t); -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot); +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); diff --git a/mm/internal.h b/mm/internal.h index 56a9a714709a..5ca1e7842b19 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1677,4 +1677,8 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory.c b/mm/memory.c index f13b20b702f6..8e02b8d75535 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2900,6 +2900,25 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } +static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr, + unsigned long end, unsigned long vm_start, unsigned long vm_end, + unsigned long pfn, pgoff_t *vm_pgoff_p) +{ + /* + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. + */ + if (is_cow_mapping(vm_flags)) { + if (addr != vm_start || end != vm_end) + return -EINVAL; + *vm_pgoff_p = pfn; + } + + return 0; +} + static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { @@ -2912,31 +2931,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - /* - * Physically remapped pages are special. Tell the - * rest of the world about it: - * VM_IO tells people not to look at these pages - * (accesses can have side effects). - * VM_PFNMAP tells the core MM that the base pages are just - * raw PFN mappings, and do not have a "struct page" associated - * with them. - * VM_DONTEXPAND - * Disable vma merging and expanding with mremap(). - * VM_DONTDUMP - * Omit vma from core dump, even when VM_IO turned off. - * - * There's a horrible special case to handle copy-on-write - * behaviour that some programs depend on. We mark the "original" - * un-COW'ed pages by matching them up with "vma->vm_pgoff". - * See vm_normal_page() for details. 
- */ - if (is_cow_mapping(vma->vm_flags)) { - if (addr != vma->vm_start || end != vma->vm_end) - return -EINVAL; - vma->vm_pgoff = pfn; - } - - vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -2957,7 +2952,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad * Variant of remap_pfn_range that does not call track_pfn_remap. The caller * must have pre-validated the caching bits of the pgprot_t. */ -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); @@ -3002,23 +2997,9 @@ void pfnmap_track_ctx_release(struct kref *ref) pfnmap_untrack(ctx->pfn, ctx->size); kfree(ctx); } -#endif /* __HAVE_PFNMAP_TRACKING */ -/** - * remap_pfn_range - remap kernel memory to userspace - * @vma: user vma to map to - * @addr: target page aligned user address to start at - * @pfn: page frame number of kernel physical memory address - * @size: size of mapping area - * @prot: page protection flags for this mapping - * - * Note: this is only safe if the mm semaphore is held when called. - * - * Return: %0 on success, negative error code otherwise. - */ -#ifdef __HAVE_PFNMAP_TRACKING -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { struct pfnmap_track_ctx *ctx = NULL; int err; @@ -3054,15 +3035,78 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, return err; } +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_track(vma, addr, pfn, size, prot); +} #else -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range_notrack(vma, addr, pfn, size, prot); } #endif + +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ + /* + * We set addr=VMA start, end=VMA end here, so this won't fail, but we + * check it again on complete and will fail there if specified addr is + * invalid. 
+ */ + get_remap_pgoff(desc->vm_flags, desc->start, desc->end, + desc->start, desc->end, pfn, &desc->pgoff); + desc->vm_flags |= VM_REMAP_FLAGS; +} + +static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size) +{ + unsigned long end = addr + PAGE_ALIGN(size); + int err; + + err = get_remap_pgoff(vma->vm_flags, addr, end, + vma->vm_start, vma->vm_end, + pfn, &vma->vm_pgoff); + if (err) + return err; + + vm_flags_set(vma, VM_REMAP_FLAGS); + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target page aligned user address to start at + * @pfn: page frame number of kernel physical memory address + * @size: size of mapping area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. + */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int err; + + err = remap_pfn_range_prepare_vma(vma, addr, pfn, size); + if (err) + return err; + + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} EXPORT_SYMBOL(remap_pfn_range); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} + /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to From c707a68f9468e4ef4a3546b636a9dd088fe7b7f1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:25 +0100 Subject: [PATCH 079/321] mm: abstract io_remap_pfn_range() based on PFN The only instances in which we customise this function are ones in which we customise the PFN used. Instances where architectures were not passing the pgprot value through pgprot_decrypted() are ones where pgprot_decrypted() was a no-op anyway, so we can simply always pass pgprot through this function. Use this fact to simplify the use of io_remap_pfn_range(), by abstracting the PFN via io_remap_pfn_range_pfn() and using this instead of providing a general io_remap_pfn_range() function per-architecture. Link: https://lkml.kernel.org/r/d086191bf431b58ce3b231b4f4f555d080f60327.1760959442.git.lorenzo.stoakes@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/csky/include/asm/pgtable.h | 3 --- arch/mips/alchemy/common/setup.c | 9 +++++---- arch/mips/include/asm/pgtable.h | 5 ++--- arch/sparc/include/asm/pgtable_32.h | 12 ++++-------- arch/sparc/include/asm/pgtable_64.h | 12 ++++-------- include/linux/mm.h | 19 ++++++++++++++----- 6 files changed, 29 insertions(+), 31 deletions(-) diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 5a394be09c35..d606afbabce1 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -263,7 +263,4 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) -#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ - remap_pfn_range(vma, vaddr, pfn, size, prot) - #endif /* __ASM_CSKY_PGTABLE_H */ diff --git a/arch/mips/alchemy/common/setup.c b/arch/mips/alchemy/common/setup.c index a7a6d31a7a41..c35b4f809d51 100644 --- a/arch/mips/alchemy/common/setup.c +++ b/arch/mips/alchemy/common/setup.c @@ -94,12 +94,13 @@ phys_addr_t fixup_bigphys_addr(phys_addr_t phys_addr, phys_addr_t size) return phys_addr; } -int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { phys_addr_t phys_addr = fixup_bigphys_addr(pfn << PAGE_SHIFT, size); - return remap_pfn_range(vma, vaddr, phys_addr >> PAGE_SHIFT, size, prot); + return phys_addr >> PAGE_SHIFT; } -EXPORT_SYMBOL(io_remap_pfn_range); +EXPORT_SYMBOL(io_remap_pfn_range_pfn); + #endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index ae73ecf4c41a..9c06a612d33a 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -604,9 +604,8 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, */ #ifdef CONFIG_MIPS_FIXUP_BIGPHYS_ADDR phys_addr_t fixup_bigphys_addr(phys_addr_t addr, phys_addr_t size); -int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr, - unsigned long pfn, unsigned long size, pgprot_t prot); -#define io_remap_pfn_range io_remap_pfn_range +unsigned long io_remap_pfn_range_pfn(unsigned long pfn, unsigned long size); +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn #else #define fixup_bigphys_addr(addr, size) (addr) #endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */ diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index f1538a48484a..a9f802d1dd64 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -395,12 +395,8 @@ __get_iospace (unsigned long addr) #define GET_IOSPACE(pfn) (pfn >> (BITS_PER_LONG - 4)) #define GET_PFN(pfn) (pfn & 0x0fffffffUL) -int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, - 
unsigned long, pgprot_t); - -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { unsigned long long offset, space, phys_base; @@ -408,9 +404,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, space = GET_IOSPACE(pfn); phys_base = offset | (space << 32ULL); - return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot); + return phys_base >> PAGE_SHIFT; } -#define io_remap_pfn_range io_remap_pfn_range +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 64b85ff9c766..615f460c50af 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1048,9 +1048,6 @@ int page_in_phys_avail(unsigned long paddr); #define GET_IOSPACE(pfn) (pfn >> (BITS_PER_LONG - 4)) #define GET_PFN(pfn) (pfn & 0x0fffffffffffffffUL) -int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, - unsigned long, pgprot_t); - void adi_restore_tags(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte); @@ -1084,9 +1081,8 @@ static inline int arch_unmap_one(struct mm_struct *mm, return 0; } -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; int space = GET_IOSPACE(pfn); @@ -1094,9 +1090,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, phys_base = offset | (((unsigned long) space) << 32UL); - return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot); + return phys_base >> PAGE_SHIFT; } -#define io_remap_pfn_range io_remap_pfn_range +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn static inline unsigned long __untagged_addr(unsigned long start) { diff --git a/include/linux/mm.h b/include/linux/mm.h index ca5565f4fac4..4441ceec913f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3684,15 +3684,24 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -#ifndef io_remap_pfn_range -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, - unsigned long size, pgprot_t prot) +#ifndef io_remap_pfn_range_pfn +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { - return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); + return pfn; } #endif +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long orig_pfn, + unsigned long size, pgprot_t orig_prot) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + const pgprot_t prot = pgprot_decrypted(orig_prot); + + return remap_pfn_range(vma, addr, pfn, size, prot); +} + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) From db91b783290e395443151b0fe4b8dc32aceebef5 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:26 +0100 Subject: [PATCH 080/321] mm: introduce io_remap_pfn_range_[prepare, complete]() We introduce the io_remap*() equivalents of 
remap_pfn_range_prepare() and remap_pfn_range_complete() to allow for I/O remapping via mmap_prepare. Make these internal to mm, as they should only be used by internal helpers. Link: https://lkml.kernel.org/r/4065134f13a24a3e14691b7443bcee7490b18a5c.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/internal.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mm/internal.h b/mm/internal.h index 5ca1e7842b19..473b29ddf85d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1681,4 +1681,22 @@ void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn); int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t pgprot); +static inline void io_remap_pfn_range_prepare(struct vm_area_desc *desc, + unsigned long orig_pfn, unsigned long size) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + + return remap_pfn_range_prepare(desc, pfn); +} + +static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma, + unsigned long addr, unsigned long orig_pfn, unsigned long size, + pgprot_t orig_prot) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + const pgprot_t prot = pgprot_decrypted(orig_prot); + + return remap_pfn_range_complete(vma, addr, pfn, size, prot); +} + #endif /* __MM_INTERNAL_H */ From ac0a3fc9c07df79dc8a4ce9d274df00afc7bf12d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:27 +0100 Subject: [PATCH 081/321] mm: add ability to take further action in vm_area_desc Some drivers/filesystems need to perform additional tasks after the VMA is set up. This is typically in the form of pre-population. The forms of pre-population most likely to be performed are a PFN remap or the insertion of normal folios and PFNs into a mixed map. We start by implementing the PFN remap functionality, ensuring that we perform the appropriate actions at the appropriate time - that is setting flags at the point of .mmap_prepare, and performing the actual remap at the point at which the VMA is fully established. This prevents the driver from doing anything too crazy with a VMA at any stage, and we retain complete control over how the mm functionality is applied. Unfortunately callers still do often require some kind of custom action, so we add an optional success/error _hook to allow the caller to do something after the action has succeeded or failed. This is done at the point when the VMA has already been established, so the harm that can be done is limited. 
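As a rough sketch of how a driver-side .mmap_prepare hook might use this (the mydrv_*() names are invented for illustration and do not appear in this series):

    static int mydrv_post_mmap(const struct vm_area_struct *vma)
    {
            /* Runs only after the remap succeeded; the VMA write lock is still held. */
            return 0;
    }

    static int mydrv_mmap_prepare(struct vm_area_desc *desc)
    {
            /* Ask core mm to PFN-remap the whole VMA once it is established. */
            mmap_action_remap_full(desc, mydrv_base_pfn(desc));
            desc->action.success_hook = mydrv_post_mmap;
            return 0;
    }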
The error hook can be used to filter errors if necessary. There may be cases in which the caller absolutely must hold the file rmap lock until the operation is entirely complete. It is an edge case, but certainly the hugetlbfs mmap hook requires it. To accommodate this, we add the hide_from_rmap_until_complete flag to the mmap_action type. In this case, if a new VMA is allocated, we will hold the file rmap lock until the operation is entirely completed (including any success/error hooks). Note that we do not need to update __compat_vma_mmap() to accommodate this flag, as this function will be invoked from an .mmap handler whose VMA is not yet visible, so we implicitly hide it from the rmap. If any error arises on these final actions, we simply unmap the VMA altogether. Also update the stacked filesystem compatibility layer to utilise the action behaviour, and update the VMA tests accordingly. While we're here, rename __compat_vma_mmap_prepare() to __compat_vma_mmap() as we are now performing actions invoked by the mmap_prepare in addition to just the mmap_prepare hook. Link: https://lkml.kernel.org/r/2601199a7b2eaeadfcd8ab6e199c6d1706650c94.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/fs.h | 6 +- include/linux/mm.h | 74 ++++++++++++++++ include/linux/mm_types.h | 53 +++++++++++ mm/util.c | 146 ++++++++++++++++++++++++++++--- mm/vma.c | 113 ++++++++++++++++++------ tools/testing/vma/vma_internal.h | 98 +++++++++++++++++++-- 6 files changed, 441 insertions(+), 49 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..8cf9547a881c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2393,14 +2393,14 @@ static inline bool can_mmap_file(struct file *file) return true; } -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma); -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4441ceec913f..2d060081caa5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3608,6 +3608,80 @@ static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) return vma_desc_size(desc) >> PAGE_SHIFT; } +/** + * 
mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN + * remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_remap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + /* [start, start + size) must be within the VMA. */ + WARN_ON_ONCE(start < desc->start || start >= desc->end); + WARN_ON_ONCE(start + size > desc->end); + + action->type = MMAP_REMAP_PFN; + action->remap.start = start; + action->remap.start_pfn = start_pfn; + action->remap.size = size; + action->remap.pgprot = desc->page_prot; +} + +/** + * mmap_action_remap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_remap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +/** + * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN + * I/O remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_ioremap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + mmap_action_remap(desc, start, start_pfn, size); + desc->action.type = MMAP_IO_REMAP_PFN; +} + +/** + * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN I/O remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc); +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma); + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 90e5790c318f..5021047485a9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -773,6 +773,56 @@ struct pfnmap_track_ctx { }; #endif +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. 
*/ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -796,6 +846,9 @@ struct vm_area_desc { /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? */ + struct mmap_action action; }; /* diff --git a/mm/util.c b/mm/util.c index 8989d5767528..97cae40c0209 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1135,7 +1135,7 @@ EXPORT_SYMBOL(flush_dcache_folio); #endif /** - * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare() + * __compat_vma_mmap() - See description for compat_vma_mmap() * for details. This is the same operation, only with a specific file operations * struct which may or may not be the same as vma->vm_file->f_op. * @f_op: The file operations whose .mmap_prepare() hook is specified. @@ -1143,7 +1143,7 @@ EXPORT_SYMBOL(flush_dcache_folio); * @vma: The VMA to apply the .mmap_prepare() hook to. * Returns: 0 on success or error. */ -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { @@ -1156,21 +1156,24 @@ int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -EXPORT_SYMBOL(__compat_vma_mmap_prepare); +EXPORT_SYMBOL(__compat_vma_mmap); /** - * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an - * existing VMA. + * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an + * existing VMA and execute any requested actions. * @file: The file which possesss an f_op->mmap_prepare() hook. * @vma: The VMA to apply the .mmap_prepare() hook to. * @@ -1185,7 +1188,7 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare); * .mmap_prepare() hook, as we are in a different context when we invoke the * .mmap() hook, already having a VMA to deal with. * - * compat_vma_mmap_prepare() is a compatibility function that takes VMA state, + * compat_vma_mmap() is a compatibility function that takes VMA state, * establishes a struct vm_area_desc descriptor, passes to the underlying * .mmap_prepare() hook and applies any changes performed by it. 
* @@ -1194,11 +1197,11 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare); * * Returns: 0 on success or error. */ -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } -EXPORT_SYMBOL(compat_vma_mmap_prepare); +EXPORT_SYMBOL(compat_vma_mmap); static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) @@ -1280,6 +1283,127 @@ again: } } +static int mmap_action_finish(struct mmap_action *action, + const struct vm_area_struct *vma, int err) +{ + /* + * If an error occurs, unmap the VMA altogether and return an error. We + * only clear the newly allocated VMA, since this function is only + * invoked if we do NOT merge, so we only clean up the VMA we created. + */ + if (err) { + const size_t len = vma_pages(vma) << PAGE_SHIFT; + + do_munmap(current->mm, vma->vm_start, len, NULL); + + if (action->error_hook) { + /* We may want to filter the error. */ + err = action->error_hook(err); + + /* The caller should not clear the error. */ + VM_WARN_ON_ONCE(!err); + } + return err; + } + + if (action->success_hook) + return action->success_hook(vma); + + return 0; +} + +#ifdef CONFIG_MMU +/** + * mmap_action_prepare - Perform preparatory setup for an VMA descriptor + * action which need to be performed. + * @desc: The VMA descriptor to prepare for @action. + * @action: The action to perform. + */ +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + remap_pfn_range_prepare(desc, action->remap.start_pfn); + break; + case MMAP_IO_REMAP_PFN: + io_remap_pfn_range_prepare(desc, action->remap.start_pfn, + action->remap.size); + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +/** + * mmap_action_complete - Execute VMA descriptor action. + * @action: The action to perform. + * @vma: The VMA to perform the action upon. + * + * Similar to mmap_action_prepare(). + * + * Return: 0 on success, or error, at which point the VMA will be unmapped. + */ +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + err = remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + case MMAP_IO_REMAP_PFN: + err = io_remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#else +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle these. */ + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle this. 
*/ + + err = -EINVAL; + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#endif + #ifdef CONFIG_MMU /** * folio_pte_batch - detect a PTE batch for a large folio diff --git a/mm/vma.c b/mm/vma.c index eb2f711c03a1..919d1fc63a52 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -34,7 +34,9 @@ struct mmap_state { struct maple_tree mt_detach; /* Determine if we can check KSM flags early in mmap() logic. */ - bool check_ksm_early; + bool check_ksm_early :1; + /* If we map new, hold the file rmap lock on mapping. */ + bool hold_file_rmap_lock :1; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ @@ -1754,7 +1756,7 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) unlink_file_vma_batch_process(vb); } -static void vma_link_file(struct vm_area_struct *vma) +static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock) { struct file *file = vma->vm_file; struct address_space *mapping; @@ -1763,7 +1765,8 @@ static void vma_link_file(struct vm_area_struct *vma) mapping = file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); + if (!hold_rmap_lock) + i_mmap_unlock_write(mapping); } } @@ -1777,7 +1780,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) vma_start_write(vma); vma_iter_store_new(&vmi, vma); - vma_link_file(vma); + vma_link_file(vma, /* hold_rmap_lock= */false); mm->map_count++; validate_mm(mm); return 0; @@ -2311,17 +2314,33 @@ static void update_ksm_flags(struct mmap_state *map) map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); } +static void set_desc_from_map(struct vm_area_desc *desc, + const struct mmap_state *map) +{ + desc->start = map->addr; + desc->end = map->end; + + desc->pgoff = map->pgoff; + desc->vm_file = map->file; + desc->vm_flags = map->vm_flags; + desc->page_prot = map->page_prot; +} + /* * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * + * As a result it sets up the @map and @desc objects. + * * @map: Mapping state. + * @desc: VMA descriptor * @uf: Userfaultfd context list. * * Returns: 0 on success, error code otherwise. */ -static int __mmap_setup(struct mmap_state *map, struct list_head *uf) +static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, + struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; @@ -2378,6 +2397,7 @@ static int __mmap_setup(struct mmap_state *map, struct list_head *uf) */ vms_clean_up_area(vms, &map->mas_detach); + set_desc_from_map(desc, map); return 0; } @@ -2479,7 +2499,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; - vma_link_file(vma); + vma_link_file(vma, map->hold_file_rmap_lock); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below @@ -2539,6 +2559,17 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } +static void call_action_prepare(struct mmap_state *map, + struct vm_area_desc *desc) +{ + struct mmap_action *action = &desc->action; + + mmap_action_prepare(action, desc); + + if (action->hide_from_rmap_until_complete) + map->hold_file_rmap_lock = true; +} + /* * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that * specifies it. 
@@ -2550,34 +2581,26 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) * * Returns 0 on success, or an error code otherwise. */ -static int call_mmap_prepare(struct mmap_state *map) +static int call_mmap_prepare(struct mmap_state *map, + struct vm_area_desc *desc) { int err; - struct vm_area_desc desc = { - .mm = map->mm, - .file = map->file, - .start = map->addr, - .end = map->end, - - .pgoff = map->pgoff, - .vm_file = map->file, - .vm_flags = map->vm_flags, - .page_prot = map->page_prot, - }; /* Invoke the hook. */ - err = vfs_mmap_prepare(map->file, &desc); + err = vfs_mmap_prepare(map->file, desc); if (err) return err; + call_action_prepare(map, desc); + /* Update fields permitted to be changed. */ - map->pgoff = desc.pgoff; - map->file = desc.vm_file; - map->vm_flags = desc.vm_flags; - map->page_prot = desc.page_prot; + map->pgoff = desc->pgoff; + map->file = desc->vm_file; + map->vm_flags = desc->vm_flags; + map->page_prot = desc->page_prot; /* User-defined fields. */ - map->vm_ops = desc.vm_ops; - map->vm_private_data = desc.private_data; + map->vm_ops = desc->vm_ops; + map->vm_private_data = desc->private_data; return 0; } @@ -2619,22 +2642,48 @@ static bool can_set_ksm_flags_early(struct mmap_state *map) return false; } +static int call_action_complete(struct mmap_state *map, + struct vm_area_desc *desc, + struct vm_area_struct *vma) +{ + struct mmap_action *action = &desc->action; + int ret; + + ret = mmap_action_complete(action, vma); + + /* If we held the file rmap we need to release it. */ + if (map->hold_file_rmap_lock) { + struct file *file = vma->vm_file; + + i_mmap_unlock_write(file->f_mapping); + } + return ret; +} + static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - int error; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); + struct vm_area_desc desc = { + .mm = mm, + .file = file, + .action = { + .type = MMAP_NOTHING, /* Default to no further action. */ + }, + }; + bool allocated_new = false; + int error; map.check_ksm_early = can_set_ksm_flags_early(&map); - error = __mmap_setup(&map, uf); + error = __mmap_setup(&map, &desc, uf); if (!error && have_mmap_prepare) - error = call_mmap_prepare(&map); + error = call_mmap_prepare(&map, &desc); if (error) goto abort_munmap; @@ -2653,6 +2702,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, error = __mmap_new_vma(&map, &vma); if (error) goto unacct_error; + allocated_new = true; } if (have_mmap_prepare) @@ -2660,6 +2710,13 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); + if (have_mmap_prepare && allocated_new) { + error = call_action_complete(&map, &desc, vma); + + if (error) + return error; + } + return addr; /* Accounting was done by __mmap_setup(). */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index dc976a285ad2..d873667704e8 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -275,6 +275,57 @@ struct mm_struct { struct vm_area_struct; + +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. 
*/ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -298,6 +349,9 @@ struct vm_area_desc { /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? */ + struct mmap_action action; }; struct file_operations { @@ -1326,12 +1380,23 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, +static inline void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ +} + +static inline int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + return 0; +} + +static inline int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, - .file = vma->vm_file, + .file = file, .start = vma->vm_start, .end = vma->vm_end, @@ -1339,21 +1404,24 @@ static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -static inline int compat_vma_mmap_prepare(struct file *file, +static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } /* Did the driver provide valid mmap hook configuration? 
*/ @@ -1374,7 +1442,7 @@ static inline bool can_mmap_file(struct file *file) static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } @@ -1407,4 +1475,20 @@ static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, return vm_flags; } +static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ +} + +static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot) +{ + return 0; +} + +static inline int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf) +{ + return 0; +} + #endif /* __MM_VMA_INTERNAL_H */ From da003453dce728857bea2e3de74132a90c9c78e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:28 +0100 Subject: [PATCH 082/321] doc: update porting, vfs documentation for mmap_prepare actions Now we have introduced the ability to specify that actions should be taken after a VMA is established via the vm_area_desc->action field as specified in mmap_prepare, update both the VFS documentation and the porting guide to describe this. Link: https://lkml.kernel.org/r/472ce3da7662ed1065cc299d14bffb70b1a845e7.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- Documentation/filesystems/porting.rst | 5 +++++ Documentation/filesystems/vfs.rst | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 7233b04668fc..b7ddf89103c7 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1286,6 +1286,11 @@ The vm_area_desc provides the minimum required information for a filesystem to initialise state upon memory mapping of a file-backed region, and output parameters for the file system to set this state. +In nearly all cases, this is all that is required for a filesystem. However, if +a filesystem needs to perform an operation such a pre-population of page tables, +then that action can be specified in the vm_area_desc->action field, which can +be configured using the mmap_action_*() helpers. + --- **mandatory** diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 4f13b01e42eb..670ba66b60e4 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -1213,6 +1213,10 @@ otherwise noted. 
file-backed memory mapping, most notably establishing relevant private state and VMA callbacks. + If further action such as pre-population of page tables is required, + this can be specified by the vm_area_desc->action field and related + parameters. + Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special From ea52cb24cd3fb121283754ab82b2cb3044609359 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:29 +0100 Subject: [PATCH 083/321] mm/hugetlbfs: update hugetlbfs to use mmap_prepare Since we can now perform actions after the VMA is established via mmap_prepare, use desc->action_success_hook to set up the hugetlb lock once the VMA is setup. We also make changes throughout hugetlbfs to make this possible. Note that we must hide newly established hugetlb VMAs from the rmap until the operation is entirely complete as we establish a hugetlb lock during VMA setup that can be raced by rmap users. Link: https://lkml.kernel.org/r/b1afa16d3cfa585a03df9ae215ae9f905b3f0ed7.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Tested-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 46 +++++++++++++------ include/linux/hugetlb.h | 9 +++- include/linux/hugetlb_inline.h | 15 ++++--- mm/hugetlb.c | 81 ++++++++++++++++++++-------------- 4 files changed, 97 insertions(+), 54 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ce8e40d35032..3919fca56553 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) { + /* Unfortunate we have to reassign vma->vm_private_data. */ + return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); +} + +static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) +{ + struct file *file = desc->file; struct inode *inode = file_inode(file); loff_t len, vma_len; int ret; @@ -112,8 +119,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). 
*/ - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); - vma->vm_ops = &hugetlb_vm_ops; + desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + desc->vm_ops = &hugetlb_vm_ops; /* * page based offset in vm_pgoff could be sufficiently large to @@ -122,16 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { - if (vma->vm_pgoff & PGOFF_LOFFT_MAX) + if (desc->pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ - if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); - len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + vma_len = (loff_t)vma_desc_size(desc); + len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; @@ -141,7 +148,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; - vm_flags = vma->vm_flags; + vm_flags = desc->vm_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode @@ -151,17 +158,30 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags |= VM_NORESERVE; if (hugetlb_reserve_pages(inode, - vma->vm_pgoff >> huge_page_order(h), - len >> huge_page_shift(h), vma, - vm_flags) < 0) + desc->pgoff >> huge_page_order(h), + len >> huge_page_shift(h), desc, + vm_flags) < 0) goto out; ret = 0; - if (vma->vm_flags & VM_WRITE && inode->i_size < len) + if ((desc->vm_flags & VM_WRITE) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); + if (!ret) { + /* Allocate the VMA lock after we set it up. */ + desc->action.success_hook = hugetlb_file_mmap_prepare_success; + /* + * We cannot permit the rmap finding this VMA in the time + * between the VMA being inserted into the VMA tree and the + * completion/success hook being invoked. + * + * This is because we establish a per-VMA hugetlb lock which can + * be raced by rmap. 
+ */ + desc->action.hide_from_rmap_until_complete = true; + } return ret; } @@ -1220,7 +1240,7 @@ static void init_once(void *foo) static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, - .mmap = hugetlbfs_file_mmap, + .mmap_prepare = hugetlbfs_file_mmap_prepare, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8e63e46b8e1f..2387513d6ae5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags); + struct vm_area_desc *desc, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); @@ -280,6 +279,7 @@ bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -466,6 +466,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} +static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + return 0; +} + #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 0660a03d37d9..a27aa0162918 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -2,22 +2,27 @@ #ifndef _LINUX_HUGETLB_INLINE_H #define _LINUX_HUGETLB_INLINE_H -#ifdef CONFIG_HUGETLB_PAGE - #include -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +#ifdef CONFIG_HUGETLB_PAGE + +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { - return !!(vma->vm_flags & VM_HUGETLB); + return !!(vm_flags & VM_HUGETLB); } #else -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { return false; } #endif +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +{ + return is_vm_hugetlb_flags(vma->vm_flags); +} + #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7774c286b3b7..86e672fcb305 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -119,7 +119,6 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); @@ -438,17 +437,21 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma) } } -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +/* + * vma specific semaphore used for pmd sharing and fault/truncation + * synchronization + */ +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & 
VM_MAYSHARE)) - return; + return 0; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) - return; + return -EINVAL; vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); if (!vma_lock) { @@ -463,13 +466,15 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) * allocation failure. */ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); - return; + return -EINVAL; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; + + return 0; } /* Helper that removes a struct file_region from the resv_map cache and returns @@ -1201,22 +1206,30 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) } } -static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) -{ - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); - - set_vma_private_data(vma, (unsigned long)map); -} - static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); + VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } +static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) +{ + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); + + desc->private_data = map; +} + +static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) +{ + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); + + desc->private_data = (void *)((unsigned long)desc->private_data | flags); +} + static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); @@ -1224,6 +1237,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } +static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) +{ + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + + return ((unsigned long)desc->private_data) & flag; +} + bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && @@ -7270,9 +7290,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, */ long hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) + long from, long to, + struct vm_area_desc *desc, + vm_flags_t vm_flags) { long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); @@ -7287,12 +7307,6 @@ long hugetlb_reserve_pages(struct inode *inode, return -EINVAL; } - /* - * vma specific semaphore used for pmd sharing and fault/truncation - * synchronization - */ - hugetlb_vma_lock_alloc(vma); - /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page @@ -7305,9 +7319,9 @@ long hugetlb_reserve_pages(struct inode *inode, * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be - * called to make the mapping read-write. Assume !vma is a shm mapping + * called to make the mapping read-write. 
Assume !desc is a shm mapping */ - if (!vma || vma->vm_flags & VM_MAYSHARE) { + if (!desc || desc->vm_flags & VM_MAYSHARE) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -7324,8 +7338,8 @@ long hugetlb_reserve_pages(struct inode *inode, chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + set_vma_desc_resv_map(desc, resv_map); + set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); } if (chg < 0) @@ -7335,7 +7349,7 @@ long hugetlb_reserve_pages(struct inode *inode, chg * pages_per_huge_page(h), &h_cg) < 0) goto out_err; - if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { + if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -7369,7 +7383,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!vma || vma->vm_flags & VM_MAYSHARE) { + if (!desc || desc->vm_flags & VM_MAYSHARE) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -7423,16 +7437,15 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - hugetlb_vma_lock_free(vma); - if (!vma || vma->vm_flags & VM_MAYSHARE) + if (!desc || desc->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); - if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); - set_vma_resv_map(vma, NULL); + set_vma_desc_resv_map(desc, NULL); } return chg < 0 ? chg : add < 0 ? add : -EINVAL; } From 89646d9c748c0902600090f37ae585f3b99deb4d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:30 +0100 Subject: [PATCH 084/321] mm: add shmem_zero_setup_desc() Add the ability to set up a shared anonymous mapping based on a VMA descriptor rather than a VMA. This is a prerequisite for converting to the char mm driver to use the mmap_prepare hook. Link: https://lkml.kernel.org/r/d9181517a7e3d6b014a5697c6990d3722c2c9fcd.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 3 ++- mm/shmem.c | 41 ++++++++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0e47465ef0fd..5b368f9549d6 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,7 +94,8 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); -extern int shmem_zero_setup(struct vm_area_struct *); +int shmem_zero_setup(struct vm_area_struct *vma); +int shmem_zero_setup_desc(struct vm_area_desc *desc); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); diff --git a/mm/shmem.c b/mm/shmem.c index 8b9fcdd144c8..da1df4270309 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5880,14 +5880,9 @@ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); -/** - * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap - */ -int shmem_zero_setup(struct vm_area_struct *vma) +static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags) { - struct file *file; - loff_t size = vma->vm_end - vma->vm_start; + loff_t size = end - start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict @@ -5895,7 +5890,18 @@ int shmem_zero_setup(struct vm_area_struct *vma) * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ - file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); + return shmem_kernel_file_setup("dev/zero", size, vm_flags); +} + +/** + * shmem_zero_setup - setup a shared anonymous mapping + * @vma: the vma to be mmapped is prepared by do_mmap + * Returns: 0 on success, or error + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags); + if (IS_ERR(file)) return PTR_ERR(file); @@ -5907,6 +5913,25 @@ int shmem_zero_setup(struct vm_area_struct *vma) return 0; } +/** + * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA + * descriptor for convenience. + * @desc: Describes VMA + * Returns: 0 on success, or error + */ +int shmem_zero_setup_desc(struct vm_area_desc *desc) +{ + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags); + + if (IS_ERR(file)) + return PTR_ERR(file); + + desc->vm_file = file; + desc->vm_ops = &shmem_anon_vm_ops; + + return 0; +} + /** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. 
* @mapping: the folio's address_space From ab04945f91bcad1668af57bbb575771e794aea8d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:31 +0100 Subject: [PATCH 085/321] mm: update mem char driver to use mmap_prepare Update the mem char driver (backing /dev/mem and /dev/zero) to use f_op->mmap_prepare hook rather than the deprecated f_op->mmap. The /dev/zero implementation has a very unique and rather concerning characteristic in that it converts MAP_PRIVATE mmap() mappings anonymous when they are, in fact, not. The new f_op->mmap_prepare() can support this, but rather than introducing a helper function to perform this hack (and risk introducing other users), utilise the success hook to do so. We utilise the newly introduced shmem_zero_setup_desc() to allow for the shared mapping case via an f_op->mmap_prepare() hook. We also use the desc->action_error_hook to filter the remap error to -EAGAIN to keep behaviour consistent. Link: https://lkml.kernel.org/r/48f60764d7a6901819d1af778fa33b775d2e8c77.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/char/mem.c | 84 +++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index db1ca53a6d01..52039fae1594 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -304,13 +304,13 @@ static unsigned zero_mmap_capabilities(struct file *file) } /* can't do an in-place private mapping if there's no MMU */ -static inline int private_mapping_ok(struct vm_area_struct *vma) +static inline int private_mapping_ok(struct vm_area_desc *desc) { - return is_nommu_shared_mapping(vma->vm_flags); + return is_nommu_shared_mapping(desc->vm_flags); } #else -static inline int private_mapping_ok(struct vm_area_struct *vma) +static inline int private_mapping_ok(struct vm_area_desc *desc) { return 1; } @@ -322,46 +322,49 @@ static const struct vm_operations_struct mmap_mem_ops = { #endif }; -static int mmap_mem(struct file *file, struct vm_area_struct *vma) +static int mmap_filter_error(int err) { - size_t size = vma->vm_end - vma->vm_start; - phys_addr_t offset = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; + return -EAGAIN; +} + +static int mmap_mem_prepare(struct vm_area_desc *desc) +{ + struct file *file = desc->file; + const size_t size = vma_desc_size(desc); + const phys_addr_t offset = (phys_addr_t)desc->pgoff << PAGE_SHIFT; /* Does it even fit in phys_addr_t? 
*/ - if (offset >> PAGE_SHIFT != vma->vm_pgoff) + if (offset >> PAGE_SHIFT != desc->pgoff) return -EINVAL; /* It's illegal to wrap around the end of the physical address space. */ if (offset + (phys_addr_t)size - 1 < offset) return -EINVAL; - if (!valid_mmap_phys_addr_range(vma->vm_pgoff, size)) + if (!valid_mmap_phys_addr_range(desc->pgoff, size)) return -EINVAL; - if (!private_mapping_ok(vma)) + if (!private_mapping_ok(desc)) return -ENOSYS; - if (!range_is_allowed(vma->vm_pgoff, size)) + if (!range_is_allowed(desc->pgoff, size)) return -EPERM; - if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size, - &vma->vm_page_prot)) + if (!phys_mem_access_prot_allowed(file, desc->pgoff, size, + &desc->page_prot)) return -EINVAL; - vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff, - size, - vma->vm_page_prot); + desc->page_prot = phys_mem_access_prot(file, desc->pgoff, + size, + desc->page_prot); - vma->vm_ops = &mmap_mem_ops; + desc->vm_ops = &mmap_mem_ops; + + /* Remap-pfn-range will mark the range VM_IO. */ + mmap_action_remap_full(desc, desc->pgoff); + /* We filter remap errors to -EAGAIN. */ + desc->action.error_hook = mmap_filter_error; - /* Remap-pfn-range will mark the range VM_IO */ - if (remap_pfn_range(vma, - vma->vm_start, - vma->vm_pgoff, - size, - vma->vm_page_prot)) { - return -EAGAIN; - } return 0; } @@ -501,14 +504,26 @@ static ssize_t read_zero(struct file *file, char __user *buf, return cleared; } -static int mmap_zero(struct file *file, struct vm_area_struct *vma) +static int mmap_zero_private_success(const struct vm_area_struct *vma) +{ + /* + * This is a highly unique situation where we mark a MAP_PRIVATE mapping + * of /dev/zero anonymous, despite it not being. + */ + vma_set_anonymous((struct vm_area_struct *)vma); + + return 0; +} + +static int mmap_zero_prepare(struct vm_area_desc *desc) { #ifndef CONFIG_MMU return -ENOSYS; #endif - if (vma->vm_flags & VM_SHARED) - return shmem_zero_setup(vma); - vma_set_anonymous(vma); + if (desc->vm_flags & VM_SHARED) + return shmem_zero_setup_desc(desc); + + desc->action.success_hook = mmap_zero_private_success; return 0; } @@ -526,10 +541,11 @@ static unsigned long get_unmapped_area_zero(struct file *file, { if (flags & MAP_SHARED) { /* - * mmap_zero() will call shmem_zero_setup() to create a file, - * so use shmem's get_unmapped_area in case it can be huge; - * and pass NULL for file as in mmap.c's get_unmapped_area(), - * so as not to confuse shmem with our handle on "/dev/zero". + * mmap_zero_prepare() will call shmem_zero_setup() to create a + * file, so use shmem's get_unmapped_area in case it can be + * huge; and pass NULL for file as in mmap.c's + * get_unmapped_area(), so as not to confuse shmem with our + * handle on "/dev/zero". 
*/ return shmem_get_unmapped_area(NULL, addr, len, pgoff, flags); } @@ -632,7 +648,7 @@ static const struct file_operations __maybe_unused mem_fops = { .llseek = memory_lseek, .read = read_mem, .write = write_mem, - .mmap = mmap_mem, + .mmap_prepare = mmap_mem_prepare, .open = open_mem, #ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, @@ -668,7 +684,7 @@ static const struct file_operations zero_fops = { .write_iter = write_iter_zero, .splice_read = copy_splice_read, .splice_write = splice_write_zero, - .mmap = mmap_zero, + .mmap_prepare = mmap_zero_prepare, .get_unmapped_area = get_unmapped_area_zero, #ifndef CONFIG_MMU .mmap_capabilities = zero_mmap_capabilities, From 8247e2600e5348a364c393a75fd204846d4092da Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:32 +0100 Subject: [PATCH 086/321] mm: update resctl to use mmap_prepare Make use of the ability to specify a remap action within mmap_prepare to update the resctl pseudo-lock to use mmap_prepare in favour of the deprecated mmap hook. Link: https://lkml.kernel.org/r/95b28b066f37ca25f56fa9460a9367f1a866f88b.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Reinette Chatre Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/resctrl/pseudo_lock.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index 87bbc2605de1..0bfc13c5b96d 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -995,10 +995,11 @@ static const struct vm_operations_struct pseudo_mmap_ops = { .mremap = pseudo_lock_dev_mremap, }; -static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) +static int pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc) { - unsigned long vsize = vma->vm_end - vma->vm_start; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + unsigned long off = desc->pgoff << PAGE_SHIFT; + unsigned long vsize = vma_desc_size(desc); + struct file *filp = desc->file; struct pseudo_lock_region *plr; struct rdtgroup *rdtgrp; unsigned long physical; @@ -1043,7 +1044,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) * Ensure changes are carried directly to the memory being mapped, * do not allow copy-on-write mapping. 
*/ - if (!(vma->vm_flags & VM_SHARED)) { + if (!(desc->vm_flags & VM_SHARED)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } @@ -1055,12 +1056,9 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) memset(plr->kmem + off, 0, vsize); - if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, - vsize, vma->vm_page_prot)) { - mutex_unlock(&rdtgroup_mutex); - return -EAGAIN; - } - vma->vm_ops = &pseudo_mmap_ops; + desc->vm_ops = &pseudo_mmap_ops; + mmap_action_remap_full(desc, physical + desc->pgoff); + mutex_unlock(&rdtgroup_mutex); return 0; } @@ -1071,7 +1069,7 @@ static const struct file_operations pseudo_lock_dev_fops = { .write = NULL, .open = pseudo_lock_dev_open, .release = pseudo_lock_dev_release, - .mmap = pseudo_lock_dev_mmap, + .mmap_prepare = pseudo_lock_dev_mmap_prepare, }; int rdt_pseudo_lock_init(void) From 184c7533426ada7625c029b54143fec4ef4daa28 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Mon, 20 Oct 2025 11:49:33 +0700 Subject: [PATCH 087/321] vmalloc: separate gfp_mask adjunctive parentheses in __vmalloc_node_noprof() kernel-doc comment Sphinx reports htmldocs warning on __vmalloc_node() comment: Documentation/core-api/mm-api:52: ./mm/vmalloc.c:4036: WARNING: Inline strong start-string without end-string. [docutils] Fix it by separating adjunctive parentheses from preceding gfp_mask formatting markup. Link: https://lkml.kernel.org/r/20251020044933.15222-1-bagasdotme@gmail.com Fixes: 32904ba6f5ef ("vmalloc: update __vmalloc_node_noprof() documentation") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20251020134902.3a11107e@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Reviewed-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Stehen Rothwell Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e207ca64a688..091a07f6d925 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4034,7 +4034,7 @@ fail: * Allocate enough pages to cover @size from the page level allocator with * @gfp_mask flags. Map them into contiguous kernel virtual space. * - * Semantics of @gfp_mask(including reclaim/retry modifiers such as + * Semantics of @gfp_mask (including reclaim/retry modifiers such as * __GFP_NOFAIL) are the same as in __vmalloc_node_range_noprof(). * * Return: pointer to the allocated memory or %NULL on error From f0c74b6cb91d97cea74176a5400e89c183732cb2 Mon Sep 17 00:00:00 2001 From: Mehdi Ben Hadj Khelifa Date: Sat, 18 Oct 2025 21:11:48 +0100 Subject: [PATCH 088/321] mm/vmalloc: use kmalloc_array() instead of kmalloc() The number of NUMA nodes (nr_node_ids) is bounded, so overflow is not a practical concern here. However, using kmalloc_array() better reflects the intent to allocate an array of unsigned ints, and improves consistency with other NUMA-related allocations. No functional change intended. 
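To make the difference concrete, here is a minimal sketch (the element type and count below are placeholders, not names from this code): kmalloc_array() performs an overflow-checked multiplication and returns NULL if n * size would wrap, whereas the open-coded form multiplies silently.

	struct foo *arr;	/* 'struct foo' and 'nr' are illustrative only */

	arr = kmalloc(nr * sizeof(*arr), GFP_KERNEL);		/* product can wrap silently */
	arr = kmalloc_array(nr, sizeof(*arr), GFP_KERNEL);	/* returns NULL on overflow */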
Link: https://lkml.kernel.org/r/20251018201207.27441-1-mehdi.benhadjkhelifa@gmail.com Signed-off-by: Mehdi Ben Hadj Khelifa Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Khalid Aziz Cc: David Hunter Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 091a07f6d925..adde450ddf5e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -5140,7 +5140,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p) unsigned int *counters; if (IS_ENABLED(CONFIG_NUMA)) - counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + counters = kmalloc_array(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); From fae4e86024bdcf23fc87b47122a11b93c009d570 Mon Sep 17 00:00:00 2001 From: Swaraj Gaikwad Date: Tue, 21 Oct 2025 21:53:24 +0000 Subject: [PATCH 089/321] mm/damon/sysfs: remove misleading todo comment in nid_show() The TODO comment in nid_show() suggested returning an error if the goal was not using nid. However, this comment was found to be inaccurate and misleading.This patch removes the TODO comment without changing any existing behavior. This change follows feedback from SJ who pointed out [1] that wiring-order independence is expected and the function should simply show the last set value. and [2] checkpatch.pl complain about number of chars per line No functional code changes were made. Tested with KUnit: - Built kernel with KUnit and DAMON sysfs tests enabled. - Executed KUnit tests: ./tools/testing/kunit/kunit.py run --kunitconfig ./mm/damon/tests/ - All 25 tests passed, including damon_sysfs_test_add_targets. Link: https://lkml.kernel.org/r/20251021215323.29734-2-swarajgaikwad1925@gmail.com Link: https://lore.kernel.org/lkml/20251020151315.66260-1-sj@kernel.org/ [1] Link: https://lore.kernel.org/lkml/20251021010847.68473-1-sj@kernel.org/ [2] Signed-off-by: Swaraj Gaikwad Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Cc: David Hunter Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index c98cf4bd2fbb..30d20f5b3192 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1121,7 +1121,6 @@ static ssize_t nid_show(struct kobject *kobj, struct damos_sysfs_quota_goal *goal = container_of(kobj, struct damos_sysfs_quota_goal, kobj); - /* todo: return error if the goal is not using nid */ return sysfs_emit(buf, "%d\n", goal->nid); } From 2da6fe91c29c5402ede36bcd286696c227b99020 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 21 Oct 2025 21:44:31 +0800 Subject: [PATCH 090/321] mm/khugepaged: factor out common logic in [scan,alloc]_sleep_millisecs_store() Both scan_sleep_millisecs_store() and alloc_sleep_millisecs_store() perform the same operations: parse the input value, update their respective sleep interval, reset khugepaged_sleep_expire, and wake up the khugepaged thread. Factor out this duplicated logic into a helper function __sleep_millisecs_store(), and simplify both store functions. No functional change intended. 
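As a purely hypothetical illustration of the payoff (the knob and variable below do not exist; they only show how a future sleep-interval attribute could reuse the helper):

	static ssize_t example_sleep_millisecs_store(struct kobject *kobj,
						     struct kobj_attribute *attr,
						     const char *buf, size_t count)
	{
		return __sleep_millisecs_store(buf, count,
					       &khugepaged_example_sleep_millisecs);
	}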
Link: https://lkml.kernel.org/r/20251021134431.26488-1-leon.hwang@linux.dev Signed-off-by: Leon Hwang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Reviewed-by: Lance Yang Acked-by: David Hildenbrand Acked-by: Nico Pache Reviewed-by: Dev Jain Reviewed-by: SeongJae Park Cc: Baolin Wang Cc: Barry Song Cc: Liam Howlett Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/khugepaged.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 68e487d53772..643abf4be236 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -129,9 +129,8 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } -static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t __sleep_millisecs_store(const char *buf, size_t count, + unsigned int *millisecs) { unsigned int msecs; int err; @@ -140,12 +139,19 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, if (err) return -EINVAL; - khugepaged_scan_sleep_millisecs = msecs; + *millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs); +} static struct kobj_attribute scan_sleep_millisecs_attr = __ATTR_RW(scan_sleep_millisecs); @@ -160,18 +166,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned int msecs; - int err; - - err = kstrtouint(buf, 10, &msecs); - if (err) - return -EINVAL; - - khugepaged_alloc_sleep_millisecs = msecs; - khugepaged_sleep_expire = 0; - wake_up_interruptible(&khugepaged_wait); - - return count; + return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs); } static struct kobj_attribute alloc_sleep_millisecs_attr = __ATTR_RW(alloc_sleep_millisecs); From fe62415c9b0da808dab7507dacff588e329f8ff3 Mon Sep 17 00:00:00 2001 From: William Kucharski Date: Tue, 21 Oct 2025 05:00:04 -0600 Subject: [PATCH 091/321] mm: remove reference to destructor in comment in calculate_sizes() The commit that removed support for destructors from kmem_cache_alloc() never removed the comment regarding destructors in the explanation of the possible relocation of the free pointer in calculate_sizes(). Link: https://lkml.kernel.org/r/20251021110004.2209008-1-william.kucharski@oracle.com Fixes: 20c2df83d25c ("mm: Remove slab destructors from kmem_cache_create().") Signed-off-by: William Kucharski Acked-by: Roman Gushchin Reviewed-by: Christoph Lameter (Ampere) Acked-by: SeongJae Park Cc: David Rientjes Cc: Paul Mundt Cc: Vlastimil Babka Cc: Harry Yoo Signed-off-by: Andrew Morton --- mm/slub.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3ea9b7af660d..477c4f741b2a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -7901,11 +7901,11 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) * permitted to overwrite the first word of the object on * kmem_cache_free. 
* - * This is the case if we do RCU, have a constructor or - * destructor, are poisoning the objects, or are - * redzoning an object smaller than sizeof(void *) or are - * redzoning an object with slub_debug_orig_size() enabled, - * in which case the right redzone may be extended. + * This is the case if we do RCU, have a constructor, are + * poisoning the objects, or are redzoning an object smaller + * than sizeof(void *) or are redzoning an object with + * slub_debug_orig_size() enabled, in which case the right + * redzone may be extended. * * The assumption that s->offset >= s->inuse means free * pointer is outside of the object is used in the From 645a3c4243473d5c8b780927d2cc573e3da5a20c Mon Sep 17 00:00:00 2001 From: Jing Su Date: Tue, 21 Oct 2025 15:37:07 +0800 Subject: [PATCH 092/321] mm/vmstat: fix indentation in fold_diff function Adjust misaligned braces in fold_diff() to improve code readability and maintain consistent coding style. [akpm@linux-foundation.org: add braces, per Vlastimil & Liam] Link: https://lkml.kernel.org/r/aPc4I/8zXCGyiapN@pilot-ThinkCentre-M930t-N000 Signed-off-by: Jing Su Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Donet Tom Acked-by: Vlastimil Babka Reviewed-by: Dev Jain Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmstat.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 98855f31294d..f226942db746 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -778,17 +778,20 @@ static int fold_diff(int *zone_diff, int *node_diff) int i; bool changed = false; - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { if (zone_diff[i]) { atomic_long_add(zone_diff[i], &vm_zone_stat[i]); changed = true; + } } - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { if (node_diff[i]) { atomic_long_add(node_diff[i], &vm_node_stat[i]); changed = true; + } } + return changed; } From a0615780439938e8e61343f1f92a4c54a71dc6a5 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 21 Oct 2025 12:44:56 -0700 Subject: [PATCH 093/321] mm/vmalloc: request large order pages from buddy allocator Sometimes, vm_area_alloc_pages() will want many pages from the buddy allocator. Rather than making requests to the buddy allocator for at most 100 pages at a time, we can eagerly request large order pages a smaller number of times. We still split the large order pages down to order-0 as the rest of the vmalloc code (and some callers) depend on it. We still defer to the bulk allocator and fallback path in case of order-0 pages or failure. 
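The core of the approach, reduced to a sketch (the order capping on failure and the existing bulk/fallback paths are omitted here; the diff below has the real logic):

	unsigned int large_order = min(MAX_PAGE_ORDER, ilog2(nr_remaining));
	struct page *page = alloc_pages(large_gfp, large_order);

	if (page) {
		split_page(page, large_order);		/* hand back order-0 pages */
		for (i = 0; i < (1U << large_order); i++)
			pages[nr_allocated + i] = page + i;
		nr_allocated += 1U << large_order;
	}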
Running 1000 iterations of allocations on a small 4GB system finds: 1000 2mb allocations: [Baseline] [This patch] real 46.310s real 0m34.582 user 0.001s user 0.006s sys 46.058s sys 0m34.365s 10000 200kb allocations: [Baseline] [This patch] real 56.104s real 0m43.696 user 0.001s user 0.003s sys 55.375s sys 0m42.995s Link: https://lkml.kernel.org/r/20251021194455.33351-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index adde450ddf5e..0832f944544c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3619,8 +3619,44 @@ vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; + unsigned int nr_remaining = nr_pages; + unsigned int max_attempt_order = MAX_PAGE_ORDER; struct page *page; int i; + gfp_t large_gfp = (gfp & + ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL | __GFP_COMP)) + | __GFP_NOWARN; + unsigned int large_order = ilog2(nr_remaining); + + large_order = min(max_attempt_order, large_order); + + /* + * Initially, attempt to have the page allocator give us large order + * pages. Do not attempt allocating smaller than order chunks since + * __vmap_pages_range() expects physically contigous pages of exactly + * order long chunks. + */ + while (large_order > order && nr_remaining) { + if (nid == NUMA_NO_NODE) + page = alloc_pages_noprof(large_gfp, large_order); + else + page = alloc_pages_node_noprof(nid, large_gfp, large_order); + + if (unlikely(!page)) { + max_attempt_order = --large_order; + continue; + } + + split_page(page, large_order); + for (i = 0; i < (1U << large_order); i++) + pages[nr_allocated + i] = page + i; + + nr_allocated += 1U << large_order; + nr_remaining = nr_pages - nr_allocated; + + large_order = ilog2(nr_remaining); + large_order = min(max_attempt_order, large_order); + } /* * For order-0 pages we make use of bulk allocator, if From 5ff592bec75ad79ed7f1a817477ab6eef8dc5efc Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 21 Oct 2025 16:44:25 -0700 Subject: [PATCH 094/321] memcg: manually uninline __memcg_memory_event __memcg_memory_event() has been unnecessarily marked inline even when it is not really performance critical. It is usually called to track extreme conditions. Over the time, it has evolved to include more functionality and inlining it is causing more harm. 
Before the patch: $ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o text data bss dec hex filename 35645 10574 4192 50411 c4eb mm/memcontrol.o 54738 1658 0 56396 dc4c net/ipv4/tcp_input.o 34644 1065 0 35709 8b7d net/ipv4/tcp_output.o After the patch: $ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o text data bss dec hex filename 35137 10446 4192 49775 c26f mm/memcontrol.o 54322 1562 0 55884 da4c net/ipv4/tcp_input.o 34492 1017 0 35509 8ab5 net/ipv4/tcp_output.o [akpm@linux-foundation.org: use EXPORT_SYMBOL_GPL for __memcg_memory_event, per Michal and Christoph] Link: https://lkml.kernel.org/r/20251021234425.1885471-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: SeongJae Park Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Muchun Song Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 32 ++------------------------------ mm/memcontrol.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5fe254813123..8c0f15e5978f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1002,36 +1002,8 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, count_memcg_events_mm(mm, idx, 1); } -static inline void __memcg_memory_event(struct mem_cgroup *memcg, - enum memcg_memory_event event, - bool allow_spinning) -{ - bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || - event == MEMCG_SWAP_FAIL; - - /* For now only MEMCG_MAX can happen with !allow_spinning context. */ - VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); - - atomic_long_inc(&memcg->memory_events_local[event]); - if (!swap_event && allow_spinning) - cgroup_file_notify(&memcg->events_local_file); - - do { - atomic_long_inc(&memcg->memory_events[event]); - if (allow_spinning) { - if (swap_event) - cgroup_file_notify(&memcg->swap_events_file); - else - cgroup_file_notify(&memcg->events_file); - } - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - break; - if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) - break; - } while ((memcg = parent_mem_cgroup(memcg)) && - !mem_cgroup_is_root(memcg)); -} +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning); static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 976412c8196e..025da46d9959 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1626,6 +1626,37 @@ unsigned long mem_cgroup_size(struct mem_cgroup *memcg) return page_counter_read(&memcg->memory); } +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning) +{ + bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || + event == MEMCG_SWAP_FAIL; + + /* For now only MEMCG_MAX can happen with !allow_spinning context. 
*/ + VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); + + atomic_long_inc(&memcg->memory_events_local[event]); + if (!swap_event && allow_spinning) + cgroup_file_notify(&memcg->events_local_file); + + do { + atomic_long_inc(&memcg->memory_events[event]); + if (allow_spinning) { + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); + } + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + break; + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + break; + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); +} +EXPORT_SYMBOL_GPL(__memcg_memory_event); + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { From 72f98ef9a4be30d2a60136dd6faee376f780d06c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 22 Oct 2025 16:26:27 +0800 Subject: [PATCH 095/321] iommu: disable SVA when CONFIG_X86 is set Patch series "Fix stale IOTLB entries for kernel address space", v7. This proposes a fix for a security vulnerability related to IOMMU Shared Virtual Addressing (SVA). In an SVA context, an IOMMU can cache kernel page table entries. When a kernel page table page is freed and reallocated for another purpose, the IOMMU might still hold stale, incorrect entries. This can be exploited to cause a use-after-free or write-after-free condition, potentially leading to privilege escalation or data corruption. This solution introduces a deferred freeing mechanism for kernel page table pages, which provides a safe window to notify the IOMMU to invalidate its caches before the page is reused. This patch (of 8): In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. The x86 architecture maps the kernel's virtual address space into the upper portion of every process's page table. Consequently, in an SVA context, the IOMMU hardware can walk and cache kernel page table entries. The Linux kernel currently lacks a notification mechanism for kernel page table changes, specifically when page table pages are freed and reused. The IOMMU driver is only notified of changes to user virtual address mappings. This can cause the IOMMU's internal caches to retain stale entries for kernel VA. Use-After-Free (UAF) and Write-After-Free (WAF) conditions arise when kernel page table pages are freed and later reallocated. The IOMMU could misinterpret the new data as valid page table entries. The IOMMU might then walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation. This is also a Write-After-Free issue, as the IOMMU will potentially continue to write Accessed and Dirty bits to the freed memory while attempting to walk the stale page tables. Currently, SVA contexts are unprivileged and cannot access kernel mappings. However, the IOMMU will still walk kernel-only page tables all the way down to the leaf entries, where it realizes the mapping is for the kernel and errors out. This means the IOMMU still caches these intermediate page table entries, making the described vulnerability a real concern. Disable SVA on x86 architecture until the IOMMU can receive notification to flush the paging cache before freeing the CPU kernel page table pages. 
Link: https://lkml.kernel.org/r/20251022082635.2462433-1-baolu.lu@linux.intel.com Link: https://lkml.kernel.org/r/20251022082635.2462433-2-baolu.lu@linux.intel.com Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices") Signed-off-by: Lu Baolu Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Hansen Cc: David Hildenbrand Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Kevin Tian Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Cc: Signed-off-by: Andrew Morton --- drivers/iommu/iommu-sva.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 1a51cfd82808..a0442faad952 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -77,6 +77,9 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (!group) return ERR_PTR(-ENODEV); + if (IS_ENABLED(CONFIG_X86)) + return ERR_PTR(-EOPNOTSUPP); + mutex_lock(&iommu_sva_lock); /* Allocate mm->pasid if necessary. */ From 27bfafac65d87c58639f5d7af1353ec1e7886963 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:28 +0800 Subject: [PATCH 096/321] mm: add a ptdesc flag to mark kernel page tables The page tables used to map the kernel and userspace often have very different handling rules. There are frequently *_kernel() variants of functions just for kernel page tables. That's not great and has lead to code duplication. Instead of having completely separate call paths, allow a 'ptdesc' to be marked as being for kernel mappings. Introduce helpers to set and clear this status. Note: this uses the PG_referenced bit. Page flags are a great fit for this since it is truly a single bit of information. Use PG_referenced itself because it's a fairly benign flag (as opposed to things like PG_lock). It's also (according to Willy) unlikely to go away any time soon. PG_referenced is not in PAGE_FLAGS_CHECK_AT_FREE. It does not need to be cleared before freeing the page, and pages coming out of the allocator should have it cleared. Regardless, introduce an API to clear it anyway. Having symmetry in the API makes it easier to change the underlying implementation later, like if there was a need to move to a PAGE_FLAGS_CHECK_AT_FREE bit. 
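The usage pattern the rest of this series builds on looks roughly like the sketch below (forward-looking illustration only, not code added by this patch):

	/* at allocation time, only for kernel (init_mm) page tables */
	if (mm == &init_mm)
		ptdesc_set_kernel(ptdesc);

	/* at free time, route kernel page tables to special handling */
	if (ptdesc_test_kernel(ptdesc)) {
		ptdesc_clear_kernel(ptdesc);
		/* ... defer/flush before the page is reused ... */
	}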
Link: https://lkml.kernel.org/r/20251022082635.2462433-3-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2d060081caa5..5c887c4ea29e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2962,6 +2962,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #endif /* CONFIG_MMU */ enum pt_flags { + PT_kernel = PG_referenced, PT_reserved = PG_reserved, /* High bits are used for zone/node/section */ }; @@ -2987,6 +2988,46 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt) return test_bit(PT_reserved, &pt->pt_flags.f); } +/** + * ptdesc_set_kernel - Mark a ptdesc used to map the kernel + * @ptdesc: The ptdesc to be marked + * + * Kernel page tables often need special handling. Set a flag so that + * the handling code knows this ptdesc will not be used for userspace. + */ +static inline void ptdesc_set_kernel(struct ptdesc *ptdesc) +{ + set_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel + * @ptdesc: The ptdesc to be unmarked + * + * Use when the ptdesc is no longer used to map the kernel and no longer + * needs special handling. + */ +static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc) +{ + /* + * Note: the 'PG_referenced' bit does not strictly need to be + * cleared before freeing the page. But this is nice for + * symmetry. + */ + clear_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel + * @ptdesc: The ptdesc being tested + * + * Call to tell if the ptdesc used to map the kernel. + */ +static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc) +{ + return test_bit(PT_kernel, &ptdesc->pt_flags.f); +} + /** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags From 977870522af34359b461060597ee3a86f27450d6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:29 +0800 Subject: [PATCH 097/321] mm: actually mark kernel page table pages Now that the API is in place, mark kernel page table pages just after they are allocated. Unmark them just before they are freed. Note: Unconditionally clearing the 'kernel' marking (via ptdesc_clear_kernel()) would be functionally identical to what is here. But having the if() makes it logically clear that this function can be used for kernel and non-kernel page tables. 
Link: https://lkml.kernel.org/r/20251022082635.2462433-4-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 18 ++++++++++++++++++ include/linux/mm.h | 3 +++ 2 files changed, 21 insertions(+) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 3c8ec3bfea44..b9d2a7c79b93 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -28,6 +28,8 @@ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) return NULL; } + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) @@ -146,6 +148,10 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad pagetable_free(ptdesc); return NULL; } + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) @@ -179,6 +185,10 @@ static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long return NULL; pagetable_pud_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__)) @@ -233,6 +243,10 @@ static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long return NULL; pagetable_p4d_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) @@ -277,6 +291,10 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order return NULL; pagetable_pgd_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c887c4ea29e..8f46048875a7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,6 +3057,9 @@ static inline void pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); + if (ptdesc_test_kernel(pt)) + ptdesc_clear_kernel(pt); + __free_pages(page, compound_order(page)); } From 412d000346ea38ac4b9bb715a86c73ef89d90dea Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:30 +0800 Subject: [PATCH 098/321] x86/mm: use 'ptdesc' when freeing PMD pages There are a billion ways to refer to a physical memory address. One of the x86 PMD freeing code location chooses to use a 'pte_t *' to point to a PMD page and then call a PTE-specific freeing function for it. That's a bit wonky. Just use a 'struct ptdesc *' instead. Its entire purpose is to refer to page table pages. It also means being able to remove an explicit cast. Right now, pte_free_kernel() is a one-liner that calls pagetable_dtor_free(). 
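For reference, that generic one-liner looks roughly like this (include/asm-generic/pgalloc.h, quoted from memory rather than from this patch):

	static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
	{
		pagetable_dtor_free(virt_to_ptdesc(pte));
	}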
Effectively, all this patch does is remove one superfluous __pa(__va(paddr)) conversion and then call pagetable_dtor_free() directly instead of through a helper. Link: https://lkml.kernel.org/r/20251022082635.2462433-5-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: David Hildenbrand Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- arch/x86/mm/pgtable.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ddf248c3ee7d..2e5ecfdce73c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -729,7 +729,7 @@ int pmd_clear_huge(pmd_t *pmd) int pud_free_pmd_page(pud_t *pud, unsigned long addr) { pmd_t *pmd, *pmd_sv; - pte_t *pte; + struct ptdesc *pt; int i; pmd = pud_pgtable(*pud); @@ -750,8 +750,8 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { - pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); - pte_free_kernel(&init_mm, pte); + pt = page_ptdesc(pmd_page(pmd_sv[i])); + pagetable_dtor_free(pt); } } @@ -772,15 +772,15 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { - pte_t *pte; + struct ptdesc *pt; - pte = (pte_t *)pmd_page_vaddr(*pmd); + pt = page_ptdesc(pmd_page(*pmd)); pmd_clear(pmd); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); - pte_free_kernel(&init_mm, pte); + pagetable_dtor_free(pt); return 1; } From 01894295672335ff304beed4359f30d14d5765f2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:31 +0800 Subject: [PATCH 099/321] mm: introduce pure page table freeing function The pages used for ptdescs are currently freed back to the allocator in a single location. They will shortly be freed from a second location. Create a simple helper that just frees them back to the allocator. Link: https://lkml.kernel.org/r/20251022082635.2462433-6-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f46048875a7..88c0a0fae43a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3046,6 +3046,13 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde } #define pagetable_alloc(...) 
alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) +static inline void __pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + __free_pages(page, compound_order(page)); +} + /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -3055,12 +3062,10 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde */ static inline void pagetable_free(struct ptdesc *pt) { - struct page *page = ptdesc_page(pt); - if (ptdesc_test_kernel(pt)) ptdesc_clear_kernel(pt); - __free_pages(page, compound_order(page)); + __pagetable_free(pt); } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) From bf9e4e30f3538391745a99bc2268ec4f5e4a401e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 22 Oct 2025 16:26:32 +0800 Subject: [PATCH 100/321] x86/mm: use pagetable_free() The kernel's memory management subsystem provides a dedicated interface, pagetable_free(), for freeing page table pages. Updates two call sites to use pagetable_free() instead of the lower-level __free_page() or free_pages(). This improves code consistency and clarity, and ensures the correct freeing mechanism is used. Link: https://lkml.kernel.org/r/20251022082635.2462433-7-baolu.lu@linux.intel.com Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Kevin Tian Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/pat/set_memory.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0e4270e20fad..3d9a5e4ccaa4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1031,7 +1031,7 @@ static void __meminit free_pagetable(struct page *page, int order) free_reserved_pages(page, nr_pages); #endif } else { - __free_pages(page, order); + pagetable_free(page_ptdesc(page)); } } diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 970981893c9b..fffb6ef1997d 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -429,7 +429,7 @@ static void cpa_collapse_large_pages(struct cpa_data *cpa) list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) { list_del(&ptdesc->pt_list); - __free_page(ptdesc_page(ptdesc)); + pagetable_free(ptdesc); } } From 5ba2f0a1556479638ac11a3c201421f5515e89f5 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:33 +0800 Subject: [PATCH 101/321] mm: introduce deferred freeing for kernel page tables This introduces a conditional asynchronous mechanism, enabled by CONFIG_ASYNC_KERNEL_PGTABLE_FREE. When enabled, this mechanism defers the freeing of pages that are used as page tables for kernel address mappings. These pages are now queued to a work struct instead of being freed immediately. This deferred freeing allows for batch-freeing of page tables, providing a safe context for performing a single expensive operation (TLB flush) for a batch of kernel page tables instead of performing that expensive operation for each page table. 
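The resulting call flow, sketched for orientation (the actual implementation is in the diff below):

	pagetable_free(pt)
	    -> ptdesc_test_kernel(pt) is true:
		   ptdesc_clear_kernel(pt);
		   pagetable_free_kernel(pt);	/* queue on a list, schedule_work() */
		       worker: splice the list, perform one batched invalidation
		       (added later in this series), then __pagetable_free() each entry
	    -> otherwise:
		   __pagetable_free(pt);	/* immediate, unchanged behaviour */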
Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +++++++++++++--- mm/Kconfig | 3 +++ mm/pgtable-generic.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 88c0a0fae43a..a6fd9f5aaf30 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3053,6 +3053,14 @@ static inline void __pagetable_free(struct ptdesc *pt) __free_pages(page, compound_order(page)); } +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +void pagetable_free_kernel(struct ptdesc *pt); +#else +static inline void pagetable_free_kernel(struct ptdesc *pt) +{ + __pagetable_free(pt); +} +#endif /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -3062,10 +3070,12 @@ static inline void __pagetable_free(struct ptdesc *pt) */ static inline void pagetable_free(struct ptdesc *pt) { - if (ptdesc_test_kernel(pt)) + if (ptdesc_test_kernel(pt)) { ptdesc_clear_kernel(pt); - - __pagetable_free(pt); + pagetable_free_kernel(pt); + } else { + __pagetable_free(pt); + } } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) diff --git a/mm/Kconfig b/mm/Kconfig index 4971436c8697..682a5c39a1a6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -906,6 +906,9 @@ config HAVE_GIGANTIC_FOLIOS def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \ (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +config ASYNC_KERNEL_PGTABLE_FREE + def_bool n + # TODO: Allow to be enabled without THP config ARCH_SUPPORTS_HUGE_PFNMAP def_bool n diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 567e2d084071..1c7caa8ef164 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -406,3 +406,40 @@ again: pte_unmap_unlock(pte, ptl); goto again; } + +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +static void kernel_pgtable_work_func(struct work_struct *work); + +static struct { + struct list_head list; + /* protect above ptdesc lists */ + spinlock_t lock; + struct work_struct work; +} kernel_pgtable_work = { + .list = LIST_HEAD_INIT(kernel_pgtable_work.list), + .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock), + .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func), +}; + +static void kernel_pgtable_work_func(struct work_struct *work) +{ + struct ptdesc *pt, *next; + LIST_HEAD(page_list); + + spin_lock(&kernel_pgtable_work.lock); + list_splice_tail_init(&kernel_pgtable_work.list, &page_list); + spin_unlock(&kernel_pgtable_work.lock); + + list_for_each_entry_safe(pt, next, &page_list, pt_list) + __pagetable_free(pt); +} + +void pagetable_free_kernel(struct ptdesc *pt) +{ + spin_lock(&kernel_pgtable_work.lock); + list_add(&pt->pt_list, &kernel_pgtable_work.list); + spin_unlock(&kernel_pgtable_work.lock); + + schedule_work(&kernel_pgtable_work.work); +} +#endif From e37d5a2d60a338c5917c45296bac65da1382eda5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 22 Oct 2025 16:26:34 +0800 Subject: [PATCH 
102/321] iommu/sva: invalidate stale IOTLB entries for kernel address space Introduce a new IOMMU interface to flush IOTLB paging cache entries for the CPU kernel address space. This interface is invoked from the x86 architecture code that manages combined user and kernel page tables, specifically before any kernel page table page is freed and reused. This addresses the main issue with vfree() which is a common occurrence and can be triggered by unprivileged users. While this resolves the primary problem, it doesn't address some extremely rare case related to memory unplug of memory that was present as reserved memory at boot, which cannot be triggered by unprivileged users. The discussion can be found at the link below. Enable SVA on x86 architecture since the IOMMU can now receive notification to flush the paging cache before freeing the CPU kernel page table pages. Link: https://lkml.kernel.org/r/20251022082635.2462433-9-baolu.lu@linux.intel.com Link: https://lore.kernel.org/linux-iommu/04983c62-3b1d-40d4-93ae-34ca04b827e5@intel.com/ Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Suggested-by: Jann Horn Reviewed-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Reviewed-by: Kevin Tian Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Hansen Cc: David Hildenbrand Cc: Ingo Molnar Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + drivers/iommu/iommu-sva.c | 32 ++++++++++++++++++++++++++++---- include/linux/iommu.h | 4 ++++ mm/pgtable-generic.c | 2 ++ 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fa3b616af03a..a3700766a8c0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -279,6 +279,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index a0442faad952..d236aef80a8d 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -10,6 +10,8 @@ #include "iommu-priv.h" static DEFINE_MUTEX(iommu_sva_lock); +static bool iommu_sva_present; +static LIST_HEAD(iommu_sva_mms); static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm); @@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de return ERR_PTR(-ENOSPC); } iommu_mm->pasid = pasid; + iommu_mm->mm = mm; INIT_LIST_HEAD(&iommu_mm->sva_domains); /* * Make sure the write to mm->iommu_mm is not reordered in front of @@ -77,9 +80,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (!group) return ERR_PTR(-ENODEV); - if (IS_ENABLED(CONFIG_X86)) - return ERR_PTR(-EOPNOTSUPP); - mutex_lock(&iommu_sva_lock); /* Allocate mm->pasid if necessary. 
*/ @@ -135,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1; - list_add(&domain->next, &mm->iommu_mm->sva_domains); + if (list_empty(&iommu_mm->sva_domains)) { + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = true; + list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms); + } + list_add(&domain->next, &iommu_mm->sva_domains); out: refcount_set(&handle->users, 1); mutex_unlock(&iommu_sva_lock); @@ -178,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) list_del(&domain->next); iommu_domain_free(domain); } + + if (list_empty(&iommu_mm->sva_domains)) { + list_del(&iommu_mm->mm_list_elm); + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = false; + } + mutex_unlock(&iommu_sva_lock); kfree(handle); } @@ -315,3 +327,15 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, return domain; } + +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) +{ + struct iommu_mm_data *iommu_mm; + + guard(mutex)(&iommu_sva_lock); + if (!iommu_sva_present) + return; + + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm) + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end); +} diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c30d12e16473..66e4abb2df0d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1134,7 +1134,9 @@ struct iommu_sva { struct iommu_mm_data { u32 pasid; + struct mm_struct *mm; struct list_head sva_domains; + struct list_head mm_list_elm; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); @@ -1615,6 +1617,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1639,6 +1642,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} +static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {} #endif /* CONFIG_IOMMU_SVA */ #ifdef CONFIG_IOMMU_IOPF diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 1c7caa8ef164..8c22be79b734 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -430,6 +431,7 @@ static void kernel_pgtable_work_func(struct work_struct *work) list_splice_tail_init(&kernel_pgtable_work.list, &page_list); spin_unlock(&kernel_pgtable_work.lock); + iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL); list_for_each_entry_safe(pt, next, &page_list, pt_list) __pagetable_free(pt); } From eca1fba23344ef6e289548bddd59127734873264 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 22 Oct 2025 04:29:51 +0100 Subject: [PATCH 103/321] mm/debug_vm_pgtable: add [pte|pmd]_mkwrite_novma() tests Add some [pte|pmd]_mkwrite_novma() relevant tests. 
[anshuman.khandual@arm.com: add a new test combination per Huang Ying] Link: https://lkml.kernel.org/r/20251024013137.136926-1-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/20251022032951.3498553-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: Catalin Marinas Reviewed-by: Catalin Marinas Reviewed-by: Huang Ying Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 830107b6dd08..133543ca2821 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -102,6 +102,12 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte, args->vma)))); WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte)))); WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte)))); + + WARN_ON(!pte_dirty(pte_mkwrite_novma(pte_mkdirty(pte)))); + WARN_ON(pte_dirty(pte_mkwrite_novma(pte_mkclean(pte)))); + WARN_ON(!pte_write(pte_mkdirty(pte_mkwrite_novma(pte)))); + WARN_ON(!pte_write(pte_mkwrite_novma(pte_wrprotect(pte)))); + WARN_ON(pte_write(pte_wrprotect(pte_mkwrite_novma(pte)))); } static void __init pte_advanced_tests(struct pgtable_debug_args *args) @@ -195,6 +201,13 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd, args->vma)))); WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd)))); WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd)))); + + WARN_ON(!pmd_dirty(pmd_mkwrite_novma(pmd_mkdirty(pmd)))); + WARN_ON(pmd_dirty(pmd_mkwrite_novma(pmd_mkclean(pmd)))); + WARN_ON(!pmd_write(pmd_mkdirty(pmd_mkwrite_novma(pmd)))); + WARN_ON(!pmd_write(pmd_mkwrite_novma(pmd_wrprotect(pmd)))); + WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite_novma(pmd)))); + /* * A huge page does not point to next level page table * entry. Hence this must qualify as pmd_bad(). From ef0258857d40dd8e33c52e28ea3a1c82099a6879 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 21 Oct 2025 21:21:39 +0000 Subject: [PATCH 104/321] mm/huge_memory: avoid reinvoking folio_test_anon() Patch series "mm/huge_memory: cleanup __split_unmapped_folio()", v3. This patch series cleans up and optimizes the internal logic of the __split_unmapped_folio() function. The goal is to improve clarity and efficiency by eliminating redundant checks, caching stable attribute values, and simplifying the iteration logic used for updating folio statistics. These changes make the code easier to follow and maintain. The split_huge_page_test selftest pass. This patch (of 4): During the execution of __split_unmapped_folio(), the folio's anon/!anon attribute is invariant (not expected to change). Therefore, it is safe and more efficient to retrieve this attribute once at the start and reuse it throughout the function. 
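A minimal standalone sketch of the pattern (illustrative names only, not the kernel code): the anon/!anon attribute is invariant for the whole operation, so it is read once up front and the cached value is reused inside the loop instead of re-evaluating the predicate on every iteration.

#include <stdbool.h>
#include <stdio.h>

static bool is_anon_folio(int folio_id)
{
	return folio_id % 2;			/* stand-in for folio_test_anon() */
}

static void account_split(int folio_id, int steps)
{
	const bool is_anon = is_anon_folio(folio_id);	/* queried exactly once */

	for (int i = 0; i < steps; i++)
		if (is_anon)
			printf("step %d: adjust anon counters\n", i);
}

int main(void)
{
	account_split(3, 4);
	return 0;
}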
Link: https://lkml.kernel.org/r/20251021212142.25766-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20251016004613.514-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20251016004613.514-2-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20251021212142.25766-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Zi Yan Reviewed-by: wang lian Reviewed-by: Barry Song Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Cc: Nico Pache Signed-off-by: Andrew Morton --- mm/huge_memory.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ffdf2ccf8269..5d95e3462c43 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3401,6 +3401,7 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, struct page *split_at, struct xa_state *xas, struct address_space *mapping, bool uniform_split) { + const bool is_anon = folio_test_anon(folio); int order = folio_order(folio); int start_order = uniform_split ? new_order : order - 1; bool stop_split = false; @@ -3408,7 +3409,7 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, int split_order; int ret = 0; - if (folio_test_anon(folio)) + if (is_anon) mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); /* @@ -3423,7 +3424,7 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, struct folio *new_folio; /* order-1 anonymous folio is not supported */ - if (folio_test_anon(folio) && split_order == 1) + if (is_anon && split_order == 1) continue; if (uniform_split && split_order != new_order) continue; @@ -3475,7 +3476,7 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, if (split_order != new_order && !stop_split) continue; } - if (folio_test_anon(new_folio)) + if (is_anon) mod_mthp_stat(folio_order(new_folio), MTHP_STAT_NR_ANON, 1); } From 092ef3899707ac2a5ad195b96bcfb3c85d97f25c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 21 Oct 2025 21:21:40 +0000 Subject: [PATCH 105/321] mm/huge_memory: update folio stat after successful split The current implementation complicates this process: * It iterates over the resulting new folios. * It uses a flag (@stop_split) to conditionally skip updating the stat for the folio at @split_at during the loop. * It then attempts to update the skipped stat on a subsequent failure path. This logic is unnecessarily hard to follow. This commit refactors the code to update the folio statistics only after a successful split. This makes the logic much cleaner and sets the stage for further simplification of the stat-handling code. Link: https://lkml.kernel.org/r/20251021212142.25766-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: wang lian Reviewed-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/huge_memory.c | 44 +++++++++++--------------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5d95e3462c43..85c472fbcbfa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3404,20 +3404,15 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, const bool is_anon = folio_test_anon(folio); int order = folio_order(folio); int start_order = uniform_split ? 
new_order : order - 1; - bool stop_split = false; struct folio *next; int split_order; - int ret = 0; - - if (is_anon) - mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); /* * split to new_order one order at a time. For uniform split, * folio is split to new_order directly. */ for (split_order = start_order; - split_order >= new_order && !stop_split; + split_order >= new_order; split_order--) { struct folio *end_folio = folio_next(folio); int old_order = folio_order(folio); @@ -3440,49 +3435,32 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, else { xas_set_order(xas, folio->index, split_order); xas_try_split(xas, folio, old_order); - if (xas_error(xas)) { - ret = xas_error(xas); - stop_split = true; - } + if (xas_error(xas)) + return xas_error(xas); } } - if (!stop_split) { - folio_split_memcg_refs(folio, old_order, split_order); - split_page_owner(&folio->page, old_order, split_order); - pgalloc_tag_split(folio, old_order, split_order); - - __split_folio_to_order(folio, old_order, split_order); - } + folio_split_memcg_refs(folio, old_order, split_order); + split_page_owner(&folio->page, old_order, split_order); + pgalloc_tag_split(folio, old_order, split_order); + __split_folio_to_order(folio, old_order, split_order); + if (is_anon) + mod_mthp_stat(old_order, MTHP_STAT_NR_ANON, -1); /* * Iterate through after-split folios and update folio stats. - * But in buddy allocator like split, the folio - * containing the specified page is skipped until its order - * is new_order, since the folio will be worked on in next - * iteration. */ for (new_folio = folio; new_folio != end_folio; new_folio = next) { next = folio_next(new_folio); - /* - * for buddy allocator like split, new_folio containing - * @split_at page could be split again, thus do not - * change stats yet. Wait until new_folio's order is - * @new_order or stop_split is set to true by the above - * xas_split() failure. - */ - if (new_folio == page_folio(split_at)) { + if (new_folio == page_folio(split_at)) folio = new_folio; - if (split_order != new_order && !stop_split) - continue; - } if (is_anon) mod_mthp_stat(folio_order(new_folio), MTHP_STAT_NR_ANON, 1); } } - return ret; + return 0; } bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, From fc4f15ee0bcd56f40ad1df723ac9889d7e82f5a0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 21 Oct 2025 21:21:41 +0000 Subject: [PATCH 106/321] mm/huge_memory: optimize and simplify folio stat update after split The loop executed after a successful folio split currently has two combined responsibilities: * updating statistics for the new folios * determining the folio for the next split iteration. This commit refactors the logic to directly calculate and update folio statistics, eliminating the need for the iteration step. We can do this because all necessary information is already available: * All resulting new folios have the same order, which is @split_order. * The exact number of new folios can be calculated directly using @old_order and @split_order. * The folio for the subsequent split is simply the one containing @split_at. By leveraging this knowledge, we can achieve the stat update more cleanly and efficiently without the looping logic. 
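A minimal standalone sketch of the resulting control flow (illustrative names, not the kernel implementation): an error ends the loop immediately, and statistics are touched only once a step has fully succeeded, so no stop flag and no deferred fixup are needed.

#include <stdio.h>

static int try_split_step(int split_order)
{
	return split_order == 2 ? -1 : 0;	/* pretend the order-2 step fails */
}

static int split_down_to(int order, int new_order)
{
	for (int split_order = order - 1; split_order >= new_order; split_order--) {
		int err = try_split_step(split_order);

		if (err)
			return err;		/* bail out; nothing to undo in the stats */
		printf("split to order %d: stats updated\n", split_order);
	}
	return 0;
}

int main(void)
{
	printf("result: %d\n", split_down_to(4, 0));
	return 0;
}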
Link: https://lkml.kernel.org/r/20251021212142.25766-4-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: wang lian Reviewed-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/huge_memory.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 85c472fbcbfa..52c4114a17f2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3404,7 +3404,6 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, const bool is_anon = folio_test_anon(folio); int order = folio_order(folio); int start_order = uniform_split ? new_order : order - 1; - struct folio *next; int split_order; /* @@ -3414,9 +3413,8 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, for (split_order = start_order; split_order >= new_order; split_order--) { - struct folio *end_folio = folio_next(folio); int old_order = folio_order(folio); - struct folio *new_folio; + int nr_new_folios = 1UL << (old_order - split_order); /* order-1 anonymous folio is not supported */ if (is_anon && split_order == 1) @@ -3445,19 +3443,11 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, pgalloc_tag_split(folio, old_order, split_order); __split_folio_to_order(folio, old_order, split_order); - if (is_anon) + if (is_anon) { mod_mthp_stat(old_order, MTHP_STAT_NR_ANON, -1); - /* - * Iterate through after-split folios and update folio stats. - */ - for (new_folio = folio; new_folio != end_folio; new_folio = next) { - next = folio_next(new_folio); - if (new_folio == page_folio(split_at)) - folio = new_folio; - if (is_anon) - mod_mthp_stat(folio_order(new_folio), - MTHP_STAT_NR_ANON, 1); + mod_mthp_stat(split_order, MTHP_STAT_NR_ANON, nr_new_folios); } + folio = page_folio(split_at); } return 0; From f0b1602871f8506c5aa704e9641b004f21918759 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 21 Oct 2025 21:21:42 +0000 Subject: [PATCH 107/321] mm/huge_memory: optimize old_order derivation during folio splitting Folio splitting requires both the folio's original order (@old_order) and the new target order (@split_order). In the current implementation, @old_order is repeatedly retrieved using folio_order(). However, for every iteration after the first, the folio being split is the result of the previous split, meaning its order is already known to be equal to the previous iteration's @split_order. This commit optimizes the logic: * Instead of calling folio_order(), we now set @old_order directly to the value of @split_order from the previous iteration. This change avoids unnecessary function calls and simplifies the loop setup. Also it removes a check for non-existent case, since for uniform splitting we only do split when @split_order == @new_order. 
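A minimal standalone model of the accounting (plain C, not the kernel code): each non-uniform step turns one folio of @old_order into 1 << (old_order - split_order) folios of @split_order, and the folio carried into the next iteration already has @split_order, so @old_order can be taken from the previous step instead of calling folio_order() again.

#include <stdio.h>

int main(void)
{
	int order = 4, new_order = 0;		/* split an order-4 folio down to order 0 */
	int old_order = order;

	for (int split_order = order - 1; split_order >= new_order; split_order--) {
		int nr_new_folios = 1 << (old_order - split_order);

		printf("order %d -> %d folio(s) of order %d\n",
		       old_order, nr_new_folios, split_order);
		old_order = split_order;	/* known without re-reading the folio */
	}
	return 0;
}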
Link: https://lkml.kernel.org/r/20251021212142.25766-5-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: wang lian Reviewed-by: Lorenzo Stoakes Cc: Zi Yan Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/huge_memory.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 52c4114a17f2..0a521cf9b10a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3402,8 +3402,8 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, struct address_space *mapping, bool uniform_split) { const bool is_anon = folio_test_anon(folio); - int order = folio_order(folio); - int start_order = uniform_split ? new_order : order - 1; + int old_order = folio_order(folio); + int start_order = uniform_split ? new_order : old_order - 1; int split_order; /* @@ -3413,14 +3413,11 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, for (split_order = start_order; split_order >= new_order; split_order--) { - int old_order = folio_order(folio); int nr_new_folios = 1UL << (old_order - split_order); /* order-1 anonymous folio is not supported */ if (is_anon && split_order == 1) continue; - if (uniform_split && split_order != new_order) - continue; if (mapping) { /* @@ -3447,7 +3444,13 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, mod_mthp_stat(old_order, MTHP_STAT_NR_ANON, -1); mod_mthp_stat(split_order, MTHP_STAT_NR_ANON, nr_new_folios); } + /* + * If uniform split, the process is complete. + * If non-uniform, continue splitting the folio at @split_at + * as long as the next @split_order is >= @new_order. + */ folio = page_folio(split_at); + old_order = split_order; } return 0; From 9fb749cd15078c7bdc46e5d45c37493f83323e33 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 24 Oct 2025 02:34:11 +0800 Subject: [PATCH 108/321] mm, swap: do not perform synchronous discard during allocation Patch series "mm, swap: misc cleanup and bugfix", v2. A few cleanups and a bugfix that are either suitable after the swap table phase I or found during code review. Patch 1 is a bugfix and needs to be included in the stable branch, the rest have no behavioral change. This patch (of 5): Since commit 1b7e90020eb77 ("mm, swap: use percpu cluster as allocation fast path"), swap allocation is protected by a local lock, which means we can't do any sleeping calls during allocation. However, the discard routine is not taken well care of. When the swap allocator failed to find any usable cluster, it would look at the pending discard cluster and try to issue some blocking discards. It may not necessarily sleep, but the cond_resched at the bio layer indicates this is wrong when combined with a local lock. And the bio GFP flag used for discard bio is also wrong (not atomic). It's arguable whether this synchronous discard is helpful at all. In most cases, the async discard is good enough. And the swap allocator is doing very differently at organizing the clusters since the recent change, so it is very rare to see discard clusters piling up. So far, no issues have been observed or reported with typical SSD setups under months of high pressure. This issue was found during my code review. But by hacking the kernel a bit: adding a mdelay(500) in the async discard path, this issue will be observable with WARNING triggered by the wrong GFP and cond_resched in the bio layer for debug builds. 
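A userspace analogue of the constraint and of the fix that follows (a pthread spinlock standing in for the local lock; made-up names, not kernel code): nothing that may block runs while the lock is held, and the potentially-blocking discard happens only after the lock is dropped, with the allocation simply retried.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_spinlock_t alloc_lock;	/* stands in for the per-CPU local lock */
static int free_clusters;		/* stands in for usable swap clusters */
static int pending_discards = 1;	/* stands in for clusters awaiting discard */

static bool blocking_discard(void)	/* may block, so it must run unlocked */
{
	if (!pending_discards)
		return false;
	pending_discards--;
	free_clusters++;
	return true;
}

static bool try_alloc(void)
{
	bool ok;

again:
	pthread_spin_lock(&alloc_lock);
	ok = free_clusters > 0;
	if (ok)
		free_clusters--;
	pthread_spin_unlock(&alloc_lock);

	if (!ok && blocking_discard())	/* discard only after unlocking */
		goto again;
	return ok;
}

int main(void)
{
	pthread_spin_init(&alloc_lock, PTHREAD_PROCESS_PRIVATE);
	printf("allocation %s\n", try_alloc() ? "succeeded" : "failed");
	return 0;
}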
So now let's apply a hotfix for this issue: remove the synchronous discard in the swap allocation path. And when order 0 is failing with all cluster list drained on all swap devices, try to do a discard following the swap device priority list. If any discards released some cluster, try the allocation again. This way, we can still avoid OOM due to swap failure if the hardware is very slow and memory pressure is extremely high. This may cause more fragmentation issues if the discarding hardware is really slow. Ideally, we want to discard pending clusters before continuing to iterate the fragment cluster lists. This can be implemented in a cleaner way if we clean up the device list iteration part first. Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-0-a709469052e7@tencent.com Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-1-c5b0e1092927@tencent.com Fixes: 1b7e90020eb7 ("mm, swap: use percpu cluster as allocation fast path") Signed-off-by: Kairui Song Acked-by: Nhat Pham Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- mm/swapfile.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index cb2392ed8e0e..33e0bd905c55 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1101,13 +1101,6 @@ new_cluster: goto done; } - /* - * We don't have free cluster but have some clusters in discarding, - * do discard now and reclaim them. - */ - if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) - goto new_cluster; - if (order) goto done; @@ -1394,6 +1387,33 @@ start_over: return false; } +/* + * Discard pending clusters in a synchronized way when under high pressure. + * Return: true if any cluster is discarded. + */ +static bool swap_sync_discard(void) +{ + bool ret = false; + int nid = numa_node_id(); + struct swap_info_struct *si, *next; + + spin_lock(&swap_avail_lock); + plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { + spin_unlock(&swap_avail_lock); + if (get_swap_device_info(si)) { + if (si->flags & SWP_PAGE_DISCARD) + ret = swap_do_scheduled_discard(si); + put_swap_device(si); + } + if (ret) + return true; + spin_lock(&swap_avail_lock); + } + spin_unlock(&swap_avail_lock); + + return false; +} + /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap @@ -1432,11 +1452,17 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) } } +again: local_lock(&percpu_swap_cluster.lock); if (!swap_alloc_fast(&entry, order)) swap_alloc_slow(&entry, order); local_unlock(&percpu_swap_cluster.lock); + if (unlikely(!order && !entry.val)) { + if (swap_sync_discard()) + goto again; + } + /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */ if (mem_cgroup_try_charge_swap(folio, entry)) goto out_free; From e4adea27b97061e25a9db908421bf2bcc31c89eb Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 24 Oct 2025 02:00:40 +0800 Subject: [PATCH 109/321] mm, swap: rename helper for setup bad slots The name inc_cluster_info_page is very confusing, as this helper is only used during swapon to mark bad slots. Rename it properly and turn the VM_BUG_ON in it into WARN_ON to expose more potential issues. Swapon is a cold path, so adding more checks should be a good idea. No feature change except new WARN_ON. 
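A trivial userspace analogue of the check-strength change (not the kernel macros): the point is that the new check is always compiled in and reports the bad state without killing anything, which is a reasonable trade-off on a cold path such as swapon.

#include <stdio.h>

#define WARN_ON(cond) \
	((cond) ? (fprintf(stderr, "warning: %s\n", #cond), 1) : 0)

int main(void)
{
	int count = 513, max_count = 512;	/* a bogus slot count, for illustration */

	WARN_ON(count > max_count);		/* reported, but execution continues */
	printf("swapon-style setup keeps going, count=%d\n", count);
	return 0;
}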
Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-2-a709469052e7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: David Hildenbrand Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 33e0bd905c55..808052319c0b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -751,14 +751,14 @@ static void relocate_cluster(struct swap_info_struct *si, } /* - * The cluster corresponding to page_nr will be used. The cluster will not be - * added to free cluster list and its usage counter will be increased by 1. - * Only used for initialization. + * The cluster corresponding to @offset will be accounted as having one bad + * slot. The cluster will not be added to the free cluster list, and its + * usage counter will be increased by 1. Only used for initialization. */ -static int inc_cluster_info_page(struct swap_info_struct *si, - struct swap_cluster_info *cluster_info, unsigned long page_nr) +static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info, + unsigned long offset) { - unsigned long idx = page_nr / SWAPFILE_CLUSTER; + unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_table *table; struct swap_cluster_info *ci; @@ -772,8 +772,8 @@ static int inc_cluster_info_page(struct swap_info_struct *si, ci->count++; - VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); - VM_BUG_ON(ci->flags); + WARN_ON(ci->count > SWAPFILE_CLUSTER); + WARN_ON(ci->flags); return 0; } @@ -3396,7 +3396,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * See setup_swap_map(): header page, bad pages, * and the EOF part of the last cluster. */ - err = inc_cluster_info_page(si, cluster_info, 0); + err = swap_cluster_setup_bad_slot(cluster_info, 0); if (err) goto err; for (i = 0; i < swap_header->info.nr_badpages; i++) { @@ -3404,12 +3404,12 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, if (page_nr >= maxpages) continue; - err = inc_cluster_info_page(si, cluster_info, page_nr); + err = swap_cluster_setup_bad_slot(cluster_info, page_nr); if (err) goto err; } for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { - err = inc_cluster_info_page(si, cluster_info, i); + err = swap_cluster_setup_bad_slot(cluster_info, i); if (err) goto err; } From a983471cfc454afeba23526ee5d17fd8cdc7876f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 24 Oct 2025 02:00:41 +0800 Subject: [PATCH 110/321] mm, swap: cleanup swap entry allocation parameter We no longer need this GFP parameter after commit 8578e0c00dcf ("mm, swap: use the swap table for the swap cache and switch API"). Before that commit the GFP parameter is already almost identical for all callers, so nothing changed by that commit. Swap table just moved the GFP to lower layer and make it more defined and changes depend on atomic or sleep allocation. Now this parameter is no longer used, just remove it. No behavior change. 
Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-3-a709469052e7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: Baolin Wang Reviewed-by: David Hildenbrand Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/shmem.c | 2 +- mm/swapfile.c | 3 +-- mm/vmscan.c | 4 ++-- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index e818fbade1e2..a4b264817735 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -462,7 +462,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); +int folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); @@ -560,7 +560,7 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) +static inline int folio_alloc_swap(struct folio *folio) { return -EINVAL; } diff --git a/mm/shmem.c b/mm/shmem.c index da1df4270309..e1dc2d8e939c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1617,7 +1617,7 @@ try_split: folio_mark_uptodate(folio); } - if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { + if (!folio_alloc_swap(folio)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error; diff --git a/mm/swapfile.c b/mm/swapfile.c index 808052319c0b..d87b562ae661 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1417,7 +1417,6 @@ static bool swap_sync_discard(void) /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap - * @gfp: gfp mask for shadow nodes * * Allocate swap space for the folio and add the folio to the * swap cache. @@ -1425,7 +1424,7 @@ static bool swap_sync_discard(void) * Context: Caller needs to hold the folio lock. * Return: Whether the folio was added to the swap cache. */ -int folio_alloc_swap(struct folio *folio, gfp_t gfp) +int folio_alloc_swap(struct folio *folio) { unsigned int order = folio_order(folio); unsigned int size = 1 << order; diff --git a/mm/vmscan.c b/mm/vmscan.c index ecc90517b791..c23c9616052a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1318,7 +1318,7 @@ retry: split_folio_to_list(folio, folio_list)) goto activate_locked; } - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) { + if (folio_alloc_swap(folio)) { int __maybe_unused order = folio_order(folio); if (!folio_test_large(folio)) @@ -1334,7 +1334,7 @@ retry: } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) + if (folio_alloc_swap(folio)) goto activate_locked_split; } /* From ab61de9b78dda140573fb474af65f0e1ae13ff5b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 24 Oct 2025 02:00:42 +0800 Subject: [PATCH 111/321] mm/migrate, swap: drop usage of folio_index This helper was used when swap cache was mixed with page cache. Now they are completely separate from each other, access to the swap cache is all wrapped by the swap_cache_* helpers, which expect the folio's swap entry as a parameter. This helper is no longer used, remove the last redundant user and drop it. 
Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-4-a709469052e7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/migrate.c | 4 ++-- mm/swap.h | 21 --------------------- 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index c0e9f15be2a2..7c5d2efb9d47 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -562,7 +562,7 @@ unlock: static int __folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int expected_count) { - XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + XA_STATE(xas, &mapping->i_pages, folio->index); struct swap_cluster_info *ci = NULL; struct zone *oldzone, *newzone; int dirty; @@ -715,7 +715,7 @@ EXPORT_SYMBOL(folio_migrate_mapping); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) { - XA_STATE(xas, &mapping->i_pages, folio_index(src)); + XA_STATE(xas, &mapping->i_pages, src->index); int rc, expected_count = folio_expected_ref_count(src) + 1; if (folio_ref_count(src) != expected_count) diff --git a/mm/swap.h b/mm/swap.h index 8d8efdf1297a..d034c13d8dd2 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -445,25 +445,4 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) return 0; } #endif /* CONFIG_SWAP */ - -/** - * folio_index - File index of a folio. - * @folio: The folio. - * - * For a folio which is either in the page cache or the swap cache, - * return its index within the address_space it belongs to. If you know - * the folio is definitely in the page cache, you can look at the folio's - * index directly. - * - * Return: The index (offset in units of pages) of a folio in its file. - */ -static inline pgoff_t folio_index(struct folio *folio) -{ -#ifdef CONFIG_SWAP - if (unlikely(folio_test_swapcache(folio))) - return swp_offset(folio->swap); -#endif - return folio->index; -} - #endif /* _MM_SWAP_H */ From 4fd58b51ef090e7c8d67a2bc98d994616af3c449 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 24 Oct 2025 02:00:43 +0800 Subject: [PATCH 112/321] mm, swap: remove redundant argument for isolating a cluster The order argument was introduced by an intermediate commit and was then never used, just remove it. Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-5-a709469052e7@tencent.com Signed-off-by: Kairui Song Acked-by: Nhat Pham Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index d87b562ae661..125d893bb706 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -594,7 +594,7 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info * this returns NULL for an non-empty list. 
*/ static struct swap_cluster_info *isolate_lock_cluster( - struct swap_info_struct *si, struct list_head *list, int order) + struct swap_info_struct *si, struct list_head *list) { struct swap_cluster_info *ci, *found = NULL; @@ -957,7 +957,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, unsigned int found = SWAP_ENTRY_INVALID; do { - struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order); + struct swap_cluster_info *ci = isolate_lock_cluster(si, list); unsigned long offset; if (!ci) @@ -982,7 +982,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (force) to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; - while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) { + while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; From 50ca6423643cdb26871970b4f8870b4940b4c498 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Oct 2025 11:22:06 -0700 Subject: [PATCH 113/321] mm/damon/core: fix wrong comment of damon_call() return timing Patch series "mm/damon: misc documentation fixups". First three patches fix up issues in the documents, including wrong explanation of a behavior, wrong link, and a contextual typo. Following five patches update documents for not yet documented features and behaviors. This patch (of 8): damon_call() works asynchronously and synchronously for repeat and non-repeat mode requests, respectively. The comment about the behavior is wrong, though. Fix it. The wrong comment was introduced together with the repeat mode, by commit 43df7676e550 ("mm/damon/core: introduce repeat mode damon_call()"). Link: https://lkml.kernel.org/r/20251026182216.118200-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251026182216.118200-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 82546d138a5a..769da97fcb26 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1440,7 +1440,7 @@ bool damon_is_running(struct damon_ctx *ctx) * Ask DAMON worker thread (kdamond) of @ctx to call a function with an * argument data that respectively passed via &damon_call_control->fn and * &damon_call_control->data of @control. If &damon_call_control->repeat of - * @control is set, further wait until the kdamond finishes handling of the + * @control is unset, further wait until the kdamond finishes handling of the * request. Otherwise, return as soon as the request is made. * * The kdamond executes the function with the argument in the main loop, just From d7484f6edd31a26a1d32aa1e8e20df40dbc1cb7d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Oct 2025 11:22:07 -0700 Subject: [PATCH 114/321] Docs/mm/damon/design: fix wrong link to intervals goal section Commit b243d666d107 ("Docs/admin-guide/mm/damon/usage: add intervals_goal directory on the hierarchy") mistakenly added a wrong reference for intervals goal usage documentation on the design document. Fix it. 
Link: https://lkml.kernel.org/r/20251026182216.118200-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index b54925ea78e9..2d8d8ca1e0a3 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -381,8 +381,8 @@ That is, assumes 4% (20% of 20%) DAMON-observed access events ratio (source) to capture 64% (80% multipled by 80%) real access events (outcomes). To know how user-space can use this feature via :ref:`DAMON sysfs interface -`, refer to :ref:`intervals_goal ` part of -the documentation. +`, refer to :ref:`intervals_goal +` part of the documentation. .. _damon_design_damos: From a01386c16dc2d4ef3284c415cfe414f8137e0ea1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Oct 2025 11:22:08 -0700 Subject: [PATCH 115/321] Docs/admin-guide/mm/damon/stat: fix a typo: s/sampling events/sampling interval/ It is a contextual typo. Fix it. Link: https://lkml.kernel.org/r/20251026182216.118200-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/stat.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/stat.rst b/Documentation/admin-guide/mm/damon/stat.rst index 4c517c2c219a..20f540a9d3d2 100644 --- a/Documentation/admin-guide/mm/damon/stat.rst +++ b/Documentation/admin-guide/mm/damon/stat.rst @@ -17,7 +17,7 @@ DAMON_STAT uses monitoring intervals :ref:`auto-tuning ` to make its accuracy high and overhead minimum. It auto-tunes the intervals aiming 4 % of observable access events to be captured in each snapshot, while limiting the resulting sampling -events to be 5 milliseconds in minimum and 10 seconds in maximum. On a few +interval to be 5 milliseconds in minimum and 10 seconds in maximum. On a few production server systems, it resulted in consuming only 0.x % single CPU time, while capturing reasonable quality of access patterns. From 29221406f09d8c4b523ffea86c5c69f628c2f6db Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Oct 2025 11:22:09 -0700 Subject: [PATCH 116/321] Docs/admin-guide/mm/damon/usage: document empty target regions commit behavior Committing a monitoring target with empty target regions is for keeping the current monitoring results. This behavior was introduced by commit 973233600676 ("mm/damon/sysfs: update monitoring target regions for online input commit"). The behavior is not documented, though. Update the usage document for clarifying this behavior. 
Link: https://lkml.kernel.org/r/20251026182216.118200-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 98958975604d..c630f2662695 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -134,7 +134,8 @@ Users can write below commands for the kdamond to the ``state`` file. - ``on``: Start running. - ``off``: Stop running. - ``commit``: Read the user inputs in the sysfs files except ``state`` file - again. + again. Monitoring :ref:`target region ` inputs are also be + ignored if no target region is specified. - ``update_tuned_intervals``: Update the contents of ``sample_us`` and ``aggr_us`` files of the kdamond with the auto-tuning applied ``sampling interval`` and ``aggregation interval`` for the files. Please refer to @@ -289,6 +290,11 @@ In the beginning, this directory has only one file, ``nr_regions``. Writing a number (``N``) to the file creates the number of child directories named ``0`` to ``N-1``. Each directory represents each initial monitoring target region. +If ``nr_regions`` is zero when committing new DAMON parameters online (writing +``commit`` to ``state`` file of :ref:`kdamond `), the commit +logic ignores the target regions. In other words, the current monitoring +results for the target are preserved. + .. _sysfs_region: regions// From bb01656e003de155f8575483089c66f89f535026 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Oct 2025 11:22:10 -0700 Subject: [PATCH 117/321] Docs/admin-guide/mm/damon/reclaim: document addr_unit parameter Commit 7db551fcfb2a ("mm/damon/reclaim: support addr_unit for DAMON_RECLAIM") introduced the 'addr_unit' parameter for DAMON_RECLAIM. But the usage document is not updated for that. Update the document. Link: https://lkml.kernel.org/r/20251026182216.118200-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- .../admin-guide/mm/damon/reclaim.rst | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index af05ae617018..8eba3da8dcee 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -232,6 +232,28 @@ The end physical address of memory region that DAMON_RECLAIM will do work against. That is, DAMON_RECLAIM will find cold memory regions in this region and reclaims. By default, biggest System RAM is used as the region. +addr_unit +--------- + +A scale factor for memory addresses and bytes. + +This parameter is for setting and getting the :ref:`address unit +` parameter of the DAMON instance for DAMON_RECLAIM. + +``monitor_region_start`` and ``monitor_region_end`` should be provided in this +unit. For example, let's suppose ``addr_unit``, ``monitor_region_start`` and +``monitor_region_end`` are set as ``1024``, ``0`` and ``10``, respectively. 
+Then DAMON_RECLAIM will work for 10 KiB length of physical address range that +starts from address zero (``[0 * 1024, 10 * 1024)`` in bytes). + +``bytes_reclaim_tried_regions`` and ``bytes_reclaimed_regions`` are also in +this unit. For example, let's suppose values of ``addr_unit``, +``bytes_reclaim_tried_regions`` and ``bytes_reclaimed_regions`` are ``1024``, +``42``, and ``32``, respectively. Then it means DAMON_RECLAIM tried to reclaim +42 KiB memory and successfully reclaimed 32 KiB memory in total. + +If unsure, use only the default value (``1``) and forget about this. + skip_anon --------- From 448666e418bfe456bbc629f978c8b6d6c5027212 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Oct 2025 11:22:11 -0700 Subject: [PATCH 118/321] Docs/admin-guide/mm/damon/lru_sort: document addr_unit parameter Commit 2e0fe9245d6b ("mm/damon/lru_sort: support addr_unit for DAMON_LRU_SORT") introduced the 'addr_unit' parameter for DAMON_LRU_SORT. But the usage document is not updated for that. Update the document. Link: https://lkml.kernel.org/r/20251026182216.118200-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- .../admin-guide/mm/damon/lru_sort.rst | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst index 7b0775d281b4..72a943202676 100644 --- a/Documentation/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -211,6 +211,28 @@ End of target memory region in physical address. The end physical address of memory region that DAMON_LRU_SORT will do work against. By default, biggest System RAM is used as the region. +addr_unit +--------- + +A scale factor for memory addresses and bytes. + +This parameter is for setting and getting the :ref:`address unit +` parameter of the DAMON instance for DAMON_RECLAIM. + +``monitor_region_start`` and ``monitor_region_end`` should be provided in this +unit. For example, let's suppose ``addr_unit``, ``monitor_region_start`` and +``monitor_region_end`` are set as ``1024``, ``0`` and ``10``, respectively. +Then DAMON_LRU_SORT will work for 10 KiB length of physical address range that +starts from address zero (``[0 * 1024, 10 * 1024)`` in bytes). + +Stat parameters having ``bytes_`` prefix are also in this unit. For example, +let's suppose values of ``addr_unit``, ``bytes_lru_sort_tried_hot_regions`` and +``bytes_lru_sorted_hot_regions`` are ``1024``, ``42``, and ``32``, +respectively. Then it means DAMON_LRU_SORT tried to LRU-sort 42 KiB of hot +memory and successfully LRU-sorted 32 KiB of the memory in total. + +If unsure, use only the default value (``1``) and forget about this. + kdamond_pid ----------- From da8644a476f5af7d282e304daf43f5c78c0acf13 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Oct 2025 11:22:12 -0700 Subject: [PATCH 119/321] Docs/admin-guide/mm/damon/stat: document aggr_interval_us parameter Commit cc7ceb1d14b0 ("mm/damon/stat: expose the current tuned aggregation interval"), has introduced 'aggr_interval_us' parameter for DAMON_STAT. But the new parameter is not yet documented. Document it on the usage document for the module. 
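A small standalone illustration of the scaling described in these addr_unit sections (plain C, reusing the documented example values): region boundaries and the bytes_* statistics are all expressed in multiples of addr_unit.

#include <stdio.h>

int main(void)
{
	unsigned long addr_unit = 1024;
	unsigned long region_start = 0, region_end = 10;	/* in addr_unit units */
	unsigned long bytes_tried = 42, bytes_done = 32;	/* bytes_* stats, same unit */

	printf("monitored range: [%lu, %lu) bytes\n",
	       region_start * addr_unit, region_end * addr_unit);
	printf("tried on %lu bytes, applied to %lu bytes\n",
	       bytes_tried * addr_unit, bytes_done * addr_unit);
	return 0;
}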
Link: https://lkml.kernel.org/r/20251026182216.118200-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/stat.rst | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/stat.rst b/Documentation/admin-guide/mm/damon/stat.rst index 20f540a9d3d2..754f98d47617 100644 --- a/Documentation/admin-guide/mm/damon/stat.rst +++ b/Documentation/admin-guide/mm/damon/stat.rst @@ -10,6 +10,8 @@ on the system's entire physical memory using DAMON, and provides simplified access monitoring results statistics, namely idle time percentiles and estimated memory bandwidth. +.. _damon_stat_monitoring_accuracy_overhead: + Monitoring Accuracy and Overhead ================================ @@ -19,7 +21,9 @@ overhead minimum. It auto-tunes the intervals aiming 4 % of observable access events to be captured in each snapshot, while limiting the resulting sampling interval to be 5 milliseconds in minimum and 10 seconds in maximum. On a few production server systems, it resulted in consuming only 0.x % single CPU time, -while capturing reasonable quality of access patterns. +while capturing reasonable quality of access patterns. The tuning-resulting +intervals can be retrieved via ``aggr_interval_us`` :ref:`parameter +`. Interface: Module Parameters ============================ @@ -41,6 +45,18 @@ You can enable DAMON_STAT by setting the value of this parameter as ``Y``. Setting it as ``N`` disables DAMON_STAT. The default value is set by ``CONFIG_DAMON_STAT_ENABLED_DEFAULT`` build config option. +.. _damon_stat_aggr_interval_us: + +aggr_interval_us +---------------- + +Auto-tuned aggregation time interval in microseconds. + +Users can read the aggregation interval of DAMON that is being used by the +DAMON instance for DAMON_STAT. It is :ref:`auto-tuned +` and therefore the value is +dynamically changed. + estimated_memory_bandwidth -------------------------- From f46dbea0d95668dbbaf0aaa5f2c3aabf8f4d8fda Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Oct 2025 11:22:13 -0700 Subject: [PATCH 120/321] Docs/admin-guide/mm/damon/stat: document negative idle time Commit a983a26d5298 ("mm/damon/stat: expose negative idle time") introduced the negative idle time feature for DAMON_STAT. But it is not documented. Document it on the usage document. Link: https://lkml.kernel.org/r/20251026182216.118200-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/stat.rst | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/stat.rst b/Documentation/admin-guide/mm/damon/stat.rst index 754f98d47617..e5a5a2c4f803 100644 --- a/Documentation/admin-guide/mm/damon/stat.rst +++ b/Documentation/admin-guide/mm/damon/stat.rst @@ -74,12 +74,13 @@ memory_idle_ms_percentiles Per-byte idle time (milliseconds) percentiles of the system. DAMON_STAT calculates how long each byte of the memory was not accessed until -now (idle time), based on the current DAMON results snapshot. 
If DAMON found a -region of access frequency (nr_accesses) larger than zero, every byte of the -region gets zero idle time. If a region has zero access frequency -(nr_accesses), how long the region was keeping the zero access frequency (age) -becomes the idle time of every byte of the region. Then, DAMON_STAT exposes -the percentiles of the idle time values via this read-only parameter. Reading -the parameter returns 101 idle time values in milliseconds, separated by comma. +now (idle time), based on the current DAMON results snapshot. For regions +having access frequency (nr_accesses) larger than zero, how long the current +access frequency level was kept multiplied by ``-1`` becomes the idlee time of +every byte of the region. If a region has zero access frequency (nr_accesses), +how long the region was keeping the zero access frequency (age) becomes the +idle time of every byte of the region. Then, DAMON_STAT exposes the +percentiles of the idle time values via this read-only parameter. Reading the +parameter returns 101 idle time values in milliseconds, separated by comma. Each value represents 0-th, 1st, 2nd, 3rd, ..., 99th and 100th percentile idle times. From e97d7c5165227e6e4423b4a869d7805b01706392 Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Thu, 23 Oct 2025 18:12:02 +0000 Subject: [PATCH 121/321] mm: shmem/tmpfs hugepage defaults config choice Allow to override defaults for shemem and tmpfs at config time. This is consistent with how transparent hugepages can be configured. Same results can be achieved with the existing 'transparent_hugepage_shmem' and 'transparent_hugepage_tmpfs' settings in the kernel command line, but it is more convenient to define basic settings at config time instead of changing kernel command line later. Defaults for shmem and tmpfs were not changed. They are remained the same as before: 'never' for both cases. Options 'deny' and 'force' are omitted intentionally since these are special values and supposed to be used for emergencies or testing and are not expected to be permanent ones. Primary motivation for adding config option is to enable policy enforcement at build time. In large-scale production environments (Meta's for example), the kernel configuration is often maintained centrally close to the kernel code itself and owned by the kernel engineers, while boot parameters are managed independently (e.g. by provisioning systems). In such setups, the kernel build defines the supported and expected behavior in a single place, but there is no reliable or uniform control over the kernel command line options. A build-time default allows kernel integrators to enforce a predictable hugepage policy for shmem/tmpfs on a base layer, ensuring reproducible behavior and avoiding configuration drift caused by possible boot-time differences. In short, primary benefit is mostly operational: it provides a way to codify preferred policy in the kernel configuration, which is versioned, reviewed, and tested as part of the kernel build process, rather than depending on potentially variable boot parameters. 
[d@ilvokhin.com: v2] Link: https://lkml.kernel.org/r/aQECPpjd-fU_TC79@shell.ilvokhin.com Link: https://lkml.kernel.org/r/aPpv8sAa2sYgNu3L@shell.ilvokhin.com Signed-off-by: Dmitry Ilvokhin Reviewed-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Baolin Wang Acked-by: Kiryl Shutsemau Cc: David Hildenbrand Cc: Hugh Dickins Cc: Liam Howlett Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 5 ++ mm/Kconfig | 91 ++++++++++++++++++++++ mm/shmem.c | 33 +++++++- 3 files changed, 127 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 1654211cc6cf..5fbc3d89bb07 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -381,6 +381,11 @@ hugepage allocation policy for the tmpfs mount by using the kernel parameter four valid policies for tmpfs (``always``, ``within_size``, ``advise``, ``never``). The tmpfs mount default policy is ``never``. +Additionally, Kconfig options are available to set the default hugepage +policies for shmem (``CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_*``) and tmpfs +(``CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_*``) at build time. Refer to the +Kconfig help for more details. + In the same manner as ``thp_anon`` controls each supported anonymous THP size, ``thp_shmem`` controls each supported shmem THP size. ``thp_shmem`` has the same format as ``thp_anon``, but also supports the policy diff --git a/mm/Kconfig b/mm/Kconfig index 682a5c39a1a6..eae03b14f7de 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -853,6 +853,97 @@ choice enabled at runtime via sysfs. endchoice +choice + prompt "Shmem hugepage allocation defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER + help + Selects the hugepage allocation policy defaults for + the internal shmem mount. + + The selection made here can be overridden by using the kernel + command line 'transparent_hugepage_shmem=' option. + + config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER + bool "never" + help + Disable hugepage allocation for shmem mount by default. It can + still be enabled with the kernel command line + 'transparent_hugepage_shmem=' option or at runtime via sysfs + knob. Note that madvise(MADV_COLLAPSE) can still cause + transparent huge pages to be obtained even if this mode is + specified. + + config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS + bool "always" + help + Always attempt to allocate hugepage for shmem mount, can + increase the memory footprint of applications without a + guaranteed benefit but it will work automatically for all + applications. + + config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE + bool "within_size" + help + Enable hugepage allocation for shmem mount if the allocation + will be fully within the i_size. This configuration also takes + into account any madvise(MADV_HUGEPAGE) hints that may be + provided by the applications. + + config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE + bool "advise" + help + Enable hugepage allocation for the shmem mount exclusively when + applications supply the madvise(MADV_HUGEPAGE) hint. + This ensures that hugepages are used only in response to explicit + requests from applications. +endchoice + +choice + prompt "Tmpfs hugepage allocation defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER + help + Selects the hugepage allocation policy defaults for + the tmpfs mount. 
+ + The selection made here can be overridden by using the kernel + command line 'transparent_hugepage_tmpfs=' option. + + config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER + bool "never" + help + Disable hugepage allocation for tmpfs mount by default. It can + still be enabled with the kernel command line + 'transparent_hugepage_tmpfs=' option. Note that + madvise(MADV_COLLAPSE) can still cause transparent huge pages + to be obtained even if this mode is specified. + + config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS + bool "always" + help + Always attempt to allocate hugepage for tmpfs mount, can + increase the memory footprint of applications without a + guaranteed benefit but it will work automatically for all + applications. + + config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE + bool "within_size" + help + Enable hugepage allocation for tmpfs mount if the allocation + will be fully within the i_size. This configuration also takes + into account any madvise(MADV_HUGEPAGE) hints that may be + provided by the applications. + + config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE + bool "advise" + help + Enable hugepage allocation for the tmpfs mount exclusively when + applications supply the madvise(MADV_HUGEPAGE) hint. + This ensures that hugepages are used only in response to explicit + requests from applications. +endchoice + config THP_SWAP def_bool y depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT diff --git a/mm/shmem.c b/mm/shmem.c index e1dc2d8e939c..6580f3cd24bb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -570,8 +570,37 @@ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ -static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; +#if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE +#else +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER +#endif + +static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT; + +#undef SHMEM_HUGE_DEFAULT + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE +#else +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER +#endif + +static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT; + +#undef TMPFS_HUGE_DEFAULT static unsigned int shmem_get_orders_within_size(struct inode *inode, unsigned long within_size_orders, pgoff_t index, From adf7d6cdd716e1f3826789befc453c961dfafcf2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:25 -0700 Subject: [PATCH 122/321] mm/damon/core: add damon_target->obsolete for pin-point removal Patch series "mm/damon: support pin-point targets removal". DAMON maintains the targets in a list, and allows committing only an entire list of targets having the new parameters. 
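A standalone model of the commit semantics this series adds, described in more detail below (plain C with made-up data, not the kernel implementation): source and destination targets pair up by list index, and a source entry flagged obsolete removes its destination instead of updating it.

#include <stdbool.h>
#include <stdio.h>

struct tgt { int pid; bool obsolete; };

int main(void)
{
	struct tgt dst[] = { { 100, false }, { 200, false }, { 300, false } };
	struct tgt src[] = { { 100, false }, { 200, true  }, { 300, false } };
	int nr = 3;

	for (int i = 0; i < nr; i++) {
		if (src[i].obsolete)
			printf("index %d: remove existing target (pid %d)\n",
			       i, dst[i].pid);
		else
			printf("index %d: update target pid %d from source pid %d\n",
			       i, dst[i].pid, src[i].pid);
	}
	return 0;
}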
Targets having same index on the lists are treated as matching source and destination targets. If an existing target cannot find a matching one in the sources list, the target is removed. This means that there is no way to remove only a specific monitoring target in the middle of the current targets list. Such pin-point target removal is really needed in some use cases, though. Monitoring access patterns on virtual address spaces of processes that spawned from the same ancestor is one example. If a process of the group is terminated, the user may want to remove the matching DAMON target as soon as possible, to save in-kernel memory usage for the unnecessary target data. The user may also want to do that without turning DAMON off or removing unnecessary targets, to keep the current monitoring results for other active processes. Extend DAMON kernel API and sysfs ABI to support the pin-point removal in the following way. For API, add a new damon_target field, namely 'obsolete'. If the field on parameters commit source target is set, it means the matching destination target is obsolete. Then the parameters commit logic removes the destination target from the existing targets list. For sysfs ABI, add a new file under the target directory, namely 'obsolete_target'. It is connected with the 'obsolete' field of the commit source targets, so internally using the new API. Also add a selftest for the new feature. The related helper scripts for manipulating the sysfs interface and dumping in-kernel DAMON status are also extended for this. Note that the selftest part was initially posted as an individual RFC series [1], but now merged into this one. Bijan Tabatabai has originally reported this issue, and participated in this solution design on a GitHub issue [1] for DAMON user-space tool. This patch (of 9): DAMON's monitoring targets parameters update function, damon_commit_targets(), is not providing a way to remove a target in the middle of the existing targets list. Extend the API by adding a field to struct damon_target. If the field of a damon_commit_targets() source target is set, it indicates the matching target on the existing targets list is obsolete. damon_commit_targets() understands that and removes those from the list, while respecting the index based matching for other non-obsolete targets. Link: https://lkml.kernel.org/r/20251023012535.69625-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251023012535.69625-2-sj@kernel.org Link: https://github.com/damonitor/damo/issues/36 [1] Signed-off-by: SeongJae Park Reviewed-by: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 9ee026c2db53..f3566b978cdf 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,17 +91,23 @@ struct damon_region { * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. + * @obsolete: Whether the commit destination target is obsolete. * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. 
The * @pid should be set for appropriate &struct damon_operations including the * virtual address spaces monitoring operations. + * + * @obsolete is used only for damon_commit_targets() source targets, to specify + * the matching destination targets are obsolete. Read damon_commit_targets() + * to see how it is handled. */ struct damon_target { struct pid *pid; unsigned int nr_regions; struct list_head regions_list; struct list_head list; + bool obsolete; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index 769da97fcb26..06ad359024ad 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -479,6 +479,7 @@ struct damon_target *damon_new_target(void) t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); INIT_LIST_HEAD(&t->list); + t->obsolete = false; return t; } @@ -1187,7 +1188,11 @@ static int damon_commit_targets( damon_for_each_target_safe(dst_target, next, dst) { src_target = damon_nth_target(i++, src); - if (src_target) { + /* + * If src target is obsolete, do not commit the parameters to + * the dst target, and further remove the dst target. + */ + if (src_target && !src_target->obsolete) { err = damon_commit_target( dst_target, damon_target_has_pid(dst), src_target, damon_target_has_pid(src), @@ -1210,6 +1215,9 @@ static int damon_commit_targets( damon_for_each_target_safe(src_target, next, src) { if (j++ < i) continue; + /* target to remove has no matching dst */ + if (src_target->obsolete) + return -EINVAL; new_target = damon_new_target(); if (!new_target) return -ENOMEM; From 60bd24f272d00302a98cd8cb4cf3ab8189c82193 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:26 -0700 Subject: [PATCH 123/321] mm/damon/sysfs: test commit input against realistic destination DAMON sysfs interface tests if given online parameters update request is valid, by committing those using the DAMON kernel API, to a test-purpose destination context. The test-purpose destination context is constructed using damon_new_ctx(), so it has no target, no scheme. If a source target has the obsolete field set, the test-purpose commit will fail because damon_commit_targets() fails when there is a source obsolete target that cannot find its matching destination target. DAMON sysfs interface is not letting users set the field for now, so there is no problem. However, the following commit will support that. Also there could be similar future changes that making commit fails based on current context structure. Make the test purpose commit destination context similar to the current running one, by committing the running one to the test purpose context, before doing the real test-purpose commit. Link: https://lkml.kernel.org/r/20251023012535.69625-3-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 0ecd8fb84101..92b53b7ca144 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1451,6 +1451,26 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, static struct damon_ctx *damon_sysfs_build_ctx( struct damon_sysfs_context *sys_ctx); +/* + * Return a new damon_ctx for testing new parameters to commit. 
+ */ +static struct damon_ctx *damon_sysfs_new_test_ctx( + struct damon_ctx *running_ctx) +{ + struct damon_ctx *test_ctx; + int err; + + test_ctx = damon_new_ctx(); + if (!test_ctx) + return NULL; + err = damon_commit_ctx(test_ctx, running_ctx); + if (err) { + damon_destroy_ctx(test_ctx); + return NULL; + } + return test_ctx; +} + /* * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. * @kdamond: The kobject wrapper for the associated kdamond. @@ -1472,7 +1492,7 @@ static int damon_sysfs_commit_input(void *data) param_ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); if (IS_ERR(param_ctx)) return PTR_ERR(param_ctx); - test_ctx = damon_new_ctx(); + test_ctx = damon_sysfs_new_test_ctx(kdamond->damon_ctx); if (!test_ctx) return -ENOMEM; err = damon_commit_ctx(test_ctx, param_ctx); From e35afdf228ccaaafe3baf8034429448ae505be47 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:27 -0700 Subject: [PATCH 124/321] mm/damon/sysfs: implement obsolete_target file There is no good way to remove DAMON targets in the middle of the existing targets list. It restricts efficient and flexible DAMON use cases. Improve the usability by implementing a new DAMON sysfs interface file, namely obsolete_target, under each target directory. It is connected to the obsolete field of parameters commit-source targets, so allows removing arbitrary targets in the middle of existing targets list. Note that the sysfs files are not automatically updated. For example, let's suppose there are three targets in the running context, and a user removes the third target using this feature. If the user writes 'commit' to the kdamond 'state' file again, DAMON sysfs interface will again try to remove the third target. But because there is no matching target in the running context, the commit will fail. It is the user's responsibility to understand resulting DAMON internal targets list change, and construct sysfs files (using nr_targets and other sysfs files) to correctly represent it. Also note that this is arguably an improvement rather than a fix of broken things. Link: https://lkml.kernel.org/r/20251023012535.69625-4-sj@kernel.org Signed-off-by: SeongJae Park Reported-by: Bijan Tabatabai Closes: https://github.com/damonitor/damo/issues/36 Reviewed-by: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 92b53b7ca144..e2bd2d7becdd 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -212,6 +212,7 @@ struct damon_sysfs_target { struct kobject kobj; struct damon_sysfs_regions *regions; int pid; + bool obsolete; }; static struct damon_sysfs_target *damon_sysfs_target_alloc(void) @@ -263,6 +264,29 @@ static ssize_t pid_target_store(struct kobject *kobj, return count; } +static ssize_t obsolete_target_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + + return sysfs_emit(buf, "%c\n", target->obsolete ? 
'Y' : 'N'); +} + +static ssize_t obsolete_target_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + bool obsolete; + int err = kstrtobool(buf, &obsolete); + + if (err) + return err; + target->obsolete = obsolete; + return count; +} + static void damon_sysfs_target_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_target, kobj)); @@ -271,8 +295,12 @@ static void damon_sysfs_target_release(struct kobject *kobj) static struct kobj_attribute damon_sysfs_target_pid_attr = __ATTR_RW_MODE(pid_target, 0600); +static struct kobj_attribute damon_sysfs_target_obsolete_attr = + __ATTR_RW_MODE(obsolete_target, 0600); + static struct attribute *damon_sysfs_target_attrs[] = { &damon_sysfs_target_pid_attr.attr, + &damon_sysfs_target_obsolete_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_target); @@ -1377,6 +1405,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, /* caller will destroy targets */ return -EINVAL; } + t->obsolete = sys_target->obsolete; return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region); } From e06469cdf1fdb0d842e5fcaaddfefae3a31e26d9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:28 -0700 Subject: [PATCH 125/321] Docs/admin-guide/mm/damon/usage: document obsolete_target file Document the newly added obsolete_target DAMON sysfs file. Link: https://lkml.kernel.org/r/20251023012535.69625-5-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index c630f2662695..9991dad60fcf 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -67,7 +67,7 @@ comma (","). │ │ │ │ │ │ │ intervals_goal/access_bp,aggrs,min_sample_us,max_sample_us │ │ │ │ │ │ nr_regions/min,max │ │ │ │ │ :ref:`targets `/nr_targets - │ │ │ │ │ │ :ref:`0 `/pid_target + │ │ │ │ │ │ :ref:`0 `/pid_target,obsolete_target │ │ │ │ │ │ │ :ref:`regions `/nr_regions │ │ │ │ │ │ │ │ :ref:`0 `/start,end │ │ │ │ │ │ │ │ ... @@ -265,13 +265,20 @@ to ``N-1``. Each directory represents each monitoring target. targets// ------------ -In each target directory, one file (``pid_target``) and one directory -(``regions``) exist. +In each target directory, two files (``pid_target`` and ``obsolete_target``) +and one directory (``regions``) exist. If you wrote ``vaddr`` to the ``contexts//operations``, each target should be a process. You can specify the process to DAMON by writing the pid of the process to the ``pid_target`` file. +Users can selectively remove targets in the middle of the targets array by +writing non-zero value to ``obsolete_target`` file and committing it (writing +``commit`` to ``state`` file). DAMON will remove the matching targets from its +internal targets array. Users are responsible to construct target directories +again, so that those correctly represent the changed internal targets array. + + .. 
_sysfs_regions: targets//regions From 9abe8d05192846c76f41c9187fac1b800c013b04 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:29 -0700 Subject: [PATCH 126/321] Docs/ABI/damon: document obsolete_target sysfs file Update DAMON ABI document for the newly added obsolete_target DAMON sysfs file. Link: https://lkml.kernel.org/r/20251023012535.69625-6-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index dce6c2cda4e8..4fb8b7a6d625 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -164,6 +164,13 @@ Description: Writing to and reading from this file sets and gets the pid of the target process if the context is for virtual address spaces monitoring, respectively. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets//obsolete_target +Date: Oct 2025 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the + obsoleteness of the matching parameters commit destination + target. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets//regions/nr_regions Date: Mar 2022 Contact: SeongJae Park From badfa4361cb116fd9af71aaa2ea470236a8aa25b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:30 -0700 Subject: [PATCH 127/321] selftests/damon/_damon_sysfs: support obsolete_target file A DAMON sysfs file, namely obsolete_target, has been newly introduced. Add a support of that file to _damon_sysfs.py so that DAMON selftests for the file can be easily written. 
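For reference, driving the new file by hand needs nothing more than plain sysfs writes; the following is a minimal user-space C sketch, not part of the series, assuming kdamond index 0, context index 0, and that the second target (index 1) is the one to drop (the write_sysfs() helper name is made up, error handling trimmed):

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  /* Write a short string to a sysfs file, return 0 on success. */
  static int write_sysfs(const char *path, const char *val)
  {
          int fd = open(path, O_WRONLY);
          ssize_t ret;

          if (fd < 0)
                  return -1;
          ret = write(fd, val, strlen(val));
          close(fd);
          return ret < 0 ? -1 : 0;
  }

  int main(void)
  {
          const char *kd = "/sys/kernel/mm/damon/admin/kdamonds/0";
          char path[256];

          /* Mark the second target of context 0 as obsolete ... */
          snprintf(path, sizeof(path),
                   "%s/contexts/0/targets/1/obsolete_target", kd);
          if (write_sysfs(path, "Y"))
                  perror("obsolete_target");

          /* ... then ask the running kdamond to commit the update. */
          snprintf(path, sizeof(path), "%s/state", kd);
          if (write_sysfs(path, "commit"))
                  perror("state");
          return 0;
  }

Writing 'commit' to the state file is what triggers the parameters commit on the running context, as described in the documentation above.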
Link: https://lkml.kernel.org/r/20251023012535.69625-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index a0e6290833fb..748778b563cd 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -475,12 +475,14 @@ class Damos: class DamonTarget: pid = None + obsolete = None # todo: Support target regions if test is made idx = None context = None - def __init__(self, pid): + def __init__(self, pid, obsolete=False): self.pid = pid + self.obsolete = obsolete def sysfs_dir(self): return os.path.join( @@ -491,8 +493,13 @@ class DamonTarget: os.path.join(self.sysfs_dir(), 'regions', 'nr_regions'), '0') if err is not None: return err - return write_file( + err = write_file( os.path.join(self.sysfs_dir(), 'pid_target'), self.pid) + if err is not None: + return err + return write_file( + os.path.join(self.sysfs_dir(), 'obsolete_target'), + 'Y' if self.obsolete else 'N') class IntervalsGoal: access_bp = None From a00f18abef3750167eef7aa9ecd96da96f74fc3f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:31 -0700 Subject: [PATCH 128/321] drgn_dump_damon_status: dump damon_target->obsolete A new field of damon_target for pin-point target removal, namely obsolete, has newly been added. Extend drgn_dump_damon_status.py to dump it, for easily writing a future DAMON selftests of it. Link: https://lkml.kernel.org/r/20251023012535.69625-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/drgn_dump_damon_status.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py index 7233369a3a44..cb4fdbe68acb 100755 --- a/tools/testing/selftests/damon/drgn_dump_damon_status.py +++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py @@ -73,6 +73,7 @@ def target_to_dict(target): ['pid', int], ['nr_regions', int], ['regions_list', regions_to_list], + ['obsolete', bool], ]) def targets_to_list(targets): From 65a9033db722bef4226b41fc205dab304c3fec70 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:32 -0700 Subject: [PATCH 129/321] sysfs.py: extend assert_ctx_committed() for monitoring targets assert_ctx_committed() is not asserting monitoring targets commitment, since all existing callers of the function assume no target changes. Extend it for future usage. 
Link: https://lkml.kernel.org/r/20251023012535.69625-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index 2666c6f0f1a5..fd8d3698326e 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -164,6 +164,16 @@ def assert_monitoring_attrs_committed(attrs, dump): assert_true(dump['max_nr_regions'] == attrs.max_nr_regions, 'max_nr_regions', dump) +def assert_monitoring_target_committed(target, dump): + # target.pid is the pid "number", while dump['pid'] is 'struct pid' + # pointer, and hence cannot be compared. + assert_true(dump['obsolete'] == target.obsolete, 'target obsolete', dump) + +def assert_monitoring_targets_committed(targets, dump): + assert_true(len(targets) == len(dump), 'len_targets', dump) + for idx, target in enumerate(targets): + assert_monitoring_target_committed(target, dump[idx]) + def assert_ctx_committed(ctx, dump): ops_val = { 'vaddr': 0, @@ -172,6 +182,7 @@ def assert_ctx_committed(ctx, dump): } assert_true(dump['ops']['id'] == ops_val[ctx.ops], 'ops_id', dump) assert_monitoring_attrs_committed(ctx.monitoring_attrs, dump['attrs']) + assert_monitoring_targets_committed(ctx.targets, dump['adaptive_targets']) assert_schemes_committed(ctx.schemes, dump['schemes']) def assert_ctxs_committed(ctxs, dump): From 809ba69f9f4d17f9c04f4aaba5e680fdd71975dd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:33 -0700 Subject: [PATCH 130/321] selftests/damon/sysfs: add obsolete_target test A new DAMON sysfs file for pin-point target removal, namely obsolete_target, has been added. Add a test for the functionality. It starts DAMON with three monitoring target processes, mark one in the middle as obsolete, commit it, and confirm the internal DAMON status is updated to remove the target in the middle. Link: https://lkml.kernel.org/r/20251023012535.69625-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.py | 37 ++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index fd8d3698326e..b34aea0a6775 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -279,5 +279,42 @@ def main(): kdamonds.stop() + # test obsolete_target. 
+ proc1 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + proc2 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + proc3 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + kdamonds = _damon_sysfs.Kdamonds( + [_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[ + _damon_sysfs.DamonTarget(pid=proc1.pid), + _damon_sysfs.DamonTarget(pid=proc2.pid), + _damon_sysfs.DamonTarget(pid=proc3.pid), + ], + schemes=[_damon_sysfs.Damos()], + )])]) + err = kdamonds.start() + if err is not None: + print('kdamond start failed: %s' % err) + exit(1) + kdamonds.kdamonds[0].contexts[0].targets[1].obsolete = True + kdamonds.kdamonds[0].commit() + + status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) + if err is not None: + print(err) + kdamonds.stop() + exit(1) + + del kdamonds.kdamonds[0].contexts[0].targets[1] + + assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + + kdamonds.stop() + if __name__ == '__main__': main() From 5e0fa7ed984d1c3b8bda4158e35a46cb2b0badac Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Fri, 24 Oct 2025 20:30:46 +0900 Subject: [PATCH 131/321] MAINTAINERS: add include/linux/pgalloc.h to MM CORE section Patch series "mm: MISC follow-up patches for linux/pgalloc.h", v2. This is a follow-up patch series for the patch series named: "[PATCH V5 mm-hotfixes 0/3] mm, x86: fix crash due to missing page table sync and make it harder to miss". This patch (of 2): Since include/linux/pgtable.h is already listed in the MM CORE section, add it to the section as well to keep it maintained by the appropriate maintainers. Link: https://lkml.kernel.org/r/20251024113047.119058-1-harry.yoo@oracle.com Link: https://lkml.kernel.org/r/20251024113047.119058-2-harry.yoo@oracle.com Signed-off-by: Harry Yoo Acked-by: SeongJae Park Acked-by: Lorenzo Stoakes Acked-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Liam Howlett Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index c39701eec3fe..2625bc3d53d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16266,6 +16266,7 @@ F: include/linux/mmzone.h F: include/linux/mmdebug.h F: include/linux/mmu_notifier.h F: include/linux/pagewalk.h +F: include/linux/pgalloc.h F: include/linux/pgtable.h F: include/linux/ptdump.h F: include/linux/vmpressure.h From ad8b2e096181bd23a32d8672de107136d0c478e9 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Fri, 24 Oct 2025 20:30:47 +0900 Subject: [PATCH 132/321] treewide: include linux/pgalloc.h instead of asm/pgalloc.h For now, including instead of is technically fine unless the .c file calls p*d_populate_kernel() helper functions. But it is a better practice to always include . Include instead of outside arch/. 
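Concretely, the conversion applied to each of the files above is the one named in the subject line; as an illustrative hunk, not tied to any particular file:

  -#include <asm/pgalloc.h>
  +#include <linux/pgalloc.h>

Only files outside arch/ are converted; for them <linux/pgalloc.h> is the preferred include, and it becomes necessary once a file uses the p*d_populate_kernel() helpers.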
Link: https://lkml.kernel.org/r/20251024113047.119058-3-harry.yoo@oracle.com Signed-off-by: Harry Yoo Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Mike Rapoport (Microsoft) Cc: Liam Howlett Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: SeongJae Park Signed-off-by: Andrew Morton --- drivers/firmware/efi/arm-runtime.c | 4 ++-- drivers/firmware/efi/riscv-runtime.c | 4 ++-- drivers/s390/char/sclp_sd.c | 3 +-- fs/dax.c | 2 +- kernel/fork.c | 4 ++-- mm/debug_vm_pgtable.c | 2 +- mm/filemap.c | 3 ++- mm/huge_memory.c | 2 +- mm/hugetlb.c | 2 +- mm/hugetlb_vmemmap.c | 3 ++- mm/khugepaged.c | 2 +- mm/memory.c | 4 ++-- mm/mmu_gather.c | 2 +- mm/mremap.c | 2 +- mm/pgtable-generic.c | 3 ++- mm/pt_reclaim.c | 3 ++- 16 files changed, 24 insertions(+), 21 deletions(-) diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 83092d93f36a..53a5336cde5a 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -12,18 +12,18 @@ #include #include #include +#include +#include #include #include #include #include #include #include -#include #include #include #include -#include #if defined(CONFIG_PTDUMP_DEBUGFS) || defined(CONFIG_ARM_PTDUMP_DEBUGFS) #include diff --git a/drivers/firmware/efi/riscv-runtime.c b/drivers/firmware/efi/riscv-runtime.c index fa71cd898120..7ceb02bc57f7 100644 --- a/drivers/firmware/efi/riscv-runtime.c +++ b/drivers/firmware/efi/riscv-runtime.c @@ -14,18 +14,18 @@ #include #include #include +#include +#include #include #include #include #include #include #include -#include #include #include #include -#include static bool __init efi_virtmap_init(void) { diff --git a/drivers/s390/char/sclp_sd.c b/drivers/s390/char/sclp_sd.c index 129b89fe40a3..7a791cb35aea 100644 --- a/drivers/s390/char/sclp_sd.c +++ b/drivers/s390/char/sclp_sd.c @@ -17,8 +17,7 @@ #include #include #include - -#include +#include #include "sclp.h" diff --git a/fs/dax.c b/fs/dax.c index 516f995a988c..0e766aec4303 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #define CREATE_TRACE_POINTS #include diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..dd0bb5fe4305 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -106,9 +106,9 @@ #include #include #include - -#include +#include #include + #include #include #include diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 133543ca2821..055e0e025b42 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -30,9 +30,9 @@ #include #include #include +#include #include -#include #include /* diff --git a/mm/filemap.c b/mm/filemap.c index 526ad8c92250..ff75bd89b68c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -48,7 +48,8 @@ #include #include #include -#include +#include + #include #include "internal.h" diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0a521cf9b10a..d716c6965e27 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -37,11 +37,11 @@ #include #include #include +#include #include #include #include -#include #include "internal.h" #include "swap.h" diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 86e672fcb305..1ea459723cce 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -39,9 +39,9 @@ #include #include #include +#include #include -#include #include #include diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 96ee2bd16ee1..9d01f883fd71 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -15,7 +15,8 @@ #include #include #include -#include +#include + #include 
#include "hugetlb_vmemmap.h" diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 643abf4be236..f6a92958157d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -21,9 +21,9 @@ #include #include #include +#include #include -#include #include "internal.h" #include "mm_slot.h" diff --git a/mm/memory.c b/mm/memory.c index 8e02b8d75535..8d8c36adafa8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -76,13 +76,13 @@ #include #include #include +#include +#include #include #include #include -#include -#include #include #include diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 374aa6f021c6..247e3f9db6c7 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -9,8 +9,8 @@ #include #include #include +#include -#include #include #ifndef CONFIG_MMU_GATHER_NO_GATHER diff --git a/mm/mremap.c b/mm/mremap.c index 419a0ea0a870..8ad06cf50783 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,10 +25,10 @@ #include #include #include +#include #include #include -#include #include "internal.h" diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 8c22be79b734..e46f0cf2159c 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -14,7 +14,8 @@ #include #include #include -#include +#include + #include /* diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c index 7e9455a18aae..0d9cfbf4fe5d 100644 --- a/mm/pt_reclaim.c +++ b/mm/pt_reclaim.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include + #include -#include #include "internal.h" From b734b9d973ccd7ad1cfebc2e1f7db693824a37ef Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 24 Oct 2025 10:09:02 +0100 Subject: [PATCH 133/321] mm/vma: small VMA lock cleanups We declare vma_start_read() as a static function in mm/mmap_lock.c, so there is no need to provide a stub for !CONFIG_PER_VMA_LOCK. __is_vma_write_locked() is declared in a header and should therefore be static inline. Put parens around (refcnt & VMA_LOCK_OFFSET) in is_vma_writer_only() to make precedence clear. Link: https://lkml.kernel.org/r/20251024090902.1118174-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 2c9fffa58714..e05da70dc0cb 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -130,7 +130,7 @@ static inline bool is_vma_writer_only(int refcnt) * a detached vma happens only in vma_mark_detached() and is a rare * case, therefore most of the time there will be no unnecessary wakeup. */ - return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; + return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1; } static inline void vma_refcount_put(struct vm_area_struct *vma) @@ -183,7 +183,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -281,9 +281,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int return true; } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) - { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) From 272239dc8fcb109b9f1ec1a73bb85405dac92eda Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 21 Oct 2025 03:56:38 +0100 Subject: [PATCH 134/321] mm: make INVALID_PHYS_ADDR a generic macro INVALID_PHYS_ADDR has very similar definitions across the code base. Hence just move that inside header for more generic usage. Also drop the now redundant ones which are no longer required. Link: https://lkml.kernel.org/r/20251021025638.2420216-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Alexander Gordeev [s390] Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 2 -- arch/s390/boot/vmem.c | 1 - drivers/vdpa/vdpa_user/iova_domain.h | 2 -- include/linux/mm.h | 2 ++ kernel/dma/swiotlb.c | 2 -- 5 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b8d37eb037fc..94e29e3574ff 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -470,8 +470,6 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, mutex_unlock(&fixmap_lock); } -#define INVALID_PHYS_ADDR (-1ULL) - static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, enum pgtable_type pgtable_type) { diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index cea3de4dce8c..fbe64ffdfb96 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -16,7 +16,6 @@ #include "decompressor.h" #include "boot.h" -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) struct ctlreg __bootdata_preserved(s390_invalid_asce); #ifdef CONFIG_PROC_FS diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index 775cad5238f3..a923971a64f5 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -17,8 +17,6 @@ #define IOVA_START_PFN 1 -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) - #define BOUNCE_MAP_SHIFT 12 #define BOUNCE_MAP_SIZE (1 << BOUNCE_MAP_SHIFT) #define BOUNCE_MAP_MASK (~(BOUNCE_MAP_SIZE - 1)) diff --git a/include/linux/mm.h b/include/linux/mm.h index a6fd9f5aaf30..7bcd9e6fbc3c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -105,6 +105,8 @@ extern int mmap_rnd_compat_bits __read_mostly; # endif #endif +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) + #include #include diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0d37da3d95b6..a547c7693135 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -61,8 +61,6 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) - /** * struct io_tlb_slot - IO TLB slot descriptor * @orig_addr: The original address corresponding to a mapped entry. 
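With the definition centralised in <linux/mm.h>, callers outside the touched subsystems can rely on the same sentinel instead of carrying a local copy; a minimal sketch with hypothetical helper names, not from the series:

  #include <linux/mm.h>

  /* Report a missing translation with the generic sentinel. */
  static phys_addr_t example_lookup(unsigned long idx, phys_addr_t *table,
                                    unsigned long nents)
  {
          if (idx >= nents)
                  return INVALID_PHYS_ADDR;
          return table[idx];
  }

  static bool example_present(unsigned long idx, phys_addr_t *table,
                              unsigned long nents)
  {
          return example_lookup(idx, table, nents) != INVALID_PHYS_ADDR;
  }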
From 6af766c86b1c9f075e30ba7bd291c6aa5384c1f6 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Mon, 27 Oct 2025 22:18:17 +0800 Subject: [PATCH 135/321] mm: vmstat: output reserved_highatomic and free_highatomic in zoneinfo The nr_free_highatomic is a key factor in calculating watermarks as it affects the free pages count. Adding this metric, along with nr_reserved_highatomic, to /proc/zoneinfo facilitates easier diagnosis memory watermark calculations and memory pressure states. Sample output: cat /proc/zoneinfo ...... pagesets cpu: 0 count: 52069 high: 52675 batch: 63 high_min: 13971 high_max: 62284 vm stats threshold: 10 node_unreclaimable: 0 start_pfn: 4096 reserved_highatomic: 5120 free_highatomic: 2081 Link: https://lkml.kernel.org/r/20251027141818.283587-1-jiayuan.chen@linux.dev Signed-off-by: Jiayuan Chen Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmstat.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index f226942db746..b53b07aa29e8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1852,9 +1852,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n node_unreclaimable: %u" - "\n start_pfn: %lu", + "\n start_pfn: %lu" + "\n reserved_highatomic: %lu" + "\n free_highatomic: %lu", atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES, - zone->zone_start_pfn); + zone->zone_start_pfn, + zone->nr_reserved_highatomic, + zone->nr_free_highatomic); seq_putc(m, '\n'); } From 8e689f8ea45ffdae20350246dd37d124d7092c92 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 28 Oct 2025 11:43:07 +0800 Subject: [PATCH 136/321] mm/swap: do not choose swap device according to numa node Patch series "mm/swapfile.c: select swap devices of default priority round robin", v5. Currently, on system with multiple swap devices, swap allocation will select one swap device according to priority. The swap device with the highest priority will be chosen to allocate firstly. People can specify a priority from 0 to 32767 when swapon a swap device, or the system will set it from -2 then downwards by default. Meanwhile, on NUMA system, the swap device with node_id will be considered first on that NUMA node of the node_id. In the current code, an array of plist, swap_avail_heads[nid], is used to organize swap devices on each NUMA node. For each NUMA node, there is a plist organizing all swap devices. The 'prio' value in the plist is the negated value of the device's priority due to plist being sorted from low to high. The swap device owning one node_id will be promoted to the front position on that NUMA node, then other swap devices are put in order of their default priority. E.g I got a system with 8 NUMA nodes, and I setup 4 zram partition as swap devices. 
Current behaviour: their priorities will be (note that -1 is skipped):

NAME       TYPE       SIZE USED PRIO
/dev/zram0 partition   16G   0B   -2
/dev/zram1 partition   16G   0B   -3
/dev/zram2 partition   16G   0B   -4
/dev/zram3 partition   16G   0B   -5

And their positions in the 8 swap_avail_lists[nid] will be:

swap_avail_lists[0]: /* node 0's available swap device list */
 zram0  -> zram1  -> zram2  -> zram3
 prio:1    prio:3    prio:4    prio:5
swap_avail_lists[1]: /* node 1's available swap device list */
 zram1  -> zram0  -> zram2  -> zram3
 prio:1    prio:2    prio:4    prio:5
swap_avail_lists[2]: /* node 2's available swap device list */
 zram2  -> zram0  -> zram1  -> zram3
 prio:1    prio:2    prio:3    prio:5
swap_avail_lists[3]: /* node 3's available swap device list */
 zram3  -> zram0  -> zram1  -> zram2
 prio:1    prio:2    prio:3    prio:4
swap_avail_lists[4-7]: /* node 4,5,6,7's available swap device list */
 zram0  -> zram1  -> zram2  -> zram3
 prio:2    prio:3    prio:4    prio:5

The adjustment for the swap device sharing a node's id was intended to decrease the lock contention pressure on a single swap device by letting different nodes take different swap devices. The adjustment was introduced in commit a2468cc9bfdf ("swap: choose swap device according to numa node").

However, the adjustment is a little coarse-grained. On a node, the swap device sharing the node's id will always be selected first by that node's CPUs until exhausted, then the next one. And on other nodes where no swap device shares the node id, the swap device with priority '-2' will be selected first until exhausted, then the next with priority '-3'. This is the swapon output taken while the high pressure vm-scability test is running. It clearly shows zram0 is heavily exploited until exhausted.

===================================
[root@hp-dl385g10-03 ~]# swapon
NAME       TYPE       SIZE  USED PRIO
/dev/zram0 partition   16G 15.7G   -2
/dev/zram1 partition   16G  3.4G   -3
/dev/zram2 partition   16G  3.4G   -4
/dev/zram3 partition   16G  2.6G   -5

The node based strategy for selecting a swap device is much better than the old way of exhausting swap devices one by one. However, it is still unreasonable, because swap devices are assumed to have similar accessing speed if no priority is specified when they are swapped on. It is unfair and doesn't make sense that, just because one swap device is swapped on first, its priority is higher than the one swapped on later.

So in this patchset, the change is to select swap devices with the default priority round robin. In code, the plist array swap_avail_heads[nid] is replaced with a single plist swap_avail_head, which reverts commit a2468cc9bfdf. On top of the revert, a further change makes any device without a specified priority get the same default priority '-1'. Swap devices with a specified priority are always put foremost; this is not impacted. If you care about their different accessing speed, use 'swapon -p xx' to assign a priority to your swap devices.
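For reference, 'swapon -p' boils down to passing SWAP_FLAG_PREFER plus the requested priority to the swapon(2) syscall; a small user-space sketch, where the device path is only an example:

  #include <stdio.h>
  #include <sys/swap.h>

  int main(void)
  {
          int prio = 10;  /* any value in 0..32767 */
          int flags = SWAP_FLAG_PREFER |
                      ((prio << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

          /* Devices enabled without SWAP_FLAG_PREFER get the default
           * priority (-1 after this series) and are used round robin. */
          if (swapon("/dev/zram0", flags))
                  perror("swapon");
          return 0;
  }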
New behaviour: swap_avail_list: /* one global available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:1 prio:1 prio:1 prio:1 This is the swapon output during the process high pressure vm-scability being taken, all is selected round robin: ======================================= [root@hp-dl385g10-03 linux]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 12.6G -1 /dev/zram1 partition 16G 12.6G -1 /dev/zram2 partition 16G 12.6G -1 /dev/zram3 partition 16G 12.6G -1 With the change, we can see about 18% efficiency promotion as below: vm-scability test: ================== Test with: usemem --init-time -O -y -x -n 31 2G (4G memcg, zram as swap) Before: After: System time: 637.92 s 526.74 s (lower is better) Sum Throughput: 3546.56 MB/s 4207.56 MB/s (higher is better) Single process Throughput: 114.40 MB/s 135.72 MB/s (higher is better) free latency: 10138455.99 us 6810119.01 us (low is better) This patch (of 2): This reverts commit a2468cc9bfdf ("swap: choose swap device according to numa node"). After this patch, the behaviour will change back to pre-commit a2468cc9bfdf. Means the priority will be set from -1 then downwards by default, and when swapping, it will exhault swap device one by one according to priority from high to low. This is preparation work for later change. [root@hp-dl385g10-03 ~]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 16G -1 /dev/zram1 partition 16G 966.2M -2 /dev/zram2 partition 16G 0B -3 /dev/zram3 partition 16G 0B -4 Link: https://lkml.kernel.org/r/20251028034308.929550-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20251028034308.929550-2-bhe@redhat.com Signed-off-by: Baoquan He Suggested-by: Chris Li Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: Kairui Song Cc: Barry Song Cc: Kemeng Shi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/index.rst | 1 - Documentation/admin-guide/mm/swap_numa.rst | 78 --------------------- include/linux/swap.h | 11 +-- mm/swapfile.c | 80 ++++------------------ 4 files changed, 15 insertions(+), 155 deletions(-) delete mode 100644 Documentation/admin-guide/mm/swap_numa.rst diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index ebc83ca20fdc..bbb563cba5d2 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -39,7 +39,6 @@ the Linux memory management. shrinker_debugfs slab soft-dirty - swap_numa transhuge userfaultfd zswap diff --git a/Documentation/admin-guide/mm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst deleted file mode 100644 index 2e630627bcee..000000000000 --- a/Documentation/admin-guide/mm/swap_numa.rst +++ /dev/null @@ -1,78 +0,0 @@ -=========================================== -Automatically bind swap device to numa node -=========================================== - -If the system has more than one swap device and swap device has the node -information, we can make use of this information to decide which swap -device to use in get_swap_pages() to get better performance. - - -How to use this feature -======================= - -Swap device has priority and that decides the order of it to be used. To make -use of automatically binding, there is no need to manipulate priority settings -for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and -swapB, with swapA attached to node 0 and swapB attached to node 1, are going -to be swapped on. 
Simply swapping them on by doing:: - - # swapon /dev/swapA - # swapon /dev/swapB - -Then node 0 will use the two swap devices in the order of swapA then swapB and -node 1 will use the two swap devices in the order of swapB then swapA. Note -that the order of them being swapped on doesn't matter. - -A more complex example on a 4 node machine. Assume 6 swap devices are going to -be swapped on: swapA and swapB are attached to node 0, swapC is attached to -node 1, swapD and swapE are attached to node 2 and swapF is attached to node3. -The way to swap them on is the same as above:: - - # swapon /dev/swapA - # swapon /dev/swapB - # swapon /dev/swapC - # swapon /dev/swapD - # swapon /dev/swapE - # swapon /dev/swapF - -Then node 0 will use them in the order of:: - - swapA/swapB -> swapC -> swapD -> swapE -> swapF - -swapA and swapB will be used in a round robin mode before any other swap device. - -node 1 will use them in the order of:: - - swapC -> swapA -> swapB -> swapD -> swapE -> swapF - -node 2 will use them in the order of:: - - swapD/swapE -> swapA -> swapB -> swapC -> swapF - -Similaly, swapD and swapE will be used in a round robin mode before any -other swap devices. - -node 3 will use them in the order of:: - - swapF -> swapA -> swapB -> swapC -> swapD -> swapE - - -Implementation details -====================== - -The current code uses a priority based list, swap_avail_list, to decide -which swap device to use and if multiple swap devices share the same -priority, they are used round robin. This change here replaces the single -global swap_avail_list with a per-numa-node list, i.e. for each numa node, -it sees its own priority based list of available swap devices. Swap -device's priority can be promoted on its matching node's swap_avail_list. - -The current swap device's priority is set as: user can set a >=0 value, -or the system will pick one starting from -1 then downwards. The priority -value in the swap_avail_list is the negated value of the swap device's -due to plist being sorted from low to high. The new policy doesn't change -the semantics for priority >=0 cases, the previous starting from -1 then -downwards now becomes starting from -2 then downwards and -1 is reserved -as the promoted value. So if multiple swap devices are attached to the same -node, they will all be promoted to priority -1 on that node's plist and will -be used round robin before any other swap devices. diff --git a/include/linux/swap.h b/include/linux/swap.h index a4b264817735..38ca3df68716 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -301,16 +301,7 @@ struct swap_info_struct { struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ - struct plist_node avail_lists[]; /* - * entries in swap_avail_heads, one - * entry per node. - * Must be last as the number of the - * array is nr_node_ids, which is not - * a fixed value so have to allocate - * dynamically. - * And it has to be an array so that - * plist_for_each_* can work. - */ + struct plist_node avail_list; /* entry in swap_avail_head */ }; static inline swp_entry_t page_swap_entry(struct page *page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 125d893bb706..ce3580e2f4f4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -74,7 +74,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. 
reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority = -1; +static int least_priority; unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; @@ -103,7 +103,7 @@ static PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -static struct plist_head *swap_avail_heads; +static PLIST_HEAD(swap_avail_head); static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -1130,7 +1130,6 @@ done: /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { - int nid; unsigned long pages; spin_lock(&swap_avail_lock); @@ -1159,8 +1158,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) goto skip; } - for_each_node(nid) - plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_del(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1169,7 +1167,6 @@ skip: /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) { - int nid; long val; unsigned long pages; @@ -1202,8 +1199,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) goto skip; } - for_each_node(nid) - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_add(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1346,16 +1342,14 @@ static bool swap_alloc_fast(swp_entry_t *entry, static bool swap_alloc_slow(swp_entry_t *entry, int order) { - int node; unsigned long offset; struct swap_info_struct *si, *next; - node = numa_node_id(); spin_lock(&swap_avail_lock); start_over: - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { /* Rotate the device and switch to a new cluster */ - plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); + plist_requeue(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE); @@ -1380,7 +1374,7 @@ start_over: * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots. */ - if (plist_node_empty(&next->avail_lists[node])) + if (plist_node_empty(&si->avail_list)) goto start_over; } spin_unlock(&swap_avail_lock); @@ -1394,11 +1388,10 @@ start_over: static bool swap_sync_discard(void) { bool ret = false; - int nid = numa_node_id(); struct swap_info_struct *si, *next; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { if (si->flags & SWP_PAGE_DISCARD) @@ -2709,25 +2702,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } -static int swap_node(struct swap_info_struct *si) -{ - struct block_device *bdev; - - if (si->bdev) - bdev = si->bdev; - else - bdev = si->swap_file->f_inode->i_sb->s_bdev; - - return bdev ? 
bdev->bd_disk->node_id : NUMA_NO_NODE; -} - static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { - int i; - if (prio >= 0) si->prio = prio; else @@ -2737,16 +2716,7 @@ static void setup_swap_info(struct swap_info_struct *si, int prio, * low-to-high, while swap ordering is high-to-low */ si->list.prio = -si->prio; - for_each_node(i) { - if (si->prio >= 0) - si->avail_lists[i].prio = -si->prio; - else { - if (swap_node(si) == i) - si->avail_lists[i].prio = 1; - else - si->avail_lists[i].prio = -si->prio; - } - } + si->avail_list.prio = -si->prio; si->swap_map = swap_map; si->cluster_info = cluster_info; si->zeromap = zeromap; @@ -2919,15 +2889,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) del_from_avail_list(p, true); if (p->prio < 0) { struct swap_info_struct *si = p; - int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; - for_each_node(nid) { - if (si->avail_lists[nid].prio != 1) - si->avail_lists[nid].prio--; - } + si->avail_list.prio--; } least_priority++; } @@ -3168,9 +3134,8 @@ static struct swap_info_struct *alloc_swap_info(void) struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; - int i; - p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); + p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -3209,8 +3174,7 @@ static struct swap_info_struct *alloc_swap_info(void) } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); - for_each_node(i) - plist_node_init(&p->avail_lists[i], 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { @@ -3467,9 +3431,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!swap_avail_heads) - return -ENOMEM; - si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); @@ -4079,7 +4040,6 @@ static bool __has_usable_swap(void) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si, *next; - int nid = folio_nid(folio); if (!(gfp & __GFP_IO)) return; @@ -4098,8 +4058,8 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], - avail_lists[nid]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, + avail_list) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; @@ -4111,18 +4071,6 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) static int __init swapfile_init(void) { - int nid; - - swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), - GFP_KERNEL); - if (!swap_avail_heads) { - pr_emerg("Not enough memory for swap heads, swap is disabled\n"); - return -ENOMEM; - } - - for_each_node(nid) - plist_head_init(&swap_avail_heads[nid]); - swapfile_maximum_size = arch_max_swapfile_size(); /* From 52f37efc5949058a9c967965ad2b8d4852ea5248 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 28 Oct 2025 11:43:08 +0800 Subject: [PATCH 137/321] mm/swap: select swap device with default priority round robin Swap devices are assumed to have similar accessing speed when swapon if no priority is specified. It's unfair and doesn't make sense just because one swap device is swapped on firstly, its priority will be higher than the one swapped on later. 
Here, set all swap devices to have priority '-1' by default. With this change, swap devices with the default priority will be selected round robin when swapping out. This can improve the swapping efficiency a lot among multiple swap devices with default priority.

Below is the swapon output taken while the high pressure vm-scability test is running:

1) This is pre-commit a2468cc9bfdf, where swap devices are selected one by one by priority from high to low, moving on when one swap device is exhausted:
------------------------------------
[root@hp-dl385g10-03 ~]# swapon
NAME       TYPE       SIZE   USED PRIO
/dev/zram0 partition   16G    16G   -1
/dev/zram1 partition   16G 966.2M   -2
/dev/zram2 partition   16G     0B   -3
/dev/zram3 partition   16G     0B   -4

2) This is the behaviour with commit a2468cc9bfdf: on a node, the swap device sharing the same node id is selected first until exhausted; while on a node with no swap device sharing the node id, the one with the highest priority is selected until exhausted:
------------------------------------
[root@hp-dl385g10-03 ~]# swapon
NAME       TYPE       SIZE  USED PRIO
/dev/zram0 partition   16G 15.7G   -2
/dev/zram1 partition   16G  3.4G   -3
/dev/zram2 partition   16G  3.4G   -4
/dev/zram3 partition   16G  2.6G   -5

3) After this patch is applied, swap devices with the default priority are selected round robin:
------------------------------------
[root@hp-dl385g10-03 block]# swapon
NAME       TYPE       SIZE USED PRIO
/dev/zram0 partition   16G 6.6G   -1
/dev/zram1 partition   16G 6.6G   -1
/dev/zram2 partition   16G 6.6G   -1
/dev/zram3 partition   16G 6.6G   -1

With the change, we see about 18% efficiency improvement relative to the node based way, as below. (The pre-commit a2468cc9bfdf way is the worst.)

vm-scability test:
==================
Test with: usemem --init-time -O -y -x -n 31 2G (4G memcg, zram as swap)

                           one by one:      node based:     round robin:
System time:               1087.38 s        637.92 s        526.74 s      (lower is better)
Sum Throughput:            2036.55 MB/s     3546.56 MB/s    4207.56 MB/s  (higher is better)
Single process Throughput: 65.69 MB/s       114.40 MB/s     135.72 MB/s   (higher is better)
free latency:              15769409.48 us   10138455.99 us  6810119.01 us (lower is better)

Link: https://lkml.kernel.org/r/20251028034308.929550-3-bhe@redhat.com Signed-off-by: Baoquan He Suggested-by: Chris Li Acked-by: Chris Li Acked-by: Nhat Pham Cc: Barry Song Cc: Kairui Song Cc: Kemeng Shi Signed-off-by: Andrew Morton --- mm/swapfile.c | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index ce3580e2f4f4..c35bb8593f50 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -74,7 +74,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority; +#define DEF_SWAP_PRIO -1 unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; @@ -2707,10 +2707,7 @@ static void setup_swap_info(struct swap_info_struct *si, int prio, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { - if (prio >= 0) - si->prio = prio; - else - si->prio = --least_priority; + si->prio = prio; /* * the plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low @@ -2728,16 +2725,7 @@ static void _enable_swap_info(struct swap_info_struct *si) total_swap_pages += si->pages; assert_spin_locked(&swap_lock); - /* - * both lists are plists, and thus priority ordered.
- * swap_active_head needs to be priority ordered for swapoff(), - * which on removal of any swap_info_struct with an auto-assigned - * (i.e. negative) priority increments the auto-assigned priority - * of any lower-priority swap_info_structs. - * swap_avail_head needs to be priority ordered for folio_alloc_swap(), - * which allocates swap pages from the highest available priority - * swap_info_struct. - */ + plist_add(&si->list, &swap_active_head); /* Add back to available list */ @@ -2887,16 +2875,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } spin_lock(&p->lock); del_from_avail_list(p, true); - if (p->prio < 0) { - struct swap_info_struct *si = p; - - plist_for_each_entry_continue(si, &swap_active_head, list) { - si->prio++; - si->list.prio--; - si->avail_list.prio--; - } - least_priority++; - } plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; @@ -3607,7 +3585,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } mutex_lock(&swapon_mutex); - prio = -1; + prio = DEF_SWAP_PRIO; if (swap_flags & SWAP_FLAG_PREFER) prio = swap_flags & SWAP_FLAG_PRIO_MASK; enable_swap_info(si, prio, swap_map, cluster_info, zeromap); From 1a4f70f6851a1916c4f0e52731c7ecfe99bf36e6 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:28 +0000 Subject: [PATCH 138/321] mm: convert memory block states (MEM_*) macros to enum Patch series "mm: Convert memory block states (MEM_*) macros to enums", v2. The MEM_* constants indicating the state of a memory block are currently defined as macros, meaning their definitions will be omitted from the debuginfo on most kernel builds. This makes it harder for debuggers to correctly map the block state at runtime, which can be quite useful when analysing errors related to memory hot plugging and unplugging with tools such as drgn. Converting the constants to an enum ensures the correct information is emitted by the compiler and available for the debugger, without needing to hard-code them into the debugger and track their changes. This patch series aims to replace the current macros with a newly created enum named memory_block_state, while also taking advantage of the compile time guarantees that we get when using enums. The first patch does the conversion of the macros to an enum, while the 2nd and 3rd patches use this enum to clean up some type declarations and make sure that only valid values are used. This patch (of 3): Converting the MEM_* constants from macros to an enum ensures that their values will be correctly emitted in the debug symbols, making it easier to trace the meaning of each value when debugging with tools such as drgn, without the need to hard-code the values. 
Since the values are mutually exclusive and they are not exposed directly to userspace, I also dropped the misleading pattern (1< Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/memory.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/include/linux/memory.h b/include/linux/memory.h index 0c214256216f..f4e358477c6a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -64,6 +64,18 @@ struct memory_group { }; }; +enum memory_block_state { + /* These states are exposed to userspace as text strings in sysfs */ + MEM_ONLINE, /* exposed to userspace */ + MEM_GOING_OFFLINE, /* exposed to userspace */ + MEM_OFFLINE, /* exposed to userspace */ + MEM_GOING_ONLINE, + MEM_CANCEL_ONLINE, + MEM_CANCEL_OFFLINE, + MEM_PREPARE_ONLINE, + MEM_FINISH_OFFLINE, +}; + struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ @@ -89,16 +101,6 @@ int arch_get_memory_phys_device(unsigned long start_pfn); unsigned long memory_block_size_bytes(void); int set_memory_block_size_order(unsigned int order); -/* These states are exposed to userspace as text strings in sysfs */ -#define MEM_ONLINE (1<<0) /* exposed to userspace */ -#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ -#define MEM_OFFLINE (1<<2) /* exposed to userspace */ -#define MEM_GOING_ONLINE (1<<3) -#define MEM_CANCEL_ONLINE (1<<4) -#define MEM_CANCEL_OFFLINE (1<<5) -#define MEM_PREPARE_ONLINE (1<<6) -#define MEM_FINISH_OFFLINE (1<<7) - struct memory_notify { /* * The altmap_start_pfn and altmap_nr_pages fields are designated for From 8bc7ba3d265d6ee698de4b1941b7e8f7d91a0562 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:30 +0000 Subject: [PATCH 139/321] mm: change type of state in struct memory_block The state of a memory block should be restricted to values specified in the documentation of the memory hotplug API. However, since the state field in the memory_block struct was defined as an unsigned long, this restriction was not enforced at compile time. With the introduction of the enum memory_block_state, it is now possible to incorporate the desired semantics in the field declaration and enforce these restrictions at compile time. 
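As a sketch of the compile-time guarantee being referred to, assuming the enum memory_block_state introduced earlier in the series is in scope: switching on the enum without a default case lets -Wswitch warn whenever a state is left unhandled, something a plain unsigned long cannot offer. demo_state_name() below is a hypothetical helper, not code from these patches.

/* Hypothetical helper, only to show the -Wswitch benefit of the enum. */
static const char *demo_state_name(enum memory_block_state state)
{
        switch (state) {
        case MEM_ONLINE:         return "online";
        case MEM_GOING_OFFLINE:  return "going-offline";
        case MEM_OFFLINE:        return "offline";
        case MEM_GOING_ONLINE:   return "going-online";
        case MEM_CANCEL_ONLINE:  return "cancel-online";
        case MEM_CANCEL_OFFLINE: return "cancel-offline";
        case MEM_PREPARE_ONLINE: return "prepare-online";
        case MEM_FINISH_OFFLINE: return "finish-offline";
        }
        /* no default: the compiler can warn if a new state is added above */
        return "unknown";
}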
[akpm@linux-foundation.org: fix whitespace, per Randy] Link: https://lkml.kernel.org/r/20251029195617.2210700-3-linux@israelbatista.dev.br Signed-off-by: Israel Batista Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- drivers/base/memory.c | 2 +- include/linux/memory.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 6d84a02cfa5d..3d17dd774947 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -198,7 +198,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, break; default: WARN_ON(1); - return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state); + return sysfs_emit(buf, "ERROR-UNKNOWN-%d\n", mem->state); } return sysfs_emit(buf, "%s\n", output); diff --git a/include/linux/memory.h b/include/linux/memory.h index f4e358477c6a..ca20cbdd71f2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -78,7 +78,7 @@ enum memory_block_state { struct memory_block { unsigned long start_section_nr; - unsigned long state; /* serialized by the dev->lock */ + enum memory_block_state state; /* serialized by the dev->lock */ int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ /* From ed1f8855dd7b82a0ad87960b1729a3e848dc5589 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:32 +0000 Subject: [PATCH 140/321] mm: change type of parameter for memory_notify memory_notify() is responsible for sending events related to memory hotplugging to a notification queue. Since all the events must match one of the values from the enum memory_block_state, it is appropriate to change the function parameter type to make this condition explicit at compile time. 
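For context, the typical consumer of these values is a memory hotplug notifier. Below is a minimal sketch of such a callback, assuming the usual register_memory_notifier() / struct memory_notify interface from <linux/memory.h>; demo_mem_callback() and demo_mem_nb are made-up names and the callback body is purely illustrative. Note the notifier infrastructure still hands the callback an unsigned long action, even though memory_notify() itself now takes the enum.

#include <linux/memory.h>
#include <linux/notifier.h>

static int demo_mem_callback(struct notifier_block *nb,
                             unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;

        switch (action) {
        case MEM_GOING_ONLINE:
                /* prepare for mn->nr_pages pages starting at mn->start_pfn */
                break;
        case MEM_OFFLINE:
                /* drop any per-block state for the now-offline range */
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block demo_mem_nb = {
        .notifier_call = demo_mem_callback,
};

/* module init would then do: register_memory_notifier(&demo_mem_nb); */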
Link: https://lkml.kernel.org/r/20251029195617.2210700-4-linux@israelbatista.dev.br Signed-off-by: Israel Batista Acked-by: Mike Rapoport (Microsoft) Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- drivers/base/memory.c | 4 ++-- include/linux/memory.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 3d17dd774947..c03f3b5e5e6f 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -204,9 +204,9 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, return sysfs_emit(buf, "%s\n", output); } -int memory_notify(unsigned long val, void *v) +int memory_notify(enum memory_block_state state, void *v) { - return blocking_notifier_call_chain(&memory_chain, val, v); + return blocking_notifier_call_chain(&memory_chain, state, v); } #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) diff --git a/include/linux/memory.h b/include/linux/memory.h index ca20cbdd71f2..ca3eb1db6cc8 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -141,7 +141,7 @@ static inline int register_memory_notifier(struct notifier_block *nb) static inline void unregister_memory_notifier(struct notifier_block *nb) { } -static inline int memory_notify(unsigned long val, void *v) +static inline int memory_notify(enum memory_block_state state, void *v) { return 0; } @@ -165,7 +165,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); -extern int memory_notify(unsigned long val, void *v); +extern int memory_notify(enum memory_block_state state, void *v); extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, From 912aa825957f556a29d781c8f4cb4f4dfd938a9d Mon Sep 17 00:00:00 2001 From: Pedro Demarchi Gomes Date: Wed, 5 Nov 2025 15:49:10 -0300 Subject: [PATCH 141/321] Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk" Patch series "ksm: perform a range-walk to jump over holes in break_ksm", v4. When unmerging an address range, unmerge_ksm_pages function walks every page address in the specified range to locate ksm pages. This becomes highly inefficient when scanning large virtual memory areas that contain mostly unmapped regions, causing the process to get blocked for several minutes. This patch makes break_ksm, function called by unmerge_ksm_pages for every page in an address range, perform a range walk, allowing it to skip over entire unmapped holes in a VMA, avoiding unnecessary lookups. As pointed out by David Hildenbrand in [1], unmerge_ksm_pages() is called from: * ksm_madvise() through madvise(MADV_UNMERGEABLE). There are not a lot of users of that function. * __ksm_del_vma() through ksm_del_vmas(). Effectively called when disabling KSM for a process either through the sysctl or from s390x gmap code when enabling storage keys for a VM. 
Consider the following test program which creates a 32 TiB mapping in the virtual address space but only populates a single page: #include #include #include /* 32 TiB */ const size_t size = 32ul * 1024 * 1024 * 1024 * 1024; int main() { char *area = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0); if (area == MAP_FAILED) { perror("mmap() failed\n"); return -1; } /* Populate a single page such that we get an anon_vma. */ *area = 0; /* Enable KSM. */ madvise(area, size, MADV_MERGEABLE); madvise(area, size, MADV_UNMERGEABLE); return 0; } Without this patch, this program takes 9 minutes to finish, while with this patch it finishes in less then 5 seconds. This patch (of 3): This reverts commit e317a8d8b4f600fc7ec9725e26417030ee594f52 and changes function break_ksm_pmd_entry() to use folios. This reverts break_ksm() to use walk_page_range_vma() instead of folio_walk_start(). Change break_ksm_pmd_entry() to call is_ksm_zero_pte() only if we know the folio is present, and also rename variable ret to found. This will make it easier to later modify break_ksm() to perform a proper range walk. Link: https://lkml.kernel.org/r/20251105184912.186329-1-pedrodemargomes@gmail.com Link: https://lkml.kernel.org/r/20251105184912.186329-2-pedrodemargomes@gmail.com Link: https://lore.kernel.org/linux-mm/e0886fdf-d198-4130-bd9a-be276c59da37@redhat.com/ [1] Signed-off-by: Pedro Demarchi Gomes Suggested-by: David Hildenbrand (Red Hat) Acked-by: David Hildenbrand (Red Hat) Signed-off-by: Andrew Morton --- mm/ksm.c | 64 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 4f672f4f2140..9f74baf01e46 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -607,6 +607,48 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct folio *folio = NULL; + spinlock_t *ptl; + pte_t *pte; + pte_t ptent; + int found; + + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte) + return 0; + ptent = ptep_get(pte); + if (pte_present(ptent)) { + folio = vm_normal_folio(walk->vma, addr, ptent); + } else if (!pte_none(ptent)) { + swp_entry_t entry = pte_to_swp_entry(ptent); + + /* + * As KSM pages remain KSM pages until freed, no need to wait + * here for migration to end. + */ + if (is_migration_entry(entry)) + folio = pfn_swap_entry_folio(entry); + } + /* return 1 if the page is an normal ksm page or KSM-placed zero page */ + found = (folio && folio_test_ksm(folio)) || + (pte_present(ptent) && is_ksm_zero_pte(ptent)); + pte_unmap_unlock(pte, ptl); + return found; +} + +static const struct mm_walk_ops break_ksm_ops = { + .pmd_entry = break_ksm_pmd_entry, + .walk_lock = PGWALK_RDLOCK, +}; + +static const struct mm_walk_ops break_ksm_lock_vma_ops = { + .pmd_entry = break_ksm_pmd_entry, + .walk_lock = PGWALK_WRLOCK, +}; + /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. @@ -623,26 +665,16 @@ static inline bool ksm_test_exit(struct mm_struct *mm) static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) { vm_fault_t ret = 0; - - if (lock_vma) - vma_start_write(vma); + const struct mm_walk_ops *ops = lock_vma ? 
+ &break_ksm_lock_vma_ops : &break_ksm_ops; do { - bool ksm_page = false; - struct folio_walk fw; - struct folio *folio; + int ksm_page; cond_resched(); - folio = folio_walk_start(&fw, vma, addr, - FW_MIGRATION | FW_ZEROPAGE); - if (folio) { - /* Small folio implies FW_LEVEL_PTE. */ - if (!folio_test_large(folio) && - (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte))) - ksm_page = true; - folio_walk_end(&fw, vma); - } - + ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL); + if (WARN_ON_ONCE(ksm_page < 0)) + return ksm_page; if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr, From 5d4939fc2258e80e0eda2a8a190c9e4f78f52456 Mon Sep 17 00:00:00 2001 From: Pedro Demarchi Gomes Date: Wed, 5 Nov 2025 15:49:11 -0300 Subject: [PATCH 142/321] ksm: perform a range-walk in break_ksm Make break_ksm() receive an address range and change break_ksm_pmd_entry() to perform a range-walk and return the address of the first ksm page found. This change allows break_ksm() to skip unmapped regions instead of iterating every page address. When unmerging large sparse VMAs, this significantly reduces runtime. In a benchmark unmerging a 32 TiB sparse virtual address space where only one page was populated, the runtime dropped from 9 minutes to less then 5 seconds. Link: https://lkml.kernel.org/r/20251105184912.186329-3-pedrodemargomes@gmail.com Signed-off-by: Pedro Demarchi Gomes Suggested-by: David Hildenbrand (Red Hat) Acked-by: David Hildenbrand (Red Hat) Signed-off-by: Andrew Morton --- mm/ksm.c | 83 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 9f74baf01e46..43be57a6a3fd 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -607,35 +607,50 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } -static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, +static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct folio *folio = NULL; + unsigned long *found_addr = (unsigned long *) walk->private; + struct mm_struct *mm = walk->mm; + pte_t *start_ptep, *ptep; spinlock_t *ptl; - pte_t *pte; - pte_t ptent; - int found; + int found = 0; - pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - if (!pte) + if (ksm_test_exit(walk->mm)) return 0; - ptent = ptep_get(pte); - if (pte_present(ptent)) { - folio = vm_normal_folio(walk->vma, addr, ptent); - } else if (!pte_none(ptent)) { - swp_entry_t entry = pte_to_swp_entry(ptent); + if (signal_pending(current)) + return -ERESTARTSYS; - /* - * As KSM pages remain KSM pages until freed, no need to wait - * here for migration to end. - */ - if (is_migration_entry(entry)) - folio = pfn_swap_entry_folio(entry); + start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!start_ptep) + return 0; + + for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) { + pte_t pte = ptep_get(ptep); + struct folio *folio = NULL; + + if (pte_present(pte)) { + folio = vm_normal_folio(walk->vma, addr, pte); + } else if (!pte_none(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + /* + * As KSM pages remain KSM pages until freed, no need to wait + * here for migration to end. 
+ */ + if (is_migration_entry(entry)) + folio = pfn_swap_entry_folio(entry); + } + /* return 1 if the page is an normal ksm page or KSM-placed zero page */ + found = (folio && folio_test_ksm(folio)) || + (pte_present(pte) && is_ksm_zero_pte(pte)); + if (found) { + *found_addr = addr; + goto out_unlock; + } } - /* return 1 if the page is an normal ksm page or KSM-placed zero page */ - found = (folio && folio_test_ksm(folio)) || - (pte_present(ptent) && is_ksm_zero_pte(ptent)); - pte_unmap_unlock(pte, ptl); +out_unlock: + pte_unmap_unlock(ptep, ptl); return found; } @@ -662,7 +677,8 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = { * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ -static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) +static int break_ksm(struct vm_area_struct *vma, unsigned long addr, + unsigned long end, bool lock_vma) { vm_fault_t ret = 0; const struct mm_walk_ops *ops = lock_vma ? @@ -672,11 +688,9 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v int ksm_page; cond_resched(); - ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL); - if (WARN_ON_ONCE(ksm_page < 0)) + ksm_page = walk_page_range_vma(vma, addr, end, ops, &addr); + if (ksm_page <= 0) return ksm_page; - if (!ksm_page) - return 0; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, NULL); @@ -762,7 +776,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item) mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) - break_ksm(vma, addr, false); + break_ksm(vma, addr, addr + PAGE_SIZE, false); mmap_read_unlock(mm); } @@ -1073,18 +1087,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) static int unmerge_ksm_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool lock_vma) { - unsigned long addr; - int err = 0; - - for (addr = start; addr < end && !err; addr += PAGE_SIZE) { - if (ksm_test_exit(vma->vm_mm)) - break; - if (signal_pending(current)) - err = -ERESTARTSYS; - else - err = break_ksm(vma, addr, lock_vma); - } - return err; + return break_ksm(vma, start, end, lock_vma); } static inline From 05c3fa9c9fa636b4e5856b0d86c3f194bbc804e4 Mon Sep 17 00:00:00 2001 From: Pedro Demarchi Gomes Date: Wed, 5 Nov 2025 15:49:12 -0300 Subject: [PATCH 143/321] ksm: replace function unmerge_ksm_pages with break_ksm Function unmerge_ksm_pages() is unnecessary since now break_ksm() walks an address range. So replace it with break_ksm(). Link: https://lkml.kernel.org/r/20251105184912.186329-4-pedrodemargomes@gmail.com Signed-off-by: Pedro Demarchi Gomes Suggested-by: David Hildenbrand (Red Hat) Acked-by: David Hildenbrand (Red Hat) Signed-off-by: Andrew Morton --- mm/ksm.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 43be57a6a3fd..f9a1a3658ead 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -665,6 +665,18 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = { }; /* + * Though it's very tempting to unmerge rmap_items from stable tree rather + * than check every pte of a given vma, the locking doesn't quite work for + * that - an rmap_item is assigned to the stable tree after inserting ksm + * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing + * rmap_items from parent to child at fork time (so as not to waste time + * if exit comes before the next scan reaches it). 
+ * + * Similarly, although we'd like to remove rmap_items (so updating counts + * and freeing memory) when unmerging an area, it's easier to leave that + * to the next pass of ksmd - consider, for example, how ksmd might be + * in cmp_and_merge_page on one of the rmap_items we would be removing. + * * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. * @@ -1071,25 +1083,6 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) } } -/* - * Though it's very tempting to unmerge rmap_items from stable tree rather - * than check every pte of a given vma, the locking doesn't quite work for - * that - an rmap_item is assigned to the stable tree after inserting ksm - * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing - * rmap_items from parent to child at fork time (so as not to waste time - * if exit comes before the next scan reaches it). - * - * Similarly, although we'd like to remove rmap_items (so updating counts - * and freeing memory) when unmerging an area, it's easier to leave that - * to the next pass of ksmd - consider, for example, how ksmd might be - * in cmp_and_merge_page on one of the rmap_items we would be removing. - */ -static int unmerge_ksm_pages(struct vm_area_struct *vma, - unsigned long start, unsigned long end, bool lock_vma) -{ - return break_ksm(vma, start, end, lock_vma); -} - static inline struct ksm_stable_node *folio_stable_node(const struct folio *folio) { @@ -1227,8 +1220,7 @@ static int unmerge_and_remove_all_rmap_items(void) for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; - err = unmerge_ksm_pages(vma, - vma->vm_start, vma->vm_end, false); + err = break_ksm(vma, vma->vm_start, vma->vm_end, false); if (err) goto error; } @@ -2855,7 +2847,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma) return 0; if (vma->anon_vma) { - err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true); + err = break_ksm(vma, vma->vm_start, vma->vm_end, true); if (err) return err; } @@ -3007,7 +2999,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, return 0; /* just ignore the advice */ if (vma->anon_vma) { - err = unmerge_ksm_pages(vma, start, end, true); + err = break_ksm(vma, start, end, true); if (err) return err; } @@ -3389,7 +3381,7 @@ static int ksm_memory_callback(struct notifier_block *self, * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() * and remove_all_stable_nodes() while memory is going offline: * it is unsafe for them to touch the stable tree at this time. - * But unmerge_ksm_pages(), rmap lookups and other entry points + * But break_ksm(), rmap lookups and other entry points * which do not need the ksm_thread_mutex are all safe. */ mutex_lock(&ksm_thread_mutex); From 77a7cfd96c17f2414a8319c28a12ff69b36e626a Mon Sep 17 00:00:00 2001 From: Zhang Chujun Date: Mon, 3 Nov 2025 14:59:09 +0800 Subject: [PATCH 144/321] mm/debug: fix missing space in case statement In setup_vm_debug() , the case statement for 'p' option is written as 'case'p':' without a space between 'case' and the character constant. While this is syntactically valid C, it violates the Linux kernel coding style, which requires a space after 'case'. This patch adds the missing space to comply with coding standards. 
Link: https://lkml.kernel.org/r/20251103065910.2196-1-zhangchujun@cmss.chinamobile.com Signed-off-by: Zhang Chujun Reviewed-by: Dev Jain Acked-by: Vlastimil Babka Acked-by: David Hildenbrand Reviewed-by: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/debug.c b/mm/debug.c index 64ddb0c4b4be..d860864063be 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -327,7 +327,7 @@ static int __init setup_vm_debug(char *str) while (*str) { switch (tolower(*str)) { - case'p': + case 'p': __page_init_poisoning = true; break; default: From ee040cbd6e48165ba543c1d0405596283b1514ca Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 5 Nov 2025 16:56:52 +0800 Subject: [PATCH 145/321] mm/page_alloc: don't warn about large allocations with __GFP_NOFAIL Filesystems use __GFP_NOFAIL to allocate block-sized folios for metadata reads at critical points, since they cannot afford to go read-only, shut down, or enter an inconsistent state due to memory pressure. Currently, attempting to allocate page units greater than order-1 with the __GFP_NOFAIL flag triggers a WARN_ON() in __alloc_pages_slowpath(). However, filesystems supporting large block sizes (blocksize > PAGE_SIZE) can easily require allocations larger than order-1. As Matthew Wilcox noted in [1], if we have a filesystem with 64KiB sectors, there will be many clean folios in the page cache that are 64KiB or larger. He also explained in [2] why kvmalloc isn't a valid approach here. With gfp flags and order already included in the OOM report, both Vlastimil Babka and Michal Hocko suggested that we can take the risk of removing this warning first and then observe whether a large number of related OOM reports appear. If that happens, we can consider adding special handling in other places. Link: https://lkml.kernel.org/r/20251105085652.4081123-1-libaokun@huaweicloud.com Signed-off-by: Baokun Li Suggested-by: Matthew Wilcox Link: https://lore.kernel.org/all/aQPX1-XWQjKaMTZB@casper.infradead.org [1] Link: https://lore.kernel.org/all/aQTHMI3t5mNXp0M1@casper.infradead.org [2] Suggested-by: Vlastimil Babka Link: https://lore.kernel.org/all/188a95ba-6384-4319-bb74-c0d9ec6c4079@suse.cz Suggested-by: Michal Hocko Link: https://lore.kernel.org/all/aQotQBjnDDeL_wHx@tiehlicka Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Brendan Jackman Cc: ErKun Yang Cc: Jan Kara Cc: Johannes Weiner Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: "zhangyi (F)" Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fb91c566327c..e4efda1158b2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4683,11 +4683,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int reserve_flags; if (unlikely(nofail)) { - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE(order > 1); /* * Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM, * otherwise, we may result in lockup. From 30d0a1291046a3641b5d9d547591228ad9c6aae0 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sun, 2 Nov 2025 18:44:32 +0000 Subject: [PATCH 146/321] mm: change ghes code to allow poison of non-struct pfn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Poison (or ECC) errors can be very common on a large size cluster. 
The kernel MM currently handles ECC errors / poison only on memory page backed by struct page. The handling is currently missing for the PFNMAP memory that does not have struct pages. The series adds such support. Implement a new ECC handling for memory without struct pages. Kernel MM expose registration APIs to allow modules that are managing the device to register its device memory region. MM then tracks such regions using interval tree. The mechanism is largely similar to that of ECC on pfn with struct pages. If there is an ECC error on a pfn, all the mapping to it are identified and a SIGBUS is sent to the user space processes owning those mappings. Note that there is one primary difference versus the handling of the poison on struct pages, which is to skip unmapping to the faulty PFN. This is done to handle the huge PFNMAP support added recently [1] that enables VM_PFNMAP vmas to map at PMD or PUD level. A poison to a PFN mapped in such as way would need breaking the PMD/PUD mapping into PTEs that will get mirrored into the S2. This can greatly increase the cost of table walks and have a major performance impact. nvgrace-gpu-vfio-pci module maps the device memory to user VA (Qemu) using remap_pfn_range without being added to the kernel [2]. These device memory PFNs are not backed by struct page. So make nvgrace-gpu-vfio-pci module make use of the mechanism to get poison handling support on the device memory. This patch (of 3): The GHES code allows calling of memory_failure() on the PFNs that pass the pfn_valid() check. This contract is broken for the remapped PFNs which fails the check and ghes_do_memory_failure() returns without triggering memory_failure(). Update code to allow memory_failure() call on PFNs failing pfn_valid(). Link: https://lkml.kernel.org/r/20251102184434.2406-1-ankita@nvidia.com Link: https://lkml.kernel.org/r/20251102184434.2406-2-ankita@nvidia.com Signed-off-by: Ankit Agrawal Reviewed-by: Shuai Xue Cc: Aniket Agashe Cc: Ankit Agrawal Cc: Borislav Betkov Cc: David Hildenbrand Cc: Hanjun Guo Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Kevin Tian Cc: Kirti Wankhede Cc: Len Brown Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Luck, Tony" Cc: Matthew R. 
Ochs Cc: Mauro Carvalho Chehab Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Neo Jia Cc: Peter Zijlstra Cc: Smita Koralahalli Channabasappa Cc: Suren Baghdasaryan Cc: Tarun Gupta Cc: Uwe Kleine-König Cc: Vikram Sethi Cc: Vlastimil Babka Cc: Zhi Wang Signed-off-by: Andrew Morton --- drivers/acpi/apei/ghes.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 97ee19f2cae0..91f0e23d6214 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -505,12 +505,6 @@ static bool ghes_do_memory_failure(u64 physical_addr, int flags) return false; pfn = PHYS_PFN(physical_addr); - if (!pfn_valid(pfn) && !arch_is_platform_page(physical_addr)) { - pr_warn_ratelimited(FW_WARN GHES_PFX - "Invalid address in generic error data: %#llx\n", - physical_addr); - return false; - } if (flags == MF_ACTION_REQUIRED && current->mm) { twcb = (void *)gen_pool_alloc(ghes_estatus_pool, sizeof(*twcb)); From 2ec41967189cd65a8f79c760dd1b50c4f56e8ac6 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sun, 2 Nov 2025 18:44:33 +0000 Subject: [PATCH 147/321] mm: handle poisoning of pfn without struct pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Poison (or ECC) errors can be very common on a large size cluster. The kernel MM currently does not handle ECC errors / poison on a memory region that is not backed by struct pages. If a memory region mapped using remap_pfn_range() for example, but not added to the kernel, MM will not have associated struct pages. Add a new mechanism to handle memory failure on such memory. Make kernel MM expose a function to allow modules managing the device memory to register the device memory SPA and the address space associated it. MM maintains this information as an interval tree. On poison, MM can search for the range that the poisoned PFN belong and use the address_space to determine the mapping VMA. In this implementation, kernel MM follows the following sequence that is largely similar to the memory_failure() handler for struct page backed memory: 1. memory_failure() is triggered on reception of a poison error. An absence of struct page is detected and consequently memory_failure_pfn() is executed. 2. memory_failure_pfn() collects the processes mapped to the PFN. 3. memory_failure_pfn() sends SIGBUS to all the processes mapping the faulty PFN using kill_procs(). Note that there is one primary difference versus the handling of the poison on struct pages, which is to skip unmapping to the faulty PFN. This is done to handle the huge PFNMAP support added recently [1] that enables VM_PFNMAP vmas to map at PMD or PUD level. A poison to a PFN mapped in such as way would need breaking the PMD/PUD mapping into PTEs that will get mirrored into the S2. This can greatly increase the cost of table walks and have a major performance impact. Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1] Link: https://lkml.kernel.org/r/20251102184434.2406-3-ankita@nvidia.com Signed-off-by: Ankit Agrawal Cc: Aniket Agashe Cc: Borislav Betkov Cc: David Hildenbrand Cc: Hanjun Guo Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Kevin Tian Cc: Kirti Wankhede Cc: Len Brown Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Luck, Tony" Cc: Matthew R. 
Ochs Cc: Mauro Carvalho Chehab Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Neo Jia Cc: Peter Zijlstra Cc: Shuai Xue Cc: Smita Koralahalli Channabasappa Cc: Suren Baghdasaryan Cc: Tarun Gupta Cc: Uwe Kleine-König Cc: Vikram Sethi Cc: Vlastimil Babka Cc: Zhi Wang Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + include/linux/memory-failure.h | 17 ++++ include/linux/mm.h | 1 + include/ras/ras_event.h | 1 + mm/Kconfig | 1 + mm/memory-failure.c | 145 ++++++++++++++++++++++++++++++++- 6 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 include/linux/memory-failure.h diff --git a/MAINTAINERS b/MAINTAINERS index 2625bc3d53d8..5cf6873569d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11557,6 +11557,7 @@ M: Miaohe Lin R: Naoya Horiguchi L: linux-mm@kvack.org S: Maintained +F: include/linux/memory-failure.h F: mm/hwpoison-inject.c F: mm/memory-failure.c diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h new file mode 100644 index 000000000000..bc326503d2d2 --- /dev/null +++ b/include/linux/memory-failure.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_FAILURE_H +#define _LINUX_MEMORY_FAILURE_H + +#include + +struct pfn_address_space; + +struct pfn_address_space { + struct interval_tree_node node; + struct address_space *mapping; +}; + +int register_pfn_address_space(struct pfn_address_space *pfn_space); +void unregister_pfn_address_space(struct pfn_address_space *pfn_space); + +#endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7bcd9e6fbc3c..b636d12bb651 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4285,6 +4285,7 @@ enum mf_action_page_type { MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, + MF_MSG_PFN_MAP, MF_MSG_UNKNOWN, }; diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index c8cd0f00c845..fecfeb7c8be7 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -375,6 +375,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/mm/Kconfig b/mm/Kconfig index eae03b14f7de..d548976d0e0a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -741,6 +741,7 @@ config MEMORY_FAILURE depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" select RAS + select INTERVAL_TREE help Enables code to recover from some memory failures on systems with MCA recovery. 
This allows a system to continue running diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 560884dd6250..77391b6f9f76 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -154,6 +155,10 @@ static const struct ctl_table memory_failure_table[] = { } }; +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED; + +static DEFINE_MUTEX(pfn_space_lock); + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -885,6 +890,7 @@ static const char * const action_page_types[] = { [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_ALREADY_POISONED] = "already poisoned page", + [MF_MSG_PFN_MAP] = "non struct page pfn", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1277,7 +1283,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - if (type != MF_MSG_ALREADY_POISONED) { + if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) { num_poisoned_pages_inc(pfn); update_per_node_mf_stats(pfn, result); } @@ -2147,6 +2153,135 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, kill_procs(&tokill, true, pfn, flags); } +int register_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + return -EBUSY; + + interval_tree_insert(&pfn_space->node, &pfn_space_itree); + + return 0; +} +EXPORT_SYMBOL_GPL(register_pfn_address_space); + +void unregister_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + interval_tree_remove(&pfn_space->node, &pfn_space_itree); +} +EXPORT_SYMBOL_GPL(unregister_pfn_address_space); + +static void add_to_kill_pfn(struct task_struct *tsk, + struct vm_area_struct *vma, + struct list_head *to_kill, + unsigned long pfn) +{ + struct to_kill *tk; + + tk = kmalloc(sizeof(*tk), GFP_ATOMIC); + if (!tk) { + pr_info("Unable to kill proc %d\n", tsk->pid); + return; + } + + /* Check for pgoff not backed by struct page */ + tk->addr = vma_address(vma, pfn, 1); + tk->size_shift = PAGE_SHIFT; + + if (tk->addr == -EFAULT) + pr_info("Unable to find address %lx in %s\n", + pfn, tsk->comm); + + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Collect processes when the error hit a PFN not backed by struct page. + */ +static void collect_procs_pfn(struct address_space *mapping, + unsigned long pfn, struct list_head *to_kill) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + + i_mmap_lock_read(mapping); + rcu_read_lock(); + for_each_process(tsk) { + struct task_struct *t = tsk; + + t = task_early_kill(tsk, true); + if (!t) + continue; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) { + if (vma->vm_mm == t->mm) + add_to_kill_pfn(t, vma, to_kill, pfn); + } + } + rcu_read_unlock(); + i_mmap_unlock_read(mapping); +} + +/** + * memory_failure_pfn - Handle memory failure on a page not backed by + * struct page. + * @pfn: Page Number of the corrupted page + * @flags: fine tune action taken + * + * Return: + * 0 - success, + * -EBUSY - Page PFN does not belong to any address space mapping. 
+ */ +static int memory_failure_pfn(unsigned long pfn, int flags) +{ + struct interval_tree_node *node; + LIST_HEAD(tokill); + + scoped_guard(mutex, &pfn_space_lock) { + bool mf_handled = false; + + /* + * Modules registers with MM the address space mapping to + * the device memory they manage. Iterate to identify + * exactly which address space has mapped to this failing + * PFN. + */ + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; + node = interval_tree_iter_next(node, pfn, pfn)) { + struct pfn_address_space *pfn_space = + container_of(node, struct pfn_address_space, node); + + collect_procs_pfn(pfn_space->mapping, pfn, &tokill); + + mf_handled = true; + } + + if (!mf_handled) + return action_result(pfn, MF_MSG_PFN_MAP, MF_IGNORED); + } + + /* + * Unlike System-RAM there is no possibility to swap in a different + * physical page at a given virtual address, so all userspace + * consumption of direct PFN memory necessitates SIGBUS (i.e. + * MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + + kill_procs(&tokill, true, pfn, flags); + + return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED); +} + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -2196,6 +2331,14 @@ int memory_failure(unsigned long pfn, int flags) if (res == 0) goto unlock_mutex; + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { + /* + * The PFN is not backed by struct page. + */ + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; + } + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn); put_ref_page(pfn, flags); From ebb9aeb980e5d3d8d9505d187005b02942cc1cd9 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sun, 2 Nov 2025 18:44:34 +0000 Subject: [PATCH 148/321] vfio/nvgrace-gpu: register device memory for poison handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA (Qemu) using remap_pfn_range() without adding the memory to the kernel. The device memory pages are not backed by struct page. The previous patch implements the mechanism to handle ECC/poison on memory page without struct page. This new mechanism is being used here. The module registers its memory region and the address_space with the kernel MM for ECC handling using the register_pfn_address_space() registration API exposed by the kernel. Link: https://lore.kernel.org/all/20240220115055.23546-1-ankita@nvidia.com/ [1] Link: https://lkml.kernel.org/r/20251102184434.2406-4-ankita@nvidia.com Signed-off-by: Ankit Agrawal Acked-by: Alex Williamson Cc: Aniket Agashe Cc: Borislav Betkov Cc: David Hildenbrand Cc: Hanjun Guo Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Kevin Tian Cc: Kirti Wankhede Cc: Len Brown Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Luck, Tony" Cc: Matthew R. 
Ochs Cc: Mauro Carvalho Chehab Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Neo Jia Cc: Peter Zijlstra Cc: Shuai Xue Cc: Smita Koralahalli Channabasappa Cc: Suren Baghdasaryan Cc: Tarun Gupta Cc: Uwe Kleine-König Cc: Vikram Sethi Cc: Vlastimil Babka Cc: Zhi Wang Signed-off-by: Andrew Morton --- drivers/vfio/pci/nvgrace-gpu/main.c | 45 ++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e346392b72f6..3ce56d039cbe 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -8,6 +8,10 @@ #include #include +#ifdef CONFIG_MEMORY_FAILURE +#include +#endif + /* * The device memory usable to the workloads running in the VM is cached * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region) @@ -47,6 +51,9 @@ struct mem_region { void *memaddr; void __iomem *ioaddr; }; /* Base virtual address of the region */ +#ifdef CONFIG_MEMORY_FAILURE + struct pfn_address_space pfn_address_space; +#endif }; struct nvgrace_gpu_pci_core_device { @@ -60,6 +67,28 @@ struct nvgrace_gpu_pci_core_device { bool has_mig_hw_bug; }; +#ifdef CONFIG_MEMORY_FAILURE + +static int +nvgrace_gpu_vfio_pci_register_pfn_range(struct mem_region *region, + struct vm_area_struct *vma) +{ + unsigned long nr_pages; + int ret = 0; + + nr_pages = region->memlength >> PAGE_SHIFT; + + region->pfn_address_space.node.start = vma->vm_pgoff; + region->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1; + region->pfn_address_space.mapping = vma->vm_file->f_mapping; + + ret = register_pfn_address_space(®ion->pfn_address_space); + + return ret; +} + +#endif + static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) { struct nvgrace_gpu_pci_core_device *nvdev = @@ -127,6 +156,13 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) mutex_destroy(&nvdev->remap_lock); +#ifdef CONFIG_MEMORY_FAILURE + if (nvdev->resmem.memlength) + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); + + unregister_pfn_address_space(&nvdev->usemem.pfn_address_space); +#endif + vfio_pci_core_close_device(core_vdev); } @@ -202,7 +238,14 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, vma->vm_pgoff = start_pfn; - return 0; +#ifdef CONFIG_MEMORY_FAILURE + if (nvdev->resmem.memlength && index == VFIO_PCI_BAR2_REGION_INDEX) + ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->resmem, vma); + else if (index == VFIO_PCI_BAR4_REGION_INDEX) + ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->usemem, vma); +#endif + + return ret; } static long From b5ab490d85b772bc99d2648182a282f39f08feb6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:19:55 -0700 Subject: [PATCH 149/321] mm/damon/tests/core-kunit: fix memory leak in damon_test_set_filters_default_reject() Patch series "mm/damon/tests: fix memory bugs in kunit tests". DAMON kunit tests were initially written assuming those will be run on environments that are well controlled and therefore tolerant to transient test failures and bugs in the test code itself. The user-mode linux based manual run of the tests is one example of such an environment. And the test code was written for adding more test coverage as fast as possible, over making those safe and reliable. As a result, the tests resulted in having a number of bugs including real memory leaks, theoretical unhandled memory allocation failures, and unused memory allocations. 
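The shape of the fix applied throughout the series is simple: check each small allocation, free whatever the test already holds, and bail out with kunit_skip() rather than dereferencing NULL. A generic sketch of that pattern, using plain kzalloc() in place of the DAMON constructors and a made-up test name:

#include <kunit/test.h>
#include <linux/slab.h>

static void demo_alloc_failure_handling(struct kunit *test)
{
        int *a, *b;

        a = kzalloc(sizeof(*a), GFP_KERNEL);
        if (!a)
                kunit_skip(test, "first alloc fail");

        b = kzalloc(sizeof(*b), GFP_KERNEL);
        if (!b) {
                kfree(a);       /* release what is already held */
                kunit_skip(test, "second alloc fail");
        }

        KUNIT_EXPECT_EQ(test, *a, *b);  /* both zero-initialised */

        kfree(b);
        kfree(a);
}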
The allocation failures that are not handled well are unlikely in the real world, since those allocations are too small to fail. But in theory, it can happen and cause inappropriate memory access. It is arguable if bugs in test code can really harm users. But, anyway bugs are bugs that need to be fixed. Fix the bugs one by one. Also Cc stable@ for the fixes of memory leak and unhandled memory allocation failures. The unused memory allocations are only a matter of memory efficiency, so not Cc-ing stable@. The first patch fixes memory leaks in the test code for the DAMON core layer. Following fifteen, three, and one patches respectively fix unhandled memory allocation failures in the test code for DAMON core layer, virtual address space DAMON operation set, and DAMON sysfs interface, one by one per test function. Final two patches remove memory allocations that are correctly deallocated at the end, but not really being used by any code. This patch (of 22): Kunit test function for damos_set_filters_default_reject() allocates two 'struct damos_filter' objects and not deallocates those, so that the memory for the two objects are leaked for every time the test runs. Fix this by deallocating those objects at the end of the test code. Link: https://lkml.kernel.org/r/20251101182021.74868-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251101182021.74868-2-sj@kernel.org Fixes: 094fb14913c7 ("mm/damon/tests/core-kunit: add a test for damos_set_filters_default_reject()") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [6.16+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 51369e35298b..69ca44f9270b 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -598,6 +598,9 @@ static void damon_test_set_filters_default_reject(struct kunit *test) */ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true); + + damos_free_filter(anon_filter); + damos_free_filter(target_filter); } static struct kunit_case damon_test_cases[] = { From e16fdd4f754048d6e23c56bd8d920b71e41e3777 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:19:56 -0700 Subject: [PATCH 150/321] mm/damon/tests/core-kunit: handle allocation failures in damon_test_regions() damon_test_regions() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. 
Link: https://lkml.kernel.org/r/20251101182021.74868-3-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [5.15+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 69ca44f9270b..a2c9ee7a5de1 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -20,11 +20,17 @@ static void damon_test_regions(struct kunit *test) struct damon_target *t; r = damon_new_region(1, 2); + if (!r) + kunit_skip(test, "region alloc fail"); KUNIT_EXPECT_EQ(test, 1ul, r->ar.start); KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); t = damon_new_target(); + if (!t) { + damon_free_region(r); + kunit_skip(test, "target alloc fail"); + } KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); damon_add_region(r, t); From fafe953de2c661907c94055a2497c6b8dbfd26f3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:19:57 -0700 Subject: [PATCH 151/321] mm/damon/tests/core-kunit: handle memory failure from damon_test_target() damon_test_target() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-4-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [5.15+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index a2c9ee7a5de1..6e8a605277a3 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -58,7 +58,14 @@ static void damon_test_target(struct kunit *test) struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; + if (!c) + kunit_skip(test, "ctx alloc fail"); + t = damon_new_target(); + if (!t) { + damon_destroy_ctx(c); + kunit_skip(test, "target alloc fail"); + } KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); damon_add_target(c, t); From f79f2fc44ebd0ed655239046be3e80e8804b5545 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:19:58 -0700 Subject: [PATCH 152/321] mm/damon/tests/core-kunit: handle memory alloc failure from damon_test_aggregate() damon_test_aggregate() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. 
Link: https://lkml.kernel.org/r/20251101182021.74868-5-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [5.15+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 6e8a605277a3..fd1e1ecaa2c9 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -97,8 +97,15 @@ static void damon_test_aggregate(struct kunit *test) struct damon_region *r; int it, ir; + if (!ctx) + kunit_skip(test, "ctx alloc fail"); + for (it = 0; it < 3; it++) { t = damon_new_target(); + if (!t) { + damon_destroy_ctx(ctx); + kunit_skip(test, "target alloc fail"); + } damon_add_target(ctx, t); } @@ -106,6 +113,10 @@ static void damon_test_aggregate(struct kunit *test) damon_for_each_target(t, ctx) { for (ir = 0; ir < 3; ir++) { r = damon_new_region(saddr[it][ir], eaddr[it][ir]); + if (!r) { + damon_destroy_ctx(ctx); + kunit_skip(test, "region alloc fail"); + } r->nr_accesses = accesses[it][ir]; r->nr_accesses_bp = accesses[it][ir] * 10000; damon_add_region(r, t); From 5e80d73f22043c59c8ad36452a3253937ed77955 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:19:59 -0700 Subject: [PATCH 153/321] mm/damon/tests/core-kunit: handle alloc failures on damon_test_split_at() damon_test_split_at() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-6-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [5.15+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index fd1e1ecaa2c9..f5f3152cb8df 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -148,8 +148,19 @@ static void damon_test_split_at(struct kunit *test) struct damon_target *t; struct damon_region *r, *r_new; + if (!c) + kunit_skip(test, "ctx alloc fail"); t = damon_new_target(); + if (!t) { + damon_destroy_ctx(c); + kunit_skip(test, "target alloc fail"); + } r = damon_new_region(0, 100); + if (!r) { + damon_destroy_ctx(c); + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } r->nr_accesses_bp = 420000; r->nr_accesses = 42; r->last_nr_accesses = 15; From 3d443dd29a1db7efa587a4bb0c06a497e13ca9e4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:00 -0700 Subject: [PATCH 154/321] mm/damon/tests/core-kunit: handle alloc failures on damon_test_merge_two() damon_test_merge_two() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. 
Link: https://lkml.kernel.org/r/20251101182021.74868-7-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [5.15+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index f5f3152cb8df..e8219fd23318 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -188,11 +188,21 @@ static void damon_test_merge_two(struct kunit *test) int i; t = damon_new_target(); + if (!t) + kunit_skip(test, "target alloc fail"); r = damon_new_region(0, 100); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } r->nr_accesses = 10; r->nr_accesses_bp = 100000; damon_add_region(r, t); r2 = damon_new_region(100, 300); + if (!r2) { + damon_free_target(t); + kunit_skip(test, "second region alloc fail"); + } r2->nr_accesses = 20; r2->nr_accesses_bp = 200000; damon_add_region(r2, t); From 0998d2757218771c59d5ca59ccf13d1542a38f17 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:01 -0700 Subject: [PATCH 155/321] mm/damon/tests/core-kunit: handle alloc failures on dasmon_test_merge_regions_of() damon_test_merge_regions_of() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-8-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [5.15+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index e8219fd23318..98f2a3de7cea 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -248,8 +248,14 @@ static void damon_test_merge_regions_of(struct kunit *test) int i; t = damon_new_target(); + if (!t) + kunit_skip(test, "target alloc fail"); for (i = 0; i < ARRAY_SIZE(sa); i++) { r = damon_new_region(sa[i], ea[i]); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } r->nr_accesses = nrs[i]; r->nr_accesses_bp = nrs[i] * 10000; damon_add_region(r, t); From eded254cb69044bd4abde87394ea44909708d7c0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:02 -0700 Subject: [PATCH 156/321] mm/damon/tests/core-kunit: handle alloc failures on damon_test_split_regions_of() damon_test_split_regions_of() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. 
Link: https://lkml.kernel.org/r/20251101182021.74868-9-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [5.15+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 98f2a3de7cea..10618cdd188e 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -278,15 +278,35 @@ static void damon_test_split_regions_of(struct kunit *test) struct damon_target *t; struct damon_region *r; + if (!c) + kunit_skip("ctx alloc fail"); t = damon_new_target(); + if (!t) { + damon_destroy_ctx(c); + kunit_skip(test, "target alloc fail"); + } r = damon_new_region(0, 22); + if (!r) { + damon_destroy_ctx(c); + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } damon_add_region(r, t); damon_split_regions_of(t, 2, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); + if (!t) { + damon_destroy_ctx(c); + kunit_skip(test, "second target alloc fail"); + } r = damon_new_region(0, 220); + if (!r) { + damon_destroy_ctx(c); + damon_free_target(t); + kunit_skip(test, "second region alloc fail"); + } damon_add_region(r, t); damon_split_regions_of(t, 4, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); From 4f835f4e8c863985f15abd69db033c2f66546094 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:03 -0700 Subject: [PATCH 157/321] mm/damon/tests/core-kunit: handle alloc failures in damon_test_ops_registration() damon_test_ops_registration() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-10-sj@kernel.org Fixes: 4f540f5ab4f2 ("mm/damon/core-test: add a kunit test case for ops registration") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [5.19+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 10618cdd188e..96c8f1269f44 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -320,6 +320,9 @@ static void damon_test_ops_registration(struct kunit *test) struct damon_operations ops = {.id = DAMON_OPS_VADDR}, bak; bool need_cleanup = false; + if (!c) + kunit_skip(test, "ctx alloc fail"); + /* DAMON_OPS_VADDR is registered only if CONFIG_DAMON_VADDR is set */ if (!damon_is_registered_ops(DAMON_OPS_VADDR)) { bak.id = DAMON_OPS_VADDR; From 74d5969995d129fd59dd93b9c7daa6669cb6810f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:04 -0700 Subject: [PATCH 158/321] mm/damon/tests/core-kunit: handle alloc failures in damon_test_set_regions() damon_test_set_regions() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. 
Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-11-sj@kernel.org Fixes: 62f409560eb2 ("mm/damon/core-test: test damon_set_regions") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [6.1+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 96c8f1269f44..e38c95f86a68 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -368,13 +368,26 @@ static void damon_test_ops_registration(struct kunit *test) static void damon_test_set_regions(struct kunit *test) { struct damon_target *t = damon_new_target(); - struct damon_region *r1 = damon_new_region(4, 16); - struct damon_region *r2 = damon_new_region(24, 32); + struct damon_region *r1, *r2; struct damon_addr_range range = {.start = 8, .end = 28}; unsigned long expects[] = {8, 16, 16, 24, 24, 28}; int expect_idx = 0; struct damon_region *r; + if (!t) + kunit_skip(test, "target alloc fail"); + r1 = damon_new_region(4, 16); + if (!r1) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } + r2 = damon_new_region(24, 32); + if (!r2) { + damon_free_target(t); + damon_free_region(r1); + kunit_skip(test, "second region alloc fail"); + } + damon_add_region(r1, t); damon_add_region(r2, t); damon_set_regions(t, &range, 1, DAMON_MIN_REGION); From 8cf298c01b7fdb08eef5b6b26d0fe98d48134d72 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:05 -0700 Subject: [PATCH 159/321] mm/damon/tests/core-kunit: handle alloc failures in damon_test_update_monitoring_result() damon_test_update_monitoring_result() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-12-sj@kernel.org Fixes: f4c978b6594b ("mm/damon/core-test: add a test for damon_update_monitoring_results()") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [6.3+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index e38c95f86a68..10c9953581ee 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -429,6 +429,9 @@ static void damon_test_update_monitoring_result(struct kunit *test) struct damon_attrs new_attrs; struct damon_region *r = damon_new_region(3, 7); + if (!r) + kunit_skip(test, "region alloc fail"); + r->nr_accesses = 15; r->nr_accesses_bp = 150000; r->age = 20; From 915a2453d824a9b6bf724e3f970d86ae1d092a61 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:06 -0700 Subject: [PATCH 160/321] mm/damon/tests/core-kunit: handle alloc failure on damon_test_set_attrs() damon_test_set_attrs() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. 
Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-13-sj@kernel.org Fixes: aa13779be6b7 ("mm/damon/core-test: add a test for damon_set_attrs()") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [6.5+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 10c9953581ee..b9bd69a57e62 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -465,6 +465,9 @@ static void damon_test_set_attrs(struct kunit *test) .sample_interval = 5000, .aggr_interval = 100000,}; struct damon_attrs invalid_attrs; + if (!c) + kunit_skip(test, "ctx alloc fail"); + KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &valid_attrs), 0); invalid_attrs = valid_attrs; From 28ab2265e9422ccd81e4beafc0ace90f78de04c4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:07 -0700 Subject: [PATCH 161/321] mm/damon/tests/core-kunit: handle alloc failres in damon_test_new_filter() damon_test_new_filter() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-14-sj@kernel.org Fixes: 2a158e956b98 ("mm/damon/core-test: add a test for damos_new_filter()") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [6.6+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index b9bd69a57e62..03c7ac31db5c 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -505,6 +505,8 @@ static void damos_test_new_filter(struct kunit *test) struct damos_filter *filter; filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false); + if (!filter) + kunit_skip(test, "filter alloc fail"); KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON); KUNIT_EXPECT_EQ(test, filter->matching, true); KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list); From 3e5c4a1a1737bd79abaaa184233d0f815e62273b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:08 -0700 Subject: [PATCH 162/321] mm/damon/tests/core-kunit: handle alloc failure on damos_test_commit_filter() damon_test_commit_filter() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. 
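With only two objects involved, the open-coded unwinding used by this patch is arguably the simplest form; for comparison, the same cleanup could also be written in the kernel's usual goto style. A sketch of that alternative (illustrative only, and at the cost of a less specific skip message):

        static void damos_commit_filter_goto_sketch(struct kunit *test)
        {
                struct damos_filter *src_filter, *dst_filter;

                src_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true);
                if (!src_filter)
                        goto skip;
                dst_filter = damos_new_filter(DAMOS_FILTER_TYPE_ACTIVE, false, false);
                if (!dst_filter)
                        goto free_src;

                /* the real test's damos_commit_filter() checks would run here */

                damos_destroy_filter(dst_filter);
                damos_destroy_filter(src_filter);
                return;

        free_src:
                damos_destroy_filter(src_filter);
        skip:
                kunit_skip(test, "filter alloc fail");
        }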
Link: https://lkml.kernel.org/r/20251101182021.74868-15-sj@kernel.org Fixes: f6a4a150f1ec ("mm/damon/tests/core-kunit: add damos_commit_filter test") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [6.18+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 03c7ac31db5c..5af8275ffd7d 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -516,11 +516,16 @@ static void damos_test_new_filter(struct kunit *test) static void damos_test_commit_filter(struct kunit *test) { - struct damos_filter *src_filter = damos_new_filter( - DAMOS_FILTER_TYPE_ANON, true, true); - struct damos_filter *dst_filter = damos_new_filter( - DAMOS_FILTER_TYPE_ACTIVE, false, false); + struct damos_filter *src_filter, *dst_filter; + src_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true); + if (!src_filter) + kunit_skip(test, "src filter alloc fail"); + dst_filter = damos_new_filter(DAMOS_FILTER_TYPE_ACTIVE, false, false); + if (!dst_filter) { + damos_destroy_filter(src_filter); + kunit_skip(test, "dst filter alloc fail"); + } damos_commit_filter(dst_filter, src_filter); KUNIT_EXPECT_EQ(test, dst_filter->type, src_filter->type); KUNIT_EXPECT_EQ(test, dst_filter->matching, src_filter->matching); From d14d5671e7c9cc788c5a1edfa94e6f9064275905 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:09 -0700 Subject: [PATCH 163/321] mm/damon/tests/core-kunit: handle alloc failures on damos_test_filter_out() damon_test_filter_out() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-16-sj@kernel.org Fixes: 26713c890875 ("mm/damon/core-test: add a unit test for __damos_filter_out()") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [6.6+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 5af8275ffd7d..a03ae9ddd88a 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -542,11 +542,22 @@ static void damos_test_filter_out(struct kunit *test) struct damos_filter *f; f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false); + if (!f) + kunit_skip(test, "filter alloc fail"); f->addr_range = (struct damon_addr_range){ .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6}; t = damon_new_target(); + if (!t) { + damos_destroy_filter(f); + kunit_skip(test, "target alloc fail"); + } r = damon_new_region(DAMON_MIN_REGION * 3, DAMON_MIN_REGION * 5); + if (!r) { + damos_destroy_filter(f); + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } damon_add_region(r, t); /* region in the range */ From 84be856cc87317bc60ff54bd7c8f8a5aa8f0e2c8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:10 -0700 Subject: [PATCH 164/321] mm/damon/tests/core-kunit: handle alloc failures on damon_test_set_filters_default_reject() damon_test_set_filters_default_reject() is assuming all dynamic memory allocation in it will succeed. 
Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-17-sj@kernel.org Fixes: 094fb14913c7 ("mm/damon/tests/core-kunit: add a test for damos_set_filters_default_reject()") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [6.16+] Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index a03ae9ddd88a..a91d798caa70 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -659,6 +659,8 @@ static void damon_test_set_filters_default_reject(struct kunit *test) KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true); + if (!target_filter) + kunit_skip(test, "filter alloc fail"); damos_add_filter(&scheme, target_filter); damos_set_filters_default_reject(&scheme); /* @@ -684,6 +686,10 @@ static void damon_test_set_filters_default_reject(struct kunit *test) KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true); + if (!anon_filter) { + damos_free_filter(target_filter); + kunit_skip(test, "anon_filter alloc fail"); + } damos_add_filter(&scheme, anon_filter); damos_set_filters_default_reject(&scheme); From 2b22d0fcc6320ba29b2122434c1d2f0785fb0a25 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:11 -0700 Subject: [PATCH 165/321] mm/damon/tests/vaddr-kunit: handle alloc failures on damon_do_test_apply_three_regions() damon_do_test_apply_three_regions() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-18-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [5.15+] Signed-off-by: Andrew Morton --- mm/damon/tests/vaddr-kunit.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index fce38dd53cf8..484223f19545 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -136,8 +136,14 @@ static void damon_do_test_apply_three_regions(struct kunit *test, int i; t = damon_new_target(); + if (!t) + kunit_skip(test, "target alloc fail"); for (i = 0; i < nr_regions / 2; i++) { r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); + if (!r) { + damon_destroy_target(t, NULL); + kunit_skip(test, "region alloc fail"); + } damon_add_region(r, t); } From 7890e5b5bb6e386155c6e755fe70e0cdcc77f18e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:12 -0700 Subject: [PATCH 166/321] mm/damon/tests/vaddr-kunit: handle alloc failures in damon_test_split_evenly_fail() damon_test_split_evenly_fail() is assuming all dynamic memory allocation in it will succeed. 
Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-19-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [5.15+] Signed-off-by: Andrew Morton --- mm/damon/tests/vaddr-kunit.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index 484223f19545..1b0f21c2e376 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -256,7 +256,16 @@ static void damon_test_split_evenly_fail(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { struct damon_target *t = damon_new_target(); - struct damon_region *r = damon_new_region(start, end); + struct damon_region *r; + + if (!t) + kunit_skip(test, "target alloc fail"); + + r = damon_new_region(start, end); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } damon_add_region(r, t); KUNIT_EXPECT_EQ(test, From 0a63a0e7570b9b2631dfb8d836dc572709dce39e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:13 -0700 Subject: [PATCH 167/321] mm/damon/tests/vaddr-kunit: handle alloc failures on damon_test_split_evenly_succ() damon_test_split_evenly_succ() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-20-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [5.15+] Signed-off-by: Andrew Morton --- mm/damon/tests/vaddr-kunit.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index 1b0f21c2e376..30dc5459f1d2 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -284,10 +284,17 @@ static void damon_test_split_evenly_succ(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { struct damon_target *t = damon_new_target(); - struct damon_region *r = damon_new_region(start, end); + struct damon_region *r; unsigned long expected_width = (end - start) / nr_pieces; unsigned long i = 0; + if (!t) + kunit_skip(test, "target alloc fail"); + r = damon_new_region(start, end); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } damon_add_region(r, t); KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, nr_pieces), 0); From 7d808bf13943f4c6a6142400bffe14267f6dc997 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:14 -0700 Subject: [PATCH 168/321] mm/damon/tests/sysfs-kunit: handle alloc failures on damon_sysfs_test_add_targets() damon_sysfs_test_add_targets() is assuming all dynamic memory allocation in it will succeed. Those are indeed likely in the real use cases since those allocations are too small to fail, but theoretically those could fail. 
In the case, inappropriate memory access can happen. Fix it by appropriately cleanup pre-allocated memory and skip the execution of the remaining tests in the failure cases. Link: https://lkml.kernel.org/r/20251101182021.74868-21-sj@kernel.org Fixes: b8ee5575f763 ("mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets()") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Cc: [6.7+] Signed-off-by: Andrew Morton --- mm/damon/tests/sysfs-kunit.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/mm/damon/tests/sysfs-kunit.h b/mm/damon/tests/sysfs-kunit.h index 7b5c7b307da9..ce7218469f20 100644 --- a/mm/damon/tests/sysfs-kunit.h +++ b/mm/damon/tests/sysfs-kunit.h @@ -45,16 +45,41 @@ static void damon_sysfs_test_add_targets(struct kunit *test) struct damon_ctx *ctx; sysfs_targets = damon_sysfs_targets_alloc(); + if (!sysfs_targets) + kunit_skip(test, "sysfs_targets alloc fail"); sysfs_targets->nr = 1; sysfs_targets->targets_arr = kmalloc_array(1, sizeof(*sysfs_targets->targets_arr), GFP_KERNEL); + if (!sysfs_targets->targets_arr) { + kfree(sysfs_targets); + kunit_skip(test, "targets_arr alloc fail"); + } sysfs_target = damon_sysfs_target_alloc(); + if (!sysfs_target) { + kfree(sysfs_targets->targets_arr); + kfree(sysfs_targets); + kunit_skip(test, "sysfs_target alloc fail"); + } sysfs_target->pid = __damon_sysfs_test_get_any_pid(12, 100); sysfs_target->regions = damon_sysfs_regions_alloc(); + if (!sysfs_target->regions) { + kfree(sysfs_targets->targets_arr); + kfree(sysfs_targets); + kfree(sysfs_target); + kunit_skip(test, "sysfs_regions alloc fail"); + } + sysfs_targets->targets_arr[0] = sysfs_target; ctx = damon_new_ctx(); + if (!ctx) { + kfree(sysfs_targets->targets_arr); + kfree(sysfs_targets); + kfree(sysfs_target); + kfree(sysfs_target->regions); + kunit_skip(test, "ctx alloc fail"); + } damon_sysfs_add_targets(ctx, sysfs_targets); KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(ctx)); From 40b11d1eb19cf5c53a642d35f27fd1eafd9e0caf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:15 -0700 Subject: [PATCH 169/321] mm/damon/tests/core-kunit: remove unnecessary damon_ctx variable on damon_test_split_at() damon_test_split_at() dynamically allocates a 'struct damon_ctx' object, but it is not really being used in the code other than handling the allocation failure and deallocating it at the end of the function. Remove the unnecessary allocation and deallocation of the object. 
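For clarity, the shape the test takes after the change, condensed into a sketch (the full hunk appears in the diff below; the split-and-compare assertions are elided here). Dropping the unused ctx removes one error path outright and simplifies the remaining two:

        static void damon_test_split_at_shape_sketch(struct kunit *test)
        {
                struct damon_target *t = damon_new_target();
                struct damon_region *r;

                if (!t)
                        kunit_skip(test, "target alloc fail");
                r = damon_new_region(0, 100);
                if (!r) {
                        damon_free_target(t);
                        kunit_skip(test, "region alloc fail");
                }
                damon_add_region(r, t);
                /* the split-and-compare assertions of the real test run here */
                damon_free_target(t);
        }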
Link: https://lkml.kernel.org/r/20251101182021.74868-22-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index a91d798caa70..726f593930e7 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -144,20 +144,14 @@ static void damon_test_aggregate(struct kunit *test) static void damon_test_split_at(struct kunit *test) { - struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; struct damon_region *r, *r_new; - if (!c) - kunit_skip(test, "ctx alloc fail"); t = damon_new_target(); - if (!t) { - damon_destroy_ctx(c); + if (!t) kunit_skip(test, "target alloc fail"); - } r = damon_new_region(0, 100); if (!r) { - damon_destroy_ctx(c); damon_free_target(t); kunit_skip(test, "region alloc fail"); } @@ -178,7 +172,6 @@ static void damon_test_split_at(struct kunit *test) KUNIT_EXPECT_EQ(test, r->last_nr_accesses, r_new->last_nr_accesses); damon_free_target(t); - damon_destroy_ctx(c); } static void damon_test_merge_two(struct kunit *test) From 80d725f96c44e6e8eff02a9820dbbeef6a52091a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Nov 2025 11:20:16 -0700 Subject: [PATCH 170/321] mm/damon/tests/core-kunit: remove unused ctx in damon_test_split_regions_of() damon_test_split_regions_of() dynamically allocates a 'struct damon_ctx' object, but it is not really being used in the code other than handling the allocation failure and deallocating it at the end of the function. Remove the unnecessary allocation and deallocation of the object. Link: https://lkml.kernel.org/r/20251101182021.74868-23-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 726f593930e7..96a4cd489b39 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -267,20 +267,14 @@ static void damon_test_merge_regions_of(struct kunit *test) static void damon_test_split_regions_of(struct kunit *test) { - struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; struct damon_region *r; - if (!c) - kunit_skip("ctx alloc fail"); t = damon_new_target(); - if (!t) { - damon_destroy_ctx(c); + if (!t) kunit_skip(test, "target alloc fail"); - } r = damon_new_region(0, 22); if (!r) { - damon_destroy_ctx(c); damon_free_target(t); kunit_skip(test, "region alloc fail"); } @@ -290,13 +284,10 @@ static void damon_test_split_regions_of(struct kunit *test) damon_free_target(t); t = damon_new_target(); - if (!t) { - damon_destroy_ctx(c); + if (!t) kunit_skip(test, "second target alloc fail"); - } r = damon_new_region(0, 220); if (!r) { - damon_destroy_ctx(c); damon_free_target(t); kunit_skip(test, "second region alloc fail"); } @@ -304,7 +295,6 @@ static void damon_test_split_regions_of(struct kunit *test) damon_split_regions_of(t, 4, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); - damon_destroy_ctx(c); } static void damon_test_ops_registration(struct kunit *test) From d94d9293a1ecde99514026950ded294dd5562e30 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 18 Sep 2025 11:46:53 +0800 Subject: [PATCH 171/321] mm: vmscan: remove folio_test_private() check in pageout() Patch 
series "some cleanups for pageout()", v2. Since we no longer attempt to write back filesystem folios in pageout(), and only tmpfs/shmem folios and anonymous swapcache folios can be written back, we can remove the redundant folio_test_private() related logic to simplify the logic of pageout(), as tmpfs/shmem and swapcache folios do not use the PG_private flag. This patch (of 2): The folio_test_private() check in pageout() was introduced by commit ce91b575332b ("orphaned pagecache memleak fix") in 2005 (checked from a history tree[1]). As the commit message mentioned, it was to address the issue where reiserfs pagecache may be truncated while still pinned. To further explain, the truncation removes the page->mapping, but the page is still listed in the VM queues because it still has buffers. In 2008, commit a2b345642f530 ("Fix dirty page accounting leak with ext3 data=journal") seems to be dealing with a similar issue, where the page becomes dirty after truncation, and it provides a very useful call stack: truncate_complete_page() cancel_dirty_page() // PG_dirty cleared, decr. dirty pages do_invalidatepage() ext3_invalidatepage() journal_invalidatepage() journal_unmap_buffer() __dispose_buffer() __journal_unfile_buffer() __journal_temp_unlink_buffer() mark_buffer_dirty(); // PG_dirty set, incr. dirty pages In this commit a2b345642f530, we forcefully clear the page's dirty flag during truncation (in truncate_complete_page()). Now it seems this was just a peculiar usage specific to reiserfs. Maybe reiserfs had some extra refcount on these pages, which caused them to pass the is_page_cache_freeable() check. With the fix provided by commit a2b345642f530 and reiserfs being removed in 2024 by commit fb6f20ecb121 ("reiserfs: The last commit"), such a case is unlikely to occur again. So let's remove the redundant folio_test_private() checks and related buffer_head release logic, and just leave a warning here to catch such a bug. [akpm@linux-foundation.org: redo comment, per David] Link: https://lkml.kernel.org/r/17d1b293-e393-4989-a357-7eea74b3c805@redhat.com [baolin.wang@linux.alibaba.com: remove comment and WARNing, per Hugh and others] Link: https://lkml.kernel.org/r/392a9ca3-31ac-4447-bd44-3c656d63e4ca@linux.alibaba.com Link: https://lkml.kernel.org/r/cover.1758166683.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/9ef0e560dc83650bc538eb5dcd1594e112c1369f.1758166683.git.baolin.wang@linux.alibaba.com Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git [1] Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Signed-off-by: Andrew Morton --- mm/vmscan.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c23c9616052a..0684da19aa64 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -697,23 +697,8 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. */ - if (!is_page_cache_freeable(folio)) + if (!is_page_cache_freeable(folio) || !mapping) return PAGE_KEEP; - if (!mapping) { - /* - * Some data journaling orphaned folios can have - * folio->mapping == NULL while being dirty with clean buffers. 
- */ - if (folio_test_private(folio)) { - if (try_to_free_buffers(folio)) { - folio_clear_dirty(folio); - pr_info("%s: orphaned folio\n", __func__); - return PAGE_CLEAN; - } - } - return PAGE_KEEP; - } - if (!shmem_mapping(mapping) && !folio_test_anon(folio)) return PAGE_ACTIVATE; if (!folio_clear_dirty_for_io(folio)) From 4f8961b29501f40a044bba56f61cc9b7e9bbdf94 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 18 Sep 2025 11:46:54 +0800 Subject: [PATCH 172/321] mm: vmscan: simplify the folio refcount check in pageout() Since we no longer attempt to write back filesystem folios in pageout() (they will be filtered out by the following check in pageout()), and only tmpfs/shmem folios and anonymous swapcache folios can be written back, we can remove the redundant folio_test_private() when checking the folio's refcount, as tmpfs/shmem and swapcache folios do not use the PG_private flag. While we're at it, we can open-code the folio refcount check instead of adding a simple helper that has only one user. Link: https://lkml.kernel.org/r/4cbbec5bb92397aa4597105f1f499aabf7a1901c.1758166683.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Signed-off-by: Andrew Morton --- mm/vmscan.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0684da19aa64..51ffd32e6e01 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -477,17 +477,6 @@ static int reclaimer_offset(struct scan_control *sc) return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; } -static inline int is_page_cache_freeable(struct folio *folio) -{ - /* - * A freeable page cache folio is referenced only by the caller - * that isolated the folio, the page cache and optional filesystem - * private data at folio->private. - */ - return folio_ref_count(folio) - folio_test_private(folio) == - 1 + folio_nr_pages(folio); -} - /* * We detected a synchronous write error writing a folio out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent @@ -696,8 +685,11 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. + * + * A freeable shmem or swapcache folio is referenced only by the + * caller that isolated the folio and the page cache. */ - if (!is_page_cache_freeable(folio) || !mapping) + if (folio_ref_count(folio) != 1 + folio_nr_pages(folio) || !mapping) return PAGE_KEEP; if (!shmem_mapping(mapping) && !folio_test_anon(folio)) return PAGE_ACTIVATE; From 3b12a53b64d0c86cf68cab772bd4137e451b17a5 Mon Sep 17 00:00:00 2001 From: Ankit Khushwaha Date: Sat, 8 Nov 2025 21:48:29 +0530 Subject: [PATCH 173/321] selftest/mm: fix pointer comparison in mremap_test Pointer arthemitic with 'void * addr' and 'ulong dest_alignment' triggers following warning: mremap_test.c:1035:31: warning: pointer comparison always evaluates to false [-Wtautological-compare] 1035 | if (addr + c.dest_alignment < addr) { | ^ this warning is raised from clang version 20.1.8 (Fedora 20.1.8-4.fc42). use 'void *tmp_addr' to do the pointer arthemitic. 
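Because pointer arithmetic that overflows is undefined behaviour, the compiler is allowed to assume that addr + c.dest_alignment is never below addr, which is what the warning points at; introducing tmp_addr keeps the existing structure while silencing it. An alternative sketch (not what this patch does) is to perform the wrap-around check in unsigned integer arithmetic, where overflow is well defined:

        unsigned long uaddr = (unsigned long)addr;

        /* Check for unsigned wrap-around of uaddr + c.dest_alignment */
        if (uaddr + c.dest_alignment < uaddr) {
                ksft_print_msg("Couldn't find a valid region to remap to\n");
                ret = -1;
                goto clean_up_src;
        }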
Link: https://lkml.kernel.org/r/20251108161829.25105-1-ankitkhushwaha.linux@gmail.com Signed-off-by: Ankit Khushwaha Acked-by: Mike Rapoport (Microsoft) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index bf2863b102e3..5f073504e0b1 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -994,7 +994,7 @@ static void mremap_move_multi_invalid_vmas(FILE *maps_fp, unsigned long page_siz static long long remap_region(struct config c, unsigned int threshold_mb, char *rand_addr) { - void *addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; + void *addr, *tmp_addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; @@ -1032,7 +1032,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb, /* Don't destroy existing mappings unless expected to overlap */ while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) { /* Check for unsigned overflow */ - if (addr + c.dest_alignment < addr) { + tmp_addr = addr + c.dest_alignment; + if (tmp_addr < addr) { ksft_print_msg("Couldn't find a valid region to remap to\n"); ret = -1; goto clean_up_src; From 340b59816bc417c306cd76b867914cfb4f386d2d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 4 Nov 2025 16:57:09 +0800 Subject: [PATCH 174/321] mm: kill mm_wr_locked from unmap_vmas() and unmap_single_vma() Kill mm_wr_locked since commit f8e97613fed2 ("mm: convert VM_PFNMAP tracking to pfnmap_track() + pfnmap_untrack()") remove the user. Link: https://lkml.kernel.org/r/20251104085709.2688433-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. 
Howlett Acked-by: David Hildenbrand (Red Hat) Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/memory.c | 12 ++++-------- mm/mmap.c | 2 +- mm/vma.c | 5 ++--- tools/testing/vma/vma_internal.h | 3 +-- 5 files changed, 9 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index b636d12bb651..df9f258a017c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2480,7 +2480,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) } void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end, unsigned long tree_end, bool mm_wr_locked); + unsigned long end, unsigned long tree_end); struct mmu_notifier_range; diff --git a/mm/memory.c b/mm/memory.c index 8d8c36adafa8..b09de6274da3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2023,8 +2023,7 @@ void unmap_page_range(struct mmu_gather *tlb, static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, - struct zap_details *details, bool mm_wr_locked) + unsigned long end_addr, struct zap_details *details) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; @@ -2070,7 +2069,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * @tree_end: The maximum index to check - * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. * @@ -2085,8 +2083,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, */ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end, - bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end) { struct mmu_notifier_range range; struct zap_details details = { @@ -2102,8 +2099,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unsigned long start = start_addr; unsigned long end = end_addr; hugetlb_zap_begin(vma, &start, &end); - unmap_single_vma(tlb, vma, start, end, &details, - mm_wr_locked); + unmap_single_vma(tlb, vma, start, end, &details); hugetlb_zap_end(vma, &details); vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); @@ -2139,7 +2135,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(tlb, vma, address, end, details, false); + unmap_single_vma(tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(&range); if (is_vm_hugetlb_page(vma)) { /* diff --git a/mm/mmap.c b/mm/mmap.c index 644f02071a41..4f51ca644903 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1274,7 +1274,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? 
but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false); + unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX); mmap_read_unlock(mm); /* diff --git a/mm/vma.c b/mm/vma.c index 919d1fc63a52..0c5e391fe2e2 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -483,8 +483,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, - /* mm_wr_locked = */ true); + unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end); mas_set(mas, vma->vm_end); free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING, @@ -1228,7 +1227,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, - vms->vma_count, mm_wr_locked); + vms->vma_count); mas_set(mas_detach, 1); /* start and end may be different if there is no prev or next vma. */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index d873667704e8..c68d382dac81 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -848,8 +848,7 @@ static inline void update_hiwater_vm(struct mm_struct *mm) static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end, - bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end) { } From 135e541ae8f3f166453177f1a94a0ff1f86ce30f Mon Sep 17 00:00:00 2001 From: Zeng Chi Date: Wed, 5 Nov 2025 10:39:25 +0800 Subject: [PATCH 175/321] lib/alloc_tag: use %pe format specifier The %pe format specifier is designed to print error pointers. It prints a symbolic error name (eg. -EINVAL) and it makes the code simpler by omitting PTR_ERR(); This patch fixes this cocci report: lib/alloc_tag.c:776:63-70: WARNING: Consider using %pe to print PTR_ERR() Link: https://lkml.kernel.org/r/20251105023925.1447482-1-zeng_chi911@163.com Signed-off-by: Zeng Chi Acked-by: SeongJae Park Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index f26456988445..27fee57a5c91 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -845,7 +845,7 @@ static int __init alloc_tag_init(void) alloc_tag_cttype = codetag_register_type(&desc); if (IS_ERR(alloc_tag_cttype)) { - pr_err("Allocation tags registration failed, errno = %ld\n", PTR_ERR(alloc_tag_cttype)); + pr_err("Allocation tags registration failed, errno = %pe\n", alloc_tag_cttype); free_mod_tags_mem(); shutdown_mem_profiling(true); return PTR_ERR(alloc_tag_cttype); From e24f66e87bfbcd15a95336c30c2f131332855ba6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 6 Nov 2025 20:14:48 +0000 Subject: [PATCH 176/321] hugetlb: optimise hugetlb_folio_init_tail_vmemmap() Extract the zone number directly from the folio instead of using the folio's zone number to look up the zone and asking the zone what its number is. 
Also we should use &folio->page instead of casting from folio to page Link: https://lkml.kernel.org/r/20251106201452.2292631-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Oscar Salvador Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1ea459723cce..ac5ce2b2b87d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3246,7 +3246,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, unsigned long start_page_number, unsigned long end_page_number) { - enum zone_type zone = zone_idx(folio_zone(folio)); + enum zone_type zone = folio_zonenum(folio); int nid = folio_nid(folio); struct page *page = folio_page(folio, start_page_number); unsigned long head_pfn = folio_pfn(folio); @@ -3279,7 +3279,7 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); - prep_compound_head((struct page *)folio, huge_page_order(h)); + prep_compound_head(&folio->page, huge_page_order(h)); } static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m) From c537f0dd30344434b6e7585768d3fa38190d2d0c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 6 Nov 2025 20:14:49 +0000 Subject: [PATCH 177/321] migrate: optimise alloc_migration_target() Extract the zone number directly from the folio instead of using the folio's zone number to look up the zone and asking the zone what its number is. [ziy@nvidia.com: fix folio_zonenum() return type] Link: https://lkml.kernel.org/r/26E8FF35-503E-4F14-98F7-7B4FA25FBD37@nvidia.com Link: https://lkml.kernel.org/r/20251106201452.2292631-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Zi Yan Cc: David Hildenbrand Cc: Zi Yan Cc: Matthew Brost Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Signed-off-by: Andrew Morton --- mm/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 7c5d2efb9d47..08d034dbeb98 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2164,7 +2164,7 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) gfp_t gfp_mask; unsigned int order = 0; int nid; - int zidx; + enum zone_type zidx; mtc = (struct migration_target_control *)private; gfp_mask = mtc->gfp_mask; @@ -2190,7 +2190,7 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) gfp_mask |= GFP_TRANSHUGE; order = folio_order(src); } - zidx = zone_idx(folio_zone(src)); + zidx = folio_zonenum(src); if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) gfp_mask |= __GFP_HIGHMEM; From 20605eb5bbea8184d2bb356d7e1419c8ec359efb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 6 Nov 2025 20:14:50 +0000 Subject: [PATCH 178/321] memory_hotplug: optimise try_offline_memory_block() Extract the zone number directly from the page instead of using the page's zone number to look up the zone and asking the zone what its number is. 
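For readers less familiar with these helpers: page_zone() is itself reconstructed from the zone index stored in page->flags, so converting the zone pointer back to an index with zone_idx() is a round trip. A sketch of the relationship, assuming the current mainline definitions of these helpers:

        /* Illustrative only: both expressions yield the same zone index. */
        enum zone_type via_zone = zone_idx(page_zone(page));    /* flags -> zone pointer -> index */
        enum zone_type direct = page_zonenum(page);             /* index read straight from page->flags */

        VM_BUG_ON(via_zone != direct);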
Link: https://lkml.kernel.org/r/20251106201452.2292631-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 94a8f6e8811a..63b9d500ec6c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -2327,7 +2327,7 @@ static int try_offline_memory_block(struct memory_block *mem, void *arg) * by offlining code ... so we don't care about that. */ page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); - if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) + if (page && page_zonenum(page) == ZONE_MOVABLE) online_type = MMOP_ONLINE_MOVABLE; rc = device_offline(&mem->dev); From 98be155451eb2aa4b0413b85c3f95e239de3636f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 6 Nov 2025 20:35:25 +0000 Subject: [PATCH 179/321] mm: constify __dump_folio() arguments These arguments aren't modified by the function; mark them as const to help the compiler. Link: https://lkml.kernel.org/r/20251106203526.2368275-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/debug.c b/mm/debug.c index d860864063be..77fa8fe1d641 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -67,7 +67,7 @@ static const char *page_type_name(unsigned int page_type) return page_type_names[i]; } -static void __dump_folio(struct folio *folio, struct page *page, +static void __dump_folio(const struct folio *folio, const struct page *page, unsigned long pfn, unsigned long idx) { struct address_space *mapping = folio_mapping(folio); From ecd6703f64d76ee4fc8cc2205bfb892d3bb9f538 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Thu, 6 Nov 2025 11:08:22 +0800 Subject: [PATCH 180/321] mm/hugetlb: extract sysfs into hugetlb_sysfs.c Patch series "mm/hugetlb: refactor sysfs/sysctl interfaces", v5. hugetlb.c has grown significantly and become difficult to maintain. This patch series extracts the sysfs and sysctl interface code into separate dedicated files to improve code organization. The refactoring includes: - Patch 1: Extract sysfs interface into mm/hugetlb_sysfs.c - Patch 2: Extract sysctl interface into mm/hugetlb_sysctl.c No functional changes are introduced in this series. The code is moved as-is, with only minor formatting adjustments for code style consistency. This should make future maintenance and enhancements to the hugetlb subsystem easier. Testing: The patch series has been compile-tested and maintains the same functionality as the original code. This patch (of 2): Currently, hugetlb.c contains both core management logic and sysfs interface implementations, making it difficult to maintain. This patch extracts the sysfs-related code into a dedicated file to improve code organization. 
The following components are moved to mm/hugetlb_sysfs.c: - sysfs attribute definitions and handlers - sysfs kobject management functions - NUMA per-node hstate attribute registration Several inline helper functions and macros are moved to mm/hugetlb_internal.h: - hstate_is_gigantic_no_runtime() - next_node_allowed() - get_valid_node_allowed() - hstate_next_node_to_alloc() - hstate_next_node_to_free() - for_each_node_mask_to_alloc/to_free macros To support code sharing, these functions are changed from static to exported symbols: - remove_hugetlb_folio() - add_hugetlb_folio() - init_new_hugetlb_folio() - prep_and_add_allocated_folios() - demote_pool_huge_page() - __nr_hugepages_store_common() The Makefile is updated to compile hugetlb_sysfs.o when CONFIG_HUGETLBFS is enabled. This maintains all existing functionality while improving maintainability by separating concerns. MAINTAINERS is updated to add new file hugetlb_sysfs.c. Link: https://lkml.kernel.org/r/cover.1762398359.git.zhuhui@kylinos.cn Link: https://lkml.kernel.org/r/656a03dff7e2bb20e24e841ede81fdca01d21410.1762398359.git.zhuhui@kylinos.cn Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu Cc: David Hildenbrand Cc: Muchun Song Cc: Oscar Salvador Cc: SeongJae Park Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + mm/Makefile | 2 +- mm/hugetlb.c | 596 +----------------------------------------- mm/hugetlb_internal.h | 111 ++++++++ mm/hugetlb_sysfs.c | 502 +++++++++++++++++++++++++++++++++++ 5 files changed, 626 insertions(+), 586 deletions(-) create mode 100644 mm/hugetlb_internal.h create mode 100644 mm/hugetlb_sysfs.c diff --git a/MAINTAINERS b/MAINTAINERS index 5cf6873569d3..72870562746b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11540,6 +11540,7 @@ F: mm/hugetlb.c F: mm/hugetlb_cgroup.c F: mm/hugetlb_cma.c F: mm/hugetlb_cma.h +F: mm/hugetlb_sysfs.c F: mm/hugetlb_vmemmap.c F: mm/hugetlb_vmemmap.h F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c diff --git a/mm/Makefile b/mm/Makefile index 21abb3353550..b9edfce6c202 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -78,7 +78,7 @@ endif obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o -obj-$(CONFIG_HUGETLBFS) += hugetlb.o +obj-$(CONFIG_HUGETLBFS) += hugetlb.o hugetlb_sysfs.o ifdef CONFIG_CMA obj-$(CONFIG_HUGETLBFS) += hugetlb_cma.o endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ac5ce2b2b87d..26b2a319b002 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -46,13 +45,12 @@ #include #include -#include -#include #include #include #include "internal.h" #include "hugetlb_vmemmap.h" #include "hugetlb_cma.h" +#include "hugetlb_internal.h" #include int hugetlb_max_hstate __read_mostly; @@ -134,17 +132,6 @@ static void hugetlb_free_folio(struct folio *folio) folio_put(folio); } -/* - * Check if the hstate represents gigantic pages but gigantic page - * runtime support is not available. This is a common condition used to - * skip operations that cannot be performed on gigantic pages when runtime - * support is disabled. - */ -static inline bool hstate_is_gigantic_no_runtime(struct hstate *h) -{ - return hstate_is_gigantic(h) && !gigantic_page_runtime_supported(); -} - static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) @@ -1431,77 +1418,6 @@ err: return NULL; } -/* - * common helper functions for hstate_next_node_to_{alloc|free}. 
- * We may have allocated or freed a huge page based on a different - * nodes_allowed previously, so h->next_node_to_{alloc|free} might - * be outside of *nodes_allowed. Ensure that we use an allowed - * node for alloc or free. - */ -static int next_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - nid = next_node_in(nid, *nodes_allowed); - VM_BUG_ON(nid >= MAX_NUMNODES); - - return nid; -} - -static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - if (!node_isset(nid, *nodes_allowed)) - nid = next_node_allowed(nid, nodes_allowed); - return nid; -} - -/* - * returns the previously saved node ["this node"] from which to - * allocate a persistent huge page for the pool and advance the - * next node from which to allocate, handling wrap at end of node - * mask. - */ -static int hstate_next_node_to_alloc(int *next_node, - nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(*next_node, nodes_allowed); - *next_node = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -/* - * helper for remove_pool_hugetlb_folio() - return the previously saved - * node ["this node"] from which to free a huge page. Advance the - * next node id whether or not we find a free huge page to free so - * that the next attempt to free addresses the next node. - */ -static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); - h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \ - nr_nodes--) - -#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_free(hs, mask)) || 1); \ - nr_nodes--) - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #ifdef CONFIG_CONTIG_ALLOC static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, @@ -1557,8 +1473,8 @@ static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, * * Must be called with hugetlb lock held. 
*/ -static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, - bool adjust_surplus) +void remove_hugetlb_folio(struct hstate *h, struct folio *folio, + bool adjust_surplus) { int nid = folio_nid(folio); @@ -1593,8 +1509,8 @@ static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, h->nr_huge_pages_node[nid]--; } -static void add_hugetlb_folio(struct hstate *h, struct folio *folio, - bool adjust_surplus) +void add_hugetlb_folio(struct hstate *h, struct folio *folio, + bool adjust_surplus) { int nid = folio_nid(folio); @@ -1925,7 +1841,7 @@ static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio) h->nr_huge_pages_node[folio_nid(folio)]++; } -static void init_new_hugetlb_folio(struct folio *folio) +void init_new_hugetlb_folio(struct folio *folio) { __folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); @@ -2037,8 +1953,8 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, return folio; } -static void prep_and_add_allocated_folios(struct hstate *h, - struct list_head *folio_list) +void prep_and_add_allocated_folios(struct hstate *h, + struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; @@ -4093,8 +4009,8 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, return rc; } -static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, - unsigned long nr_to_demote) +long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, + unsigned long nr_to_demote) __must_hold(&hugetlb_lock) { int nr_nodes, node; @@ -4162,51 +4078,7 @@ static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, return -EBUSY; } -#define HSTATE_ATTR_RO(_name) \ - static struct kobj_attribute _name##_attr = __ATTR_RO(_name) - -#define HSTATE_ATTR_WO(_name) \ - static struct kobj_attribute _name##_attr = __ATTR_WO(_name) - -#define HSTATE_ATTR(_name) \ - static struct kobj_attribute _name##_attr = __ATTR_RW(_name) - -static struct kobject *hugepages_kobj; -static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; - -static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); - -static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) -{ - int i; - - for (i = 0; i < HUGE_MAX_HSTATE; i++) - if (hstate_kobjs[i] == kobj) { - if (nidp) - *nidp = NUMA_NO_NODE; - return &hstates[i]; - } - - return kobj_to_node_hstate(kobj, nidp); -} - -static ssize_t nr_hugepages_show_common(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h; - unsigned long nr_huge_pages; - int nid; - - h = kobj_to_hstate(kobj, &nid); - if (nid == NUMA_NO_NODE) - nr_huge_pages = h->nr_huge_pages; - else - nr_huge_pages = h->nr_huge_pages_node[nid]; - - return sysfs_emit(buf, "%lu\n", nr_huge_pages); -} - -static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, +ssize_t __nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h, int nid, unsigned long count, size_t len) { @@ -4239,452 +4111,6 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, return err ? 
err : len; } -static ssize_t nr_hugepages_store_common(bool obey_mempolicy, - struct kobject *kobj, const char *buf, - size_t len) -{ - struct hstate *h; - unsigned long count; - int nid; - int err; - - err = kstrtoul(buf, 10, &count); - if (err) - return err; - - h = kobj_to_hstate(kobj, &nid); - return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); -} - -static ssize_t nr_hugepages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return nr_hugepages_show_common(kobj, attr, buf); -} - -static ssize_t nr_hugepages_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t len) -{ - return nr_hugepages_store_common(false, kobj, buf, len); -} -HSTATE_ATTR(nr_hugepages); - -#ifdef CONFIG_NUMA - -/* - * hstate attribute for optionally mempolicy-based constraint on persistent - * huge page alloc/free. - */ -static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return nr_hugepages_show_common(kobj, attr, buf); -} - -static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t len) -{ - return nr_hugepages_store_common(true, kobj, buf, len); -} -HSTATE_ATTR(nr_hugepages_mempolicy); -#endif - - -static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h = kobj_to_hstate(kobj, NULL); - return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); -} - -static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - int err; - unsigned long input; - struct hstate *h = kobj_to_hstate(kobj, NULL); - - if (hstate_is_gigantic_no_runtime(h)) - return -EINVAL; - - err = kstrtoul(buf, 10, &input); - if (err) - return err; - - spin_lock_irq(&hugetlb_lock); - h->nr_overcommit_huge_pages = input; - spin_unlock_irq(&hugetlb_lock); - - return count; -} -HSTATE_ATTR(nr_overcommit_hugepages); - -static ssize_t free_hugepages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h; - unsigned long free_huge_pages; - int nid; - - h = kobj_to_hstate(kobj, &nid); - if (nid == NUMA_NO_NODE) - free_huge_pages = h->free_huge_pages; - else - free_huge_pages = h->free_huge_pages_node[nid]; - - return sysfs_emit(buf, "%lu\n", free_huge_pages); -} -HSTATE_ATTR_RO(free_hugepages); - -static ssize_t resv_hugepages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h = kobj_to_hstate(kobj, NULL); - return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); -} -HSTATE_ATTR_RO(resv_hugepages); - -static ssize_t surplus_hugepages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h; - unsigned long surplus_huge_pages; - int nid; - - h = kobj_to_hstate(kobj, &nid); - if (nid == NUMA_NO_NODE) - surplus_huge_pages = h->surplus_huge_pages; - else - surplus_huge_pages = h->surplus_huge_pages_node[nid]; - - return sysfs_emit(buf, "%lu\n", surplus_huge_pages); -} -HSTATE_ATTR_RO(surplus_hugepages); - -static ssize_t demote_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t len) -{ - unsigned long nr_demote; - unsigned long nr_available; - nodemask_t nodes_allowed, *n_mask; - struct hstate *h; - int err; - int nid; - - err = kstrtoul(buf, 10, &nr_demote); - if (err) - return err; - h = kobj_to_hstate(kobj, &nid); - - if (nid != NUMA_NO_NODE) { - 
init_nodemask_of_node(&nodes_allowed, nid); - n_mask = &nodes_allowed; - } else { - n_mask = &node_states[N_MEMORY]; - } - - /* Synchronize with other sysfs operations modifying huge pages */ - mutex_lock(&h->resize_lock); - spin_lock_irq(&hugetlb_lock); - - while (nr_demote) { - long rc; - - /* - * Check for available pages to demote each time thorough the - * loop as demote_pool_huge_page will drop hugetlb_lock. - */ - if (nid != NUMA_NO_NODE) - nr_available = h->free_huge_pages_node[nid]; - else - nr_available = h->free_huge_pages; - nr_available -= h->resv_huge_pages; - if (!nr_available) - break; - - rc = demote_pool_huge_page(h, n_mask, nr_demote); - if (rc < 0) { - err = rc; - break; - } - - nr_demote -= rc; - } - - spin_unlock_irq(&hugetlb_lock); - mutex_unlock(&h->resize_lock); - - if (err) - return err; - return len; -} -HSTATE_ATTR_WO(demote); - -static ssize_t demote_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h = kobj_to_hstate(kobj, NULL); - unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; - - return sysfs_emit(buf, "%lukB\n", demote_size); -} - -static ssize_t demote_size_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct hstate *h, *demote_hstate; - unsigned long demote_size; - unsigned int demote_order; - - demote_size = (unsigned long)memparse(buf, NULL); - - demote_hstate = size_to_hstate(demote_size); - if (!demote_hstate) - return -EINVAL; - demote_order = demote_hstate->order; - if (demote_order < HUGETLB_PAGE_ORDER) - return -EINVAL; - - /* demote order must be smaller than hstate order */ - h = kobj_to_hstate(kobj, NULL); - if (demote_order >= h->order) - return -EINVAL; - - /* resize_lock synchronizes access to demote size and writes */ - mutex_lock(&h->resize_lock); - h->demote_order = demote_order; - mutex_unlock(&h->resize_lock); - - return count; -} -HSTATE_ATTR(demote_size); - -static struct attribute *hstate_attrs[] = { - &nr_hugepages_attr.attr, - &nr_overcommit_hugepages_attr.attr, - &free_hugepages_attr.attr, - &resv_hugepages_attr.attr, - &surplus_hugepages_attr.attr, -#ifdef CONFIG_NUMA - &nr_hugepages_mempolicy_attr.attr, -#endif - NULL, -}; - -static const struct attribute_group hstate_attr_group = { - .attrs = hstate_attrs, -}; - -static struct attribute *hstate_demote_attrs[] = { - &demote_size_attr.attr, - &demote_attr.attr, - NULL, -}; - -static const struct attribute_group hstate_demote_attr_group = { - .attrs = hstate_demote_attrs, -}; - -static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, - struct kobject **hstate_kobjs, - const struct attribute_group *hstate_attr_group) -{ - int retval; - int hi = hstate_index(h); - - hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); - if (!hstate_kobjs[hi]) - return -ENOMEM; - - retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); - if (retval) { - kobject_put(hstate_kobjs[hi]); - hstate_kobjs[hi] = NULL; - return retval; - } - - if (h->demote_order) { - retval = sysfs_create_group(hstate_kobjs[hi], - &hstate_demote_attr_group); - if (retval) { - pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); - sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); - kobject_put(hstate_kobjs[hi]); - hstate_kobjs[hi] = NULL; - return retval; - } - } - - return 0; -} - -#ifdef CONFIG_NUMA -static bool hugetlb_sysfs_initialized __ro_after_init; - -/* - * node_hstate/s - associate per node hstate attributes, via their kobjects, - * 
with node devices in node_devices[] using a parallel array. The array - * index of a node device or _hstate == node id. - * This is here to avoid any static dependency of the node device driver, in - * the base kernel, on the hugetlb module. - */ -struct node_hstate { - struct kobject *hugepages_kobj; - struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; -}; -static struct node_hstate node_hstates[MAX_NUMNODES]; - -/* - * A subset of global hstate attributes for node devices - */ -static struct attribute *per_node_hstate_attrs[] = { - &nr_hugepages_attr.attr, - &free_hugepages_attr.attr, - &surplus_hugepages_attr.attr, - NULL, -}; - -static const struct attribute_group per_node_hstate_attr_group = { - .attrs = per_node_hstate_attrs, -}; - -/* - * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. - * Returns node id via non-NULL nidp. - */ -static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) -{ - int nid; - - for (nid = 0; nid < nr_node_ids; nid++) { - struct node_hstate *nhs = &node_hstates[nid]; - int i; - for (i = 0; i < HUGE_MAX_HSTATE; i++) - if (nhs->hstate_kobjs[i] == kobj) { - if (nidp) - *nidp = nid; - return &hstates[i]; - } - } - - BUG(); - return NULL; -} - -/* - * Unregister hstate attributes from a single node device. - * No-op if no hstate attributes attached. - */ -void hugetlb_unregister_node(struct node *node) -{ - struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->dev.id]; - - if (!nhs->hugepages_kobj) - return; /* no hstate attributes */ - - for_each_hstate(h) { - int idx = hstate_index(h); - struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; - - if (!hstate_kobj) - continue; - if (h->demote_order) - sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); - sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); - kobject_put(hstate_kobj); - nhs->hstate_kobjs[idx] = NULL; - } - - kobject_put(nhs->hugepages_kobj); - nhs->hugepages_kobj = NULL; -} - - -/* - * Register hstate attributes for a single node device. - * No-op if attributes already registered. - */ -void hugetlb_register_node(struct node *node) -{ - struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->dev.id]; - int err; - - if (!hugetlb_sysfs_initialized) - return; - - if (nhs->hugepages_kobj) - return; /* already allocated */ - - nhs->hugepages_kobj = kobject_create_and_add("hugepages", - &node->dev.kobj); - if (!nhs->hugepages_kobj) - return; - - for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, - nhs->hstate_kobjs, - &per_node_hstate_attr_group); - if (err) { - pr_err("HugeTLB: Unable to add hstate %s for node %d\n", - h->name, node->dev.id); - hugetlb_unregister_node(node); - break; - } - } -} - -/* - * hugetlb init time: register hstate attributes for all registered node - * devices of nodes that have memory. All on-line nodes should have - * registered their associated device by this time. 
- */ -static void __init hugetlb_register_all_nodes(void) -{ - int nid; - - for_each_online_node(nid) - hugetlb_register_node(node_devices[nid]); -} -#else /* !CONFIG_NUMA */ - -static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) -{ - BUG(); - if (nidp) - *nidp = -1; - return NULL; -} - -static void hugetlb_register_all_nodes(void) { } - -#endif - -static void __init hugetlb_sysfs_init(void) -{ - struct hstate *h; - int err; - - hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); - if (!hugepages_kobj) - return; - - for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, - hstate_kobjs, &hstate_attr_group); - if (err) - pr_err("HugeTLB: Unable to add hstate %s\n", h->name); - } - -#ifdef CONFIG_NUMA - hugetlb_sysfs_initialized = true; -#endif - hugetlb_register_all_nodes(); -} - #ifdef CONFIG_SYSCTL static void hugetlb_sysctl_init(void); #else diff --git a/mm/hugetlb_internal.h b/mm/hugetlb_internal.h new file mode 100644 index 000000000000..5ea372500de7 --- /dev/null +++ b/mm/hugetlb_internal.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Internal HugeTLB definitions. + * (C) Nadia Yvette Chambers, April 2004 + */ + +#ifndef _LINUX_HUGETLB_INTERNAL_H +#define _LINUX_HUGETLB_INTERNAL_H + +#include +#include + +/* + * Check if the hstate represents gigantic pages but gigantic page + * runtime support is not available. This is a common condition used to + * skip operations that cannot be performed on gigantic pages when runtime + * support is disabled. + */ +static inline bool hstate_is_gigantic_no_runtime(struct hstate *h) +{ + return hstate_is_gigantic(h) && !gigantic_page_runtime_supported(); +} + +/* + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. + */ +static inline int next_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + nid = next_node_in(nid, *nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static inline int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. + */ +static inline int hstate_next_node_to_alloc(int *next_node, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(*next_node, nodes_allowed); + *next_node = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +/* + * helper for remove_pool_hugetlb_folio() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. 
+ */ +static inline int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \ + nr_nodes--) + +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_free(hs, mask)) || 1); \ + nr_nodes--) + +extern void remove_hugetlb_folio(struct hstate *h, struct folio *folio, + bool adjust_surplus); +extern void add_hugetlb_folio(struct hstate *h, struct folio *folio, + bool adjust_surplus); +extern void init_new_hugetlb_folio(struct folio *folio); +extern void prep_and_add_allocated_folios(struct hstate *h, + struct list_head *folio_list); +extern long demote_pool_huge_page(struct hstate *src, + nodemask_t *nodes_allowed, + unsigned long nr_to_demote); +extern ssize_t __nr_hugepages_store_common(bool obey_mempolicy, + struct hstate *h, int nid, + unsigned long count, size_t len); + +extern void hugetlb_sysfs_init(void) __init; + +#endif /* _LINUX_HUGETLB_INTERNAL_H */ diff --git a/mm/hugetlb_sysfs.c b/mm/hugetlb_sysfs.c new file mode 100644 index 000000000000..79ece91406bf --- /dev/null +++ b/mm/hugetlb_sysfs.c @@ -0,0 +1,502 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HugeTLB sysfs interfaces. + * (C) Nadia Yvette Chambers, April 2004 + */ + +#include +#include +#include + +#include "hugetlb_vmemmap.h" +#include "hugetlb_internal.h" + +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR_WO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_WO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RW(_name) + +static struct kobject *hugepages_kobj; +static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; + +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); + +static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) +{ + int i; + + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = NUMA_NO_NODE; + return &hstates[i]; + } + + return kobj_to_node_hstate(kobj, nidp); +} + +static ssize_t nr_hugepages_show_common(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long nr_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + nr_huge_pages = h->nr_huge_pages; + else + nr_huge_pages = h->nr_huge_pages_node[nid]; + + return sysfs_emit(buf, "%lu\n", nr_huge_pages); +} + +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, const char *buf, + size_t len) +{ + struct hstate *h; + unsigned long count; + int nid; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + + h = kobj_to_hstate(kobj, &nid); + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return 
nr_hugepages_store_common(false, kobj, buf, len); +} +HSTATE_ATTR(nr_hugepages); + +#ifdef CONFIG_NUMA + +/* + * hstate attribute for optionally mempolicy-based constraint on persistent + * huge page alloc/free. + */ +static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(true, kobj, buf, len); +} +HSTATE_ATTR(nr_hugepages_mempolicy); +#endif + + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj, NULL); + return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); +} + +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj, NULL); + + if (hstate_is_gigantic_no_runtime(h)) + return -EINVAL; + + err = kstrtoul(buf, 10, &input); + if (err) + return err; + + spin_lock_irq(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + spin_unlock_irq(&hugetlb_lock); + + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long free_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + free_huge_pages = h->free_huge_pages; + else + free_huge_pages = h->free_huge_pages_node[nid]; + + return sysfs_emit(buf, "%lu\n", free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj, NULL); + return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long surplus_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + surplus_huge_pages = h->surplus_huge_pages; + else + surplus_huge_pages = h->surplus_huge_pages_node[nid]; + + return sysfs_emit(buf, "%lu\n", surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static ssize_t demote_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + unsigned long nr_demote; + unsigned long nr_available; + nodemask_t nodes_allowed, *n_mask; + struct hstate *h; + int err; + int nid; + + err = kstrtoul(buf, 10, &nr_demote); + if (err) + return err; + h = kobj_to_hstate(kobj, &nid); + + if (nid != NUMA_NO_NODE) { + init_nodemask_of_node(&nodes_allowed, nid); + n_mask = &nodes_allowed; + } else { + n_mask = &node_states[N_MEMORY]; + } + + /* Synchronize with other sysfs operations modifying huge pages */ + mutex_lock(&h->resize_lock); + spin_lock_irq(&hugetlb_lock); + + while (nr_demote) { + long rc; + + /* + * Check for available pages to demote each time thorough the + * loop as demote_pool_huge_page will drop hugetlb_lock. 
+ */ + if (nid != NUMA_NO_NODE) + nr_available = h->free_huge_pages_node[nid]; + else + nr_available = h->free_huge_pages; + nr_available -= h->resv_huge_pages; + if (!nr_available) + break; + + rc = demote_pool_huge_page(h, n_mask, nr_demote); + if (rc < 0) { + err = rc; + break; + } + + nr_demote -= rc; + } + + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); + + if (err) + return err; + return len; +} +HSTATE_ATTR_WO(demote); + +static ssize_t demote_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj, NULL); + unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; + + return sysfs_emit(buf, "%lukB\n", demote_size); +} + +static ssize_t demote_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct hstate *h, *demote_hstate; + unsigned long demote_size; + unsigned int demote_order; + + demote_size = (unsigned long)memparse(buf, NULL); + + demote_hstate = size_to_hstate(demote_size); + if (!demote_hstate) + return -EINVAL; + demote_order = demote_hstate->order; + if (demote_order < HUGETLB_PAGE_ORDER) + return -EINVAL; + + /* demote order must be smaller than hstate order */ + h = kobj_to_hstate(kobj, NULL); + if (demote_order >= h->order) + return -EINVAL; + + /* resize_lock synchronizes access to demote size and writes */ + mutex_lock(&h->resize_lock); + h->demote_order = demote_order; + mutex_unlock(&h->resize_lock); + + return count; +} +HSTATE_ATTR(demote_size); + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, +#ifdef CONFIG_NUMA + &nr_hugepages_mempolicy_attr.attr, +#endif + NULL, +}; + +static const struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static struct attribute *hstate_demote_attrs[] = { + &demote_size_attr.attr, + &demote_attr.attr, + NULL, +}; + +static const struct attribute_group hstate_demote_attr_group = { + .attrs = hstate_demote_attrs, +}; + +static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, + struct kobject **hstate_kobjs, + const struct attribute_group *hstate_attr_group) +{ + int retval; + int hi = hstate_index(h); + + hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); + if (!hstate_kobjs[hi]) + return -ENOMEM; + + retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); + if (retval) { + kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + return retval; + } + + if (h->demote_order) { + retval = sysfs_create_group(hstate_kobjs[hi], + &hstate_demote_attr_group); + if (retval) { + pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); + sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); + kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + return retval; + } + } + + return 0; +} + +#ifdef CONFIG_NUMA +static bool hugetlb_sysfs_initialized __ro_after_init; + +/* + * node_hstate/s - associate per node hstate attributes, via their kobjects, + * with node devices in node_devices[] using a parallel array. The array + * index of a node device or _hstate == node id. + * This is here to avoid any static dependency of the node device driver, in + * the base kernel, on the hugetlb module. 
+ */ +struct node_hstate { + struct kobject *hugepages_kobj; + struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; +}; +static struct node_hstate node_hstates[MAX_NUMNODES]; + +/* + * A subset of global hstate attributes for node devices + */ +static struct attribute *per_node_hstate_attrs[] = { + &nr_hugepages_attr.attr, + &free_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static const struct attribute_group per_node_hstate_attr_group = { + .attrs = per_node_hstate_attrs, +}; + +/* + * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. + * Returns node id via non-NULL nidp. + */ +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) { + struct node_hstate *nhs = &node_hstates[nid]; + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (nhs->hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = nid; + return &hstates[i]; + } + } + + BUG(); + return NULL; +} + +/* + * Unregister hstate attributes from a single node device. + * No-op if no hstate attributes attached. + */ +void hugetlb_unregister_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->dev.id]; + + if (!nhs->hugepages_kobj) + return; /* no hstate attributes */ + + for_each_hstate(h) { + int idx = hstate_index(h); + struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; + + if (!hstate_kobj) + continue; + if (h->demote_order) + sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); + sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); + kobject_put(hstate_kobj); + nhs->hstate_kobjs[idx] = NULL; + } + + kobject_put(nhs->hugepages_kobj); + nhs->hugepages_kobj = NULL; +} + + +/* + * Register hstate attributes for a single node device. + * No-op if attributes already registered. + */ +void hugetlb_register_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->dev.id]; + int err; + + if (!hugetlb_sysfs_initialized) + return; + + if (nhs->hugepages_kobj) + return; /* already allocated */ + + nhs->hugepages_kobj = kobject_create_and_add("hugepages", + &node->dev.kobj); + if (!nhs->hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, + nhs->hstate_kobjs, + &per_node_hstate_attr_group); + if (err) { + pr_err("HugeTLB: Unable to add hstate %s for node %d\n", + h->name, node->dev.id); + hugetlb_unregister_node(node); + break; + } + } +} + +/* + * hugetlb init time: register hstate attributes for all registered node + * devices of nodes that have memory. All on-line nodes should have + * registered their associated device by this time. 
+ */ +static void __init hugetlb_register_all_nodes(void) +{ + int nid; + + for_each_online_node(nid) + hugetlb_register_node(node_devices[nid]); +} +#else /* !CONFIG_NUMA */ + +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + BUG(); + if (nidp) + *nidp = -1; + return NULL; +} + +static void hugetlb_register_all_nodes(void) { } + +#endif + +void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); + if (err) + pr_err("HugeTLB: Unable to add hstate %s\n", h->name); + } + +#ifdef CONFIG_NUMA + hugetlb_sysfs_initialized = true; +#endif + hugetlb_register_all_nodes(); +} From cdcb53e1deef9bd6ba782645b7297863061c0b4c Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Thu, 6 Nov 2025 11:08:23 +0800 Subject: [PATCH 181/321] mm/hugetlb: extract sysctl into hugetlb_sysctl.c Following the extraction of sysfs code, this patch moves the sysctl interface implementation into a dedicated file to further improve code organization and maintainability of the hugetlb subsystem. The following components are moved to mm/hugetlb_sysctl.c: - proc_hugetlb_doulongvec_minmax() - hugetlb_sysctl_handler_common() - hugetlb_sysctl_handler() - hugetlb_mempolicy_sysctl_handler() (CONFIG_NUMA) - hugetlb_overcommit_handler() - hugetlb_table[] sysctl table definition - hugetlb_sysctl_init() The hugetlb_internal.h header file is updated to declare the sysctl initialization function with proper #ifdef guards for configurations without CONFIG_SYSCTL support. The Makefile is updated to compile hugetlb_sysctl.o when CONFIG_HUGETLBFS is enabled. This refactoring reduces the size of hugetlb.c and logically separates the sysctl interface from core hugetlb management code. MAINTAINERS is updated to add new file hugetlb_sysctl.c. No functional changes are introduced; all code is moved as-is from hugetlb.c with consistent formatting. 
Link: https://lkml.kernel.org/r/5bbee7ab5be71d0bb1aebec38642d7e83526bb7a.1762398359.git.zhuhui@kylinos.cn Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu Cc: David Hildenbrand Cc: Muchun Song Cc: Oscar Salvador Cc: SeongJae Park Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + mm/Makefile | 2 +- mm/hugetlb.c | 132 ----------------------------------------- mm/hugetlb_internal.h | 6 ++ mm/hugetlb_sysctl.c | 134 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 142 insertions(+), 133 deletions(-) create mode 100644 mm/hugetlb_sysctl.c diff --git a/MAINTAINERS b/MAINTAINERS index 72870562746b..2ee9963b985b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11540,6 +11540,7 @@ F: mm/hugetlb.c F: mm/hugetlb_cgroup.c F: mm/hugetlb_cma.c F: mm/hugetlb_cma.h +F: mm/hugetlb_sysctl.c F: mm/hugetlb_sysfs.c F: mm/hugetlb_vmemmap.c F: mm/hugetlb_vmemmap.h diff --git a/mm/Makefile b/mm/Makefile index b9edfce6c202..00ceb2418b64 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -78,7 +78,7 @@ endif obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o -obj-$(CONFIG_HUGETLBFS) += hugetlb.o hugetlb_sysfs.o +obj-$(CONFIG_HUGETLBFS) += hugetlb.o hugetlb_sysfs.o hugetlb_sysctl.o ifdef CONFIG_CMA obj-$(CONFIG_HUGETLBFS) += hugetlb_cma.o endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 26b2a319b002..106e61f6e12c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -4111,12 +4110,6 @@ ssize_t __nr_hugepages_store_common(bool obey_mempolicy, return err ? err : len; } -#ifdef CONFIG_SYSCTL -static void hugetlb_sysctl_init(void); -#else -static inline void hugetlb_sysctl_init(void) { } -#endif - static int __init hugetlb_init(void) { int i; @@ -4549,131 +4542,6 @@ static unsigned int allowed_mems_nr(struct hstate *h) return nr; } -#ifdef CONFIG_SYSCTL -static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *length, - loff_t *ppos, unsigned long *out) -{ - struct ctl_table dup_table; - - /* - * In order to avoid races with __do_proc_doulongvec_minmax(), we - * can duplicate the @table and alter the duplicate of it. 
- */ - dup_table = *table; - dup_table.data = out; - - return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); -} - -static int hugetlb_sysctl_handler_common(bool obey_mempolicy, - const struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - struct hstate *h = &default_hstate; - unsigned long tmp = h->max_huge_pages; - int ret; - - if (!hugepages_supported()) - return -EOPNOTSUPP; - - ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, - &tmp); - if (ret) - goto out; - - if (write) - ret = __nr_hugepages_store_common(obey_mempolicy, h, - NUMA_NO_NODE, tmp, *length); -out: - return ret; -} - -static int hugetlb_sysctl_handler(const struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - - return hugetlb_sysctl_handler_common(false, table, write, - buffer, length, ppos); -} - -#ifdef CONFIG_NUMA -static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - return hugetlb_sysctl_handler_common(true, table, write, - buffer, length, ppos); -} -#endif /* CONFIG_NUMA */ - -static int hugetlb_overcommit_handler(const struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - struct hstate *h = &default_hstate; - unsigned long tmp; - int ret; - - if (!hugepages_supported()) - return -EOPNOTSUPP; - - tmp = h->nr_overcommit_huge_pages; - - if (write && hstate_is_gigantic_no_runtime(h)) - return -EINVAL; - - ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, - &tmp); - if (ret) - goto out; - - if (write) { - spin_lock_irq(&hugetlb_lock); - h->nr_overcommit_huge_pages = tmp; - spin_unlock_irq(&hugetlb_lock); - } -out: - return ret; -} - -static const struct ctl_table hugetlb_table[] = { - { - .procname = "nr_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_sysctl_handler, - }, -#ifdef CONFIG_NUMA - { - .procname = "nr_hugepages_mempolicy", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &hugetlb_mempolicy_sysctl_handler, - }, -#endif - { - .procname = "hugetlb_shm_group", - .data = &sysctl_hugetlb_shm_group, - .maxlen = sizeof(gid_t), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "nr_overcommit_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_overcommit_handler, - }, -}; - -static void __init hugetlb_sysctl_init(void) -{ - register_sysctl_init("vm", hugetlb_table); -} -#endif /* CONFIG_SYSCTL */ - void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h; diff --git a/mm/hugetlb_internal.h b/mm/hugetlb_internal.h index 5ea372500de7..1d2f870deccf 100644 --- a/mm/hugetlb_internal.h +++ b/mm/hugetlb_internal.h @@ -108,4 +108,10 @@ extern ssize_t __nr_hugepages_store_common(bool obey_mempolicy, extern void hugetlb_sysfs_init(void) __init; +#ifdef CONFIG_SYSCTL +extern void hugetlb_sysctl_init(void); +#else +static inline void hugetlb_sysctl_init(void) { } +#endif + #endif /* _LINUX_HUGETLB_INTERNAL_H */ diff --git a/mm/hugetlb_sysctl.c b/mm/hugetlb_sysctl.c new file mode 100644 index 000000000000..bd3077150542 --- /dev/null +++ b/mm/hugetlb_sysctl.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HugeTLB sysfs interfaces. 
+ * (C) Nadia Yvette Chambers, April 2004 + */ + +#include + +#include "hugetlb_internal.h" + +#ifdef CONFIG_SYSCTL +static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos, unsigned long *out) +{ + struct ctl_table dup_table; + + /* + * In order to avoid races with __do_proc_doulongvec_minmax(), we + * can duplicate the @table and alter the duplicate of it. + */ + dup_table = *table; + dup_table.data = out; + + return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); +} + +static int hugetlb_sysctl_handler_common(bool obey_mempolicy, + const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + struct hstate *h = &default_hstate; + unsigned long tmp = h->max_huge_pages; + int ret; + + if (!hugepages_supported()) + return -EOPNOTSUPP; + + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); + if (ret) + goto out; + + if (write) + ret = __nr_hugepages_store_common(obey_mempolicy, h, + NUMA_NO_NODE, tmp, *length); +out: + return ret; +} + +static int hugetlb_sysctl_handler(const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + + return hugetlb_sysctl_handler_common(false, table, write, + buffer, length, ppos); +} + +#ifdef CONFIG_NUMA +static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + return hugetlb_sysctl_handler_common(true, table, write, + buffer, length, ppos); +} +#endif /* CONFIG_NUMA */ + +static int hugetlb_overcommit_handler(const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + struct hstate *h = &default_hstate; + unsigned long tmp; + int ret; + + if (!hugepages_supported()) + return -EOPNOTSUPP; + + tmp = h->nr_overcommit_huge_pages; + + if (write && hstate_is_gigantic_no_runtime(h)) + return -EINVAL; + + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); + if (ret) + goto out; + + if (write) { + spin_lock_irq(&hugetlb_lock); + h->nr_overcommit_huge_pages = tmp; + spin_unlock_irq(&hugetlb_lock); + } +out: + return ret; +} + +static const struct ctl_table hugetlb_table[] = { + { + .procname = "nr_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + }, +#endif + { + .procname = "hugetlb_shm_group", + .data = &sysctl_hugetlb_shm_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nr_overcommit_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_overcommit_handler, + }, +}; + +void __init hugetlb_sysctl_init(void) +{ + register_sysctl_init("vm", hugetlb_table); +} +#endif /* CONFIG_SYSCTL */ From 5dba5cc2e0ffa76f2f6c8922a04469dc9602c396 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:43 +0000 Subject: [PATCH 182/321] mm: introduce VM_MAYBE_GUARD and make visible in /proc/$pid/smaps Patch series "introduce VM_MAYBE_GUARD and make it sticky", v4. Currently, guard regions are not visible to users except through /proc/$pid/pagemap, with no explicit visibility at the VMA level. 
This makes the feature less useful, as it isn't entirely apparent which VMAs may have these entries present, especially when performing actions which walk through memory regions such as those performed by CRIU. This series addresses this issue by introducing the VM_MAYBE_GUARD flag which fulfils this role, updating the smaps logic to display an entry for these. The semantics of this flag are that a guard region MAY be present if set (we cannot be sure, as we can't efficiently track whether an MADV_GUARD_REMOVE finally removes all the guard regions in a VMA) - but if not set the VMA definitely does NOT have any guard regions present. It's problematic to establish this flag without further action, because that means that VMAs with guard regions in them become non-mergeable with adjacent VMAs for no especially good reason. To work around this, this series also introduces the concept of 'sticky' VMA flags - that is, flags which: a. If set in one VMA and not in another, still permit those VMAs to be merged (if otherwise compatible). b. When they are merged, the resultant VMA must have the flag set. The VMA logic is updated to propagate these flags correctly. Additionally, VM_MAYBE_GUARD being an explicit VMA flag allows us to solve an issue with file-backed guard regions - previously these established an anon_vma object for file-backed mappings solely to have vma_needs_copy() correctly propagate guard region mappings to child processes. We introduce a new flag alias VM_COPY_ON_FORK (which currently only specifies VM_MAYBE_GUARD) and update vma_needs_copy() to check explicitly for this flag and to copy page tables if it is present, which resolves this issue. Additionally, we add the ability for allow-listed VMA flags to be atomically writable with only mmap/VMA read locks held. The only flag we allow so far is VM_MAYBE_GUARD, which we carefully ensure does not cause any races by being allowed to do so. This allows us to maintain guard region installation as a read-locked operation and not endure the overhead of obtaining a write lock here. Finally, we introduce extensive VMA userland tests to assert that the sticky VMA logic behaves correctly as well as guard region self tests to assert that smaps visibility is correctly implemented. This patch (of 9): Currently, if a user needs to determine if guard regions are present in a range, they have to scan all VMAs (or have knowledge of which ones might have guard regions). Since commit 8e2f2aeb8b48 ("fs/proc/task_mmu: add guard region bit to pagemap") and the related commit a516403787e0 ("fs/proc: extend the PAGEMAP_SCAN ioctl to report guard regions"), users can use either /proc/$pid/pagemap or the PAGEMAP_SCAN functionality to perform this operation at a virtual address level. This is not ideal, and it gives no visibility at a /proc/$pid/smaps level that guard regions exist in ranges. This patch remedies the situation by establishing a new VMA flag, VM_MAYBE_GUARD, to indicate that a VMA may contain guard regions (it is uncertain because we cannot reasonably determine whether a MADV_GUARD_REMOVE call has removed all of the guard regions in a VMA, and additionally VMAs may change across merge/split). We utilise 0x800 for this flag which makes it available to 32-bit architectures also, a flag that was previously used by VM_DENYWRITE, which was removed in commit 8d0920bde5eb ("mm: remove VM_DENYWRITE") and hasn't been reused yet. We also update the smaps logic and documentation to identify these VMAs.
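For illustration only (not part of this patch): a minimal userspace sketch of how the new visibility could be consumed. It scans /proc/self/smaps for the "gu" mnemonic this patch adds to the VmFlags line; the program, its heuristics and its output are hypothetical and merely demonstrate the intended use:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            FILE *f = fopen("/proc/self/smaps", "r");
            char line[1024];
            char range[128] = "";

            if (!f)
                    return 1;

            while (fgets(line, sizeof(line), f)) {
                    char first[128];

                    /* VMA header lines begin with a "start-end" address range. */
                    if (sscanf(line, "%127s", first) == 1 && strchr(first, '-'))
                            strcpy(range, first);

                    /*
                     * The VmFlags line lists two-letter mnemonics; a simple
                     * substring check for " gu" is enough for this sketch.
                     */
                    if (!strncmp(line, "VmFlags:", 8) && strstr(line, " gu"))
                            printf("%s may contain guard regions\n", range);
            }

            fclose(f);
            return 0;
    }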
Another major use of this functionality is that we can use it to identify that we ought to copy page tables on fork. We do not actually implement usage of this flag in mm/madvise.c yet as we need to allow some VMA flags to be applied atomically under mmap/VMA read lock in order to avoid the need to acquire a write lock for this purpose. Link: https://lkml.kernel.org/r/cover.1763460113.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/cf8ef821eba29b6c5b5e138fffe95d6dcabdedb9.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 5 +++-- fs/proc/task_mmu.c | 1 + include/linux/mm.h | 3 +++ include/trace/events/mmflags.h | 1 + mm/memory.c | 4 ++++ tools/testing/vma/vma_internal.h | 1 + 6 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 0b86a8022fa1..8256e857e2d7 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -553,7 +553,7 @@ otherwise. kernel flags associated with the particular virtual memory area in two letter encoded manner. The codes are the following: - == ======================================= + == ============================================================= rd readable wr writeable ex executable @@ -591,7 +591,8 @@ encoded manner. The codes are the following: sl sealed lf lock on fault pages dp always lazily freeable mapping - == ======================================= + gu maybe contains guard regions (if not set, definitely doesn't) + == ============================================================= Note that there is no guarantee that every flag and associated mnemonic will be present in all further kernel releases. Things get changed, the flags may diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fc35a0543f01..db16ed91c269 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1146,6 +1146,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MAYSHARE)] = "ms", [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_MAYBE_GUARD)] = "gu", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", diff --git a/include/linux/mm.h b/include/linux/mm.h index df9f258a017c..36b9418c00fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -271,6 +271,8 @@ extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif +#define VM_MAYBE_GUARD_BIT 11 + /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h @@ -296,6 +298,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_UFFD_MISSING 0 #endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ +#define VM_MAYBE_GUARD BIT(VM_MAYBE_GUARD_BIT) /* The VMA maybe contains guard regions. 
*/ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index aa441f593e9a..a6e5a44c9b42 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -213,6 +213,7 @@ IF_HAVE_PG_ARCH_3(arch_3) {VM_UFFD_MISSING, "uffd_missing" }, \ IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \ {VM_PFNMAP, "pfnmap" }, \ + {VM_MAYBE_GUARD, "maybe_guard" }, \ {VM_UFFD_WP, "uffd_wp" }, \ {VM_LOCKED, "locked" }, \ {VM_IO, "io" }, \ diff --git a/mm/memory.c b/mm/memory.c index b09de6274da3..d1728d0538d6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1478,6 +1478,10 @@ vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (src_vma->anon_vma) return true; + /* Guard regions have modified page tables that require copying. */ + if (src_vma->vm_flags & VM_MAYBE_GUARD) + return true; + /* * Don't copy ptes where a page fault will fill them correctly. Fork * becomes much lighter when there are big shared or private readonly diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index c68d382dac81..46acb4df45de 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -56,6 +56,7 @@ extern unsigned long dac_mmap_min_addr; #define VM_MAYEXEC 0x00000040 #define VM_GROWSDOWN 0x00000100 #define VM_PFNMAP 0x00000400 +#define VM_MAYBE_GUARD 0x00000800 #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ From 568822502383acd57d7cc1c72ee43932c45a9524 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:44 +0000 Subject: [PATCH 183/321] mm: add atomic VMA flags and set VM_MAYBE_GUARD as such This patch adds the ability to atomically set VMA flags with only the mmap read/VMA read lock held. As this could be hugely problematic for VMA flags in general given that all other accesses are non-atomic and serialised by the mmap/VMA locks, we implement this with a strict allow-list - that is, only designated flags are allowed to do this. We make VM_MAYBE_GUARD one of these flags. Link: https://lkml.kernel.org/r/97e57abed09f2663077ed7a36fb8206e243171a9.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 36b9418c00fc..03776aab3837 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -518,6 +518,9 @@ extern unsigned int kobjsize(const void *objp); /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +/* These flags can be updated atomically via VMA/mmap read lock. 
*/ +#define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD + /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE @@ -860,6 +863,47 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, __vm_flags_mod(vma, set, clear); } +static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, + int bit) +{ + const vm_flags_t mask = BIT(bit); + + /* Only specific flags are permitted */ + if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED))) + return false; + + return true; +} + +/* + * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific + * valid flags are allowed to do this. + */ +static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit) +{ + /* mmap read lock/VMA read lock must be held. */ + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) + vma_assert_locked(vma); + + if (__vma_flag_atomic_valid(vma, bit)) + set_bit(bit, &ACCESS_PRIVATE(vma, __vm_flags)); +} + +/* + * Test for VMA flag atomically. Requires no locks. Only specific valid flags + * are allowed to do this. + * + * This is necessarily racey, so callers must ensure that serialisation is + * achieved through some other means, or that races are permissible. + */ +static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, int bit) +{ + if (__vma_flag_atomic_valid(vma, bit)) + return test_bit(bit, &vma->vm_flags); + + return false; +} + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; From 9119d6c2095bb20292cb9812dd70d37f17e3bd37 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:45 +0000 Subject: [PATCH 184/321] mm: update vma_modify_flags() to handle residual flags, document The vma_modify_*() family of functions each either perform splits, a merge or no changes at all in preparation for the requested modification to occur. When doing so for a VMA flags change, we currently don't account for any flags which may remain (for instance, VM_SOFTDIRTY) despite the requested change in the case that a merge succeeded. This is made more important by subsequent patches which will introduce the concept of sticky VMA flags which rely on this behaviour. This patch fixes this by passing the VMA flags parameter as a pointer and updating it accordingly on merge and updating callers to accommodate for this. Additionally, while we are here, we add kdocs for each of the vma_modify_*() functions, as the fact that the requested modification is not performed is confusing so it is useful to make this abundantly clear. We also update the VMA userland tests to account for this change. 
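To make the revised contract concrete, here is a rough sketch of the calling convention, loosely following the mprotect/mlock callers touched by this patch (vmi, prev, start and end stand in for the caller's own state, and VM_LOCKED is merely an example flag):

    vm_flags_t newflags = vma->vm_flags | VM_LOCKED;

    /* May split @vma or merge it with neighbours; does not apply the flags. */
    vma = vma_modify_flags(&vmi, prev, vma, start, end, &newflags);
    if (IS_ERR(vma))
            return PTR_ERR(vma);

    /*
     * On a successful merge, newflags now also carries any residual flags
     * the merged VMA retained (e.g. VM_SOFTDIRTY), so overwriting the
     * VMA's flags with it does not drop them.
     */
    vm_flags_reset(vma, newflags);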
Link: https://lkml.kernel.org/r/23b5b549b0eaefb2922625626e58c2a352f3e93c.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/madvise.c | 2 +- mm/mlock.c | 2 +- mm/mprotect.c | 2 +- mm/mseal.c | 9 +-- mm/vma.c | 56 +++++++++-------- mm/vma.h | 132 ++++++++++++++++++++++++++++++---------- tools/testing/vma/vma.c | 3 +- 7 files changed, 140 insertions(+), 66 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index fb1c86e630b6..0b3280752bfb 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -167,7 +167,7 @@ static int madvise_update_vma(vm_flags_t new_flags, range->start, range->end, anon_name); else vma = vma_modify_flags(&vmi, madv_behavior->prev, vma, - range->start, range->end, new_flags); + range->start, range->end, &new_flags); if (IS_ERR(vma)) return PTR_ERR(vma); diff --git a/mm/mlock.c b/mm/mlock.c index bb0776f5ef7c..2f699c3497a5 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -478,7 +478,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; - vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); + vma = vma_modify_flags(vmi, *prev, vma, start, end, &newflags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; diff --git a/mm/mprotect.c b/mm/mprotect.c index ab4e06cd9a69..db93d3bb1a5e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -756,7 +756,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, newflags &= ~VM_ACCOUNT; } - vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags); + vma = vma_modify_flags(vmi, *pprev, vma, start, end, &newflags); if (IS_ERR(vma)) { error = PTR_ERR(vma); goto fail; diff --git a/mm/mseal.c b/mm/mseal.c index e5b205562d2e..ae442683c5c0 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -66,12 +66,13 @@ static int mseal_apply(struct mm_struct *mm, prev = vma; for_each_vma_range(vmi, vma, end) { - unsigned long curr_end = MIN(vma->vm_end, end); + const unsigned long curr_end = MIN(vma->vm_end, end); if (!(vma->vm_flags & VM_SEALED)) { - vma = vma_modify_flags(&vmi, prev, vma, - curr_start, curr_end, - vma->vm_flags | VM_SEALED); + vm_flags_t vm_flags = vma->vm_flags | VM_SEALED; + + vma = vma_modify_flags(&vmi, prev, vma, curr_start, + curr_end, &vm_flags); if (IS_ERR(vma)) return PTR_ERR(vma); vm_flags_set(vma, VM_SEALED); diff --git a/mm/vma.c b/mm/vma.c index 0c5e391fe2e2..47469c036a72 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1638,25 +1638,35 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) return vma; } -struct vm_area_struct *vma_modify_flags( - struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *vma, unsigned long start, unsigned long end, - vm_flags_t vm_flags) +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + vm_flags_t *vm_flags_ptr) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + const vm_flags_t vm_flags = *vm_flags_ptr; + struct vm_area_struct *ret; vmg.vm_flags = vm_flags; - return vma_modify(&vmg); + ret = 
vma_modify(&vmg); + if (IS_ERR(ret)) + return ret; + + /* + * For a merge to succeed, the flags must match those requested. For + * flags which do not obey typical merge rules (i.e. do not need to + * match), we must let the caller know about them. + */ + if (vmg.state == VMA_MERGE_SUCCESS) + *vm_flags_ptr = ret->vm_flags; + return ret; } -struct vm_area_struct -*vma_modify_name(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - struct anon_vma_name *new_name) +struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct anon_vma_name *new_name) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); @@ -1665,12 +1675,10 @@ struct vm_area_struct return vma_modify(&vmg); } -struct vm_area_struct -*vma_modify_policy(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct mempolicy *new_pol) +struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new_pol) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); @@ -1679,14 +1687,10 @@ struct vm_area_struct return vma_modify(&vmg); } -struct vm_area_struct -*vma_modify_flags_uffd(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - vm_flags_t vm_flags, - struct vm_userfaultfd_ctx new_ctx, - bool give_up_on_oom) +struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, vm_flags_t vm_flags, + struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); diff --git a/mm/vma.h b/mm/vma.h index e912d42c428a..75f1d9c7204b 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -263,47 +263,115 @@ void remove_vma(struct vm_area_struct *vma); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); -/* We are about to modify the VMA's flags. */ -__must_check struct vm_area_struct -*vma_modify_flags(struct vma_iterator *vmi, +/** + * vma_modify_flags() - Peform any necessary split/merge in preparation for + * setting VMA flags to *@vm_flags in the range @start to @end contained within + * @vma. + * @vmi: Valid VMA iterator positioned at @vma. + * @prev: The VMA immediately prior to @vma or NULL if @vma is the first. + * @vma: The VMA containing the range @start to @end to be updated. + * @start: The start of the range to update. May be offset within @vma. + * @end: The exclusive end of the range to update, may be offset within @vma. + * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is + * about to be set to. On merge, this will be updated to include any additional + * flags which remain in place. + * + * IMPORTANT: The actual modification being requested here is NOT applied, + * rather the VMA is perhaps split, perhaps merged to accommodate the change, + * and the caller is expected to perform the actual modification. + * + * In order to account for VMA flags which may persist (e.g. 
soft-dirty), the + * @vm_flags_ptr parameter points to the requested flags which are then updated + * so the caller, should they overwrite any existing flags, correctly retains + * these. + * + * Returns: A VMA which contains the range @start to @end ready to have its + * flags altered to *@vm_flags. + */ +__must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - vm_flags_t vm_flags); + vm_flags_t *vm_flags_ptr); -/* We are about to modify the VMA's anon_name. */ -__must_check struct vm_area_struct -*vma_modify_name(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - struct anon_vma_name *new_name); +/** + * vma_modify_name() - Peform any necessary split/merge in preparation for + * setting anonymous VMA name to @new_name in the range @start to @end contained + * within @vma. + * @vmi: Valid VMA iterator positioned at @vma. + * @prev: The VMA immediately prior to @vma or NULL if @vma is the first. + * @vma: The VMA containing the range @start to @end to be updated. + * @start: The start of the range to update. May be offset within @vma. + * @end: The exclusive end of the range to update, may be offset within @vma. + * @new_name: The anonymous VMA name that the @start to @end range is about to + * be set to. + * + * IMPORTANT: The actual modification being requested here is NOT applied, + * rather the VMA is perhaps split, perhaps merged to accommodate the change, + * and the caller is expected to perform the actual modification. + * + * Returns: A VMA which contains the range @start to @end ready to have its + * anonymous VMA name changed to @new_name. + */ +__must_check struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct anon_vma_name *new_name); -/* We are about to modify the VMA's memory policy. */ -__must_check struct vm_area_struct -*vma_modify_policy(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, +/** + * vma_modify_policy() - Peform any necessary split/merge in preparation for + * setting NUMA policy to @new_pol in the range @start to @end contained + * within @vma. + * @vmi: Valid VMA iterator positioned at @vma. + * @prev: The VMA immediately prior to @vma or NULL if @vma is the first. + * @vma: The VMA containing the range @start to @end to be updated. + * @start: The start of the range to update. May be offset within @vma. + * @end: The exclusive end of the range to update, may be offset within @vma. + * @new_pol: The NUMA policy that the @start to @end range is about to be set + * to. + * + * IMPORTANT: The actual modification being requested here is NOT applied, + * rather the VMA is perhaps split, perhaps merged to accommodate the change, + * and the caller is expected to perform the actual modification. + * + * Returns: A VMA which contains the range @start to @end ready to have its + * NUMA policy changed to @new_pol. + */ +__must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol); -/* We are about to modify the VMA's flags and/or uffd context. 
*/ -__must_check struct vm_area_struct -*vma_modify_flags_uffd(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - vm_flags_t vm_flags, - struct vm_userfaultfd_ctx new_ctx, - bool give_up_on_oom); +/** + * vma_modify_flags_uffd() - Peform any necessary split/merge in preparation for + * setting VMA flags to @vm_flags and UFFD context to @new_ctx in the range + * @start to @end contained within @vma. + * @vmi: Valid VMA iterator positioned at @vma. + * @prev: The VMA immediately prior to @vma or NULL if @vma is the first. + * @vma: The VMA containing the range @start to @end to be updated. + * @start: The start of the range to update. May be offset within @vma. + * @end: The exclusive end of the range to update, may be offset within @vma. + * @vm_flags: The VMA flags that the @start to @end range is about to be set to. + * @new_ctx: The userfaultfd context that the @start to @end range is about to + * be set to. + * @give_up_on_oom: If an out of memory condition occurs on merge, simply give + * up on it and treat the merge as best-effort. + * + * IMPORTANT: The actual modification being requested here is NOT applied, + * rather the VMA is perhaps split, perhaps merged to accommodate the change, + * and the caller is expected to perform the actual modification. + * + * Returns: A VMA which contains the range @start to @end ready to have its VMA + * flags changed to @vm_flags and its userfaultfd context changed to @new_ctx. + */ +__must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, vm_flags_t vm_flags, + struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); -__must_check struct vm_area_struct -*vma_merge_new_range(struct vma_merge_struct *vmg); +__must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); -__must_check struct vm_area_struct -*vma_merge_extend(struct vma_iterator *vmi, - struct vm_area_struct *vma, - unsigned long delta); +__must_check struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 656e1c75b711..fd37ce3b2628 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -339,6 +339,7 @@ static bool test_simple_modify(void) struct mm_struct mm = {}; struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags); VMA_ITERATOR(vmi, &mm, 0x1000); + vm_flags_t flags = VM_READ | VM_MAYREAD; ASSERT_FALSE(attach_vma(&mm, init_vma)); @@ -347,7 +348,7 @@ static bool test_simple_modify(void) * performs the merge/split only. */ vma = vma_modify_flags(&vmi, init_vma, init_vma, - 0x1000, 0x2000, VM_READ | VM_MAYREAD); + 0x1000, 0x2000, &flags); ASSERT_NE(vma, NULL); /* We modify the provided VMA, and on split allocate new VMAs. */ ASSERT_EQ(vma, init_vma); From 64212ba02e66e705cabce188453ba4e61e9d7325 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:46 +0000 Subject: [PATCH 185/321] mm: implement sticky VMA flags It is useful to be able to designate that certain flags are 'sticky', that is, if two VMAs are merged one with a flag of this nature and one without, the merged VMA sets this flag. 
As a result we ignore these flags for the purposes of determining VMA flag differences between VMAs being considered for merge. This patch therefore updates the VMA merge logic to perform this action, with flags possessing this property being described in the VM_STICKY bitmap. Those flags which ought to be ignored for the purposes of VMA merge are described in the VM_IGNORE_MERGE bitmap, which the VMA merge logic is also updated to use. As part of this change we place VM_SOFTDIRTY in VM_IGNORE_MERGE as it already had this behaviour, alongside VM_STICKY as sticky flags by implication must not disallow merge. Ultimately it seems that we should make VM_SOFTDIRTY a sticky flag in its own right, but this change is out of scope for this series. The only sticky flag designated as such is VM_MAYBE_GUARD, so as a result of this change, once the VMA flag is set upon guard region installation, VMAs with guard ranges will now not have their merge behaviour impacted as a result and can be freely merged with other VMAs without VM_MAYBE_GUARD set. Also update the comments for vma_modify_flags() to directly reference sticky flags now we have established the concept. We also update the VMA userland tests to account for the changes. Link: https://lkml.kernel.org/r/22ad5269f7669d62afb42ce0c79bad70b994c58d.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++++++++++++++ mm/vma.c | 28 +++++++++++++++------------- mm/vma.h | 10 ++++------ tools/testing/vma/vma_internal.h | 28 ++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 19 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 03776aab3837..fea113d1d723 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -527,6 +527,34 @@ extern unsigned int kobjsize(const void *objp); #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) +/* + * Flags which should be 'sticky' on merge - that is, flags which, when one VMA + * possesses it but the other does not, the merged VMA should nonetheless have + * applied to it: + * + * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that + * mapped page tables may contain metadata not described by the + * VMA and thus any merged VMA may also contain this metadata, + * and thus we must make this flag sticky. + */ +#define VM_STICKY VM_MAYBE_GUARD + +/* + * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one + * of these flags and the other not does not preclude a merge. + * + * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but + * dirty bit -- the caller should mark merged VMA as dirty. If + * dirty bit won't be excluded from comparison, we increase + * pressure on the memory system forcing the kernel to generate + * new VMAs when old one could be extended instead. + * + * VM_STICKY - When merging VMAs, VMA flags must match, unless they are + * 'sticky'. If any sticky flags exist in either VMA, we simply + * set all of them on the merged VMA. 
+ */ +#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. diff --git a/mm/vma.c b/mm/vma.c index 47469c036a72..4e21c988054d 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -89,15 +89,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; - /* - * VM_SOFTDIRTY should not prevent from VMA merging, if we - * match the flags but dirty bit -- the caller should mark - * merged VMA as dirty. If dirty bit won't be excluded from - * comparison, we increase pressure on the memory system forcing - * the kernel to generate new VMAs when old one could be - * extended instead. - */ - if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY) + if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE) return false; if (vma->vm_file != vmg->file) return false; @@ -808,6 +800,7 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { + vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY; struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; @@ -900,11 +893,13 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (merge_right) { vma_start_write(next); vmg->target = next; + sticky_flags |= (next->vm_flags & VM_STICKY); } if (merge_left) { vma_start_write(prev); vmg->target = prev; + sticky_flags |= (prev->vm_flags & VM_STICKY); } if (merge_both) { @@ -974,6 +969,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (err || commit_merge(vmg)) goto abort; + vm_flags_set(vmg->target, sticky_flags); khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; @@ -1124,6 +1120,10 @@ int vma_expand(struct vma_merge_struct *vmg) bool remove_next = false; struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; + vm_flags_t sticky_flags; + + sticky_flags = vmg->vm_flags & VM_STICKY; + sticky_flags |= target->vm_flags & VM_STICKY; VM_WARN_ON_VMG(!target, vmg); @@ -1133,6 +1133,7 @@ int vma_expand(struct vma_merge_struct *vmg) if (next && (target != next) && (vmg->end == next->vm_end)) { int ret; + sticky_flags |= next->vm_flags & VM_STICKY; remove_next = true; /* This should already have been checked by this point. */ VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); @@ -1159,6 +1160,7 @@ int vma_expand(struct vma_merge_struct *vmg) if (commit_merge(vmg)) goto nomem; + vm_flags_set(target, sticky_flags); return 0; nomem: @@ -1654,9 +1656,9 @@ struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, return ret; /* - * For a merge to succeed, the flags must match those requested. For - * flags which do not obey typical merge rules (i.e. do not need to - * match), we must let the caller know about them. + * For a merge to succeed, the flags must match those + * requested. However, sticky flags may have been retained, so propagate + * them to the caller. 
*/ if (vmg.state == VMA_MERGE_SUCCESS) *vm_flags_ptr = ret->vm_flags; @@ -1906,7 +1908,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } diff --git a/mm/vma.h b/mm/vma.h index 75f1d9c7204b..abada6a64c4e 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -273,17 +273,15 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is - * about to be set to. On merge, this will be updated to include any additional - * flags which remain in place. + * about to be set to. On merge, this will be updated to include sticky flags. * * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. * - * In order to account for VMA flags which may persist (e.g. soft-dirty), the - * @vm_flags_ptr parameter points to the requested flags which are then updated - * so the caller, should they overwrite any existing flags, correctly retains - * these. + * In order to account for sticky VMA flags, the @vm_flags_ptr parameter points + * to the requested flags which are then updated so the caller, should they + * overwrite any existing flags, correctly retains these. * * Returns: A VMA which contains the range @start to @end ready to have its * flags altered to *@vm_flags. diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 46acb4df45de..73c2025777e6 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -117,6 +117,34 @@ extern unsigned long dac_mmap_min_addr; #define VM_SEALED VM_NONE #endif +/* + * Flags which should be 'sticky' on merge - that is, flags which, when one VMA + * possesses it but the other does not, the merged VMA should nonetheless have + * applied to it: + * + * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that + * mapped page tables may contain metadata not described by the + * VMA and thus any merged VMA may also contain this metadata, + * and thus we must make this flag sticky. + */ +#define VM_STICKY VM_MAYBE_GUARD + +/* + * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one + * of these flags and the other not does not preclude a merge. + * + * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but + * dirty bit -- the caller should mark merged VMA as dirty. If + * dirty bit won't be excluded from comparison, we increase + * pressure on the memory system forcing the kernel to generate + * new VMAs when old one could be extended instead. + * + * VM_STICKY - When merging VMAs, VMA flags must match, unless they are + * 'sticky'. If any sticky flags exist in either VMA, we simply + * set all of them on the merged VMA. 
+ */ +#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) + #define FIRST_USER_ADDRESS 0UL #define USER_PGTABLES_CEILING 0UL From ab04b530e7e8bd5cf9fb0c1ad20e0deee8f569ec Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:47 +0000 Subject: [PATCH 186/321] mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one Gather all the VMA flags whose presence implies that page tables must be copied on fork into a single bitmap - VM_COPY_ON_FORK - and use this rather than specifying individual flags in vma_needs_copy(). We also add VM_MAYBE_GUARD to this list, as it being set on a VMA implies that there may be metadata contained in the page tables (that is - guard markers) which would will not and cannot be propagated upon fork. This was already being done manually previously in vma_needs_copy(), but this makes it very explicit, alongside VM_PFNMAP, VM_MIXEDMAP and VM_UFFD_WP all of which imply the same. Note that VM_STICKY flags ought generally to be marked VM_COPY_ON_FORK too - because equally a flag being VM_STICKY indicates that the VMA contains metadat that is not propagated by being faulted in - i.e. that the VMA metadata does not fully describe the VMA alone, and thus we must propagate whatever metadata there is on a fork. However, for maximum flexibility, we do not make this necessarily the case here. Link: https://lkml.kernel.org/r/5d41b24e7bc622cda0af92b6d558d7f4c0d1bc8c.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 26 ++++++++++++++++++++++++++ mm/memory.c | 18 ++++-------------- tools/testing/vma/vma_internal.h | 26 ++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index fea113d1d723..af2904aeb163 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -555,6 +555,32 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +/* + * Flags which should result in page tables being copied on fork. These are + * flags which indicate that the VMA maps page tables which cannot be + * reconsistuted upon page fault, so necessitate page table copying upon + * + * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be + * reasonably reconstructed on page fault. + * + * VM_UFFD_WP - Encodes metadata about an installed uffd + * write protect handler, which cannot be + * reconstructed on page fault. + * + * We always copy pgtables when dst_vma has uffd-wp + * enabled even if it's file-backed + * (e.g. shmem). Because when uffd-wp is enabled, + * pgtable contains uffd-wp protection information, + * that's something we can't retrieve from page cache, + * and skip copying will lose those info. + * + * VM_MAYBE_GUARD - Could contain page guard region markers which + * by design are a property of the page tables + * only and thus cannot be reconstructed on page + * fault. 
+ */ +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. diff --git a/mm/memory.c b/mm/memory.c index d1728d0538d6..27bc457b32c2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1463,25 +1463,15 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { + if (src_vma->vm_flags & VM_COPY_ON_FORK) + return true; /* - * Always copy pgtables when dst_vma has uffd-wp enabled even if it's - * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable - * contains uffd-wp protection information, that's something we can't - * retrieve from page cache, and skip copying will lose those info. + * The presence of an anon_vma indicates an anonymous VMA has page + * tables which naturally cannot be reconstituted on page fault. */ - if (userfaultfd_wp(dst_vma)) - return true; - - if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) - return true; - if (src_vma->anon_vma) return true; - /* Guard regions have modified page tables that require copying. */ - if (src_vma->vm_flags & VM_MAYBE_GUARD) - return true; - /* * Don't copy ptes where a page fault will fill them correctly. Fork * becomes much lighter when there are big shared or private readonly diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 73c2025777e6..233819a9e7ee 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -145,6 +145,32 @@ extern unsigned long dac_mmap_min_addr; */ #define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +/* + * Flags which should result in page tables being copied on fork. These are + * flags which indicate that the VMA maps page tables which cannot be + * reconsistuted upon page fault, so necessitate page table copying upon + * + * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be + * reasonably reconstructed on page fault. + * + * VM_UFFD_WP - Encodes metadata about an installed uffd + * write protect handler, which cannot be + * reconstructed on page fault. + * + * We always copy pgtables when dst_vma has uffd-wp + * enabled even if it's file-backed + * (e.g. shmem). Because when uffd-wp is enabled, + * pgtable contains uffd-wp protection information, + * that's something we can't retrieve from page cache, + * and skip copying will lose those info. + * + * VM_MAYBE_GUARD - Could contain page guard region markers which + * by design are a property of the page tables + * only and thus cannot be reconstructed on page + * fault. + */ +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) + #define FIRST_USER_ADDRESS 0UL #define USER_PGTABLES_CEILING 0UL From 49e14dabed7a294427588d4b315f57fbfcab9990 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:48 +0000 Subject: [PATCH 187/321] mm: set the VM_MAYBE_GUARD flag on guard region install Now we have established the VM_MAYBE_GUARD flag and added the capacity to set it atomically, do so upon MADV_GUARD_INSTALL. The places where this flag is used currently and matter are: * VMA merge - performed under mmap/VMA write lock, therefore excluding racing writes. * /proc/$pid/smaps - can race the write, however this isn't meaningful as the flag write is performed at the point of the guard region being established, and thus an smaps reader can't reasonably expect to avoid races. 
Due to atomicity, a reader will observe either the flag being set or not, so consistency is maintained. In all other cases the flag being set is irrelevant, and atomicity guarantees that other flags will be read correctly.

Note that non-atomic updates of unrelated flags do not cause an issue with this flag being set atomically, as writes of other flags are performed under the mmap/VMA write lock, while these atomic writes are performed under the mmap/VMA read lock, which excludes the write and thus avoids RMW races.

Note that we do not encounter issues with KCSAN by adjusting this flag atomically, as we are only updating a single bit in the flag bitmap and therefore do not need to annotate these changes.

We intentionally set this flag in advance of actually updating the page tables, to ensure that any racing atomic read of this flag can only return false prior to the page tables being updated, allowing serialisation via the page table locks.

Note that we set vma->anon_vma for anonymous mappings. This is because the expectation for anonymous mappings is that an anon_vma is established should they possess any page table mappings. This is also consistent with what we were doing prior to this patch (unconditionally setting anon_vma on guard region installation).

We also need to update retract_page_tables() to ensure that madvise(..., MADV_COLLAPSE) doesn't incorrectly collapse file-backed ranges containing guard regions. This was previously guarded by anon_vma being set to catch MAP_PRIVATE cases, but the introduction of VM_MAYBE_GUARD necessitates that we check this flag instead.

We utilise vma_flag_test_atomic() to do so - we first perform an optimistic check, then, after the PTE page table lock is held, we check again safely, as upon guard marker install the flag is set atomically prior to the page table lock being taken to actually apply it. So if the initial optimistic check does not see the flag set, then either:

* Page table retraction acquires the page table lock prior to VM_MAYBE_GUARD being set - guard marker installation will be blocked until page table retraction is complete.

OR:

* Guard marker installation acquires the page table lock after setting VM_MAYBE_GUARD, the flag having been set too late for the initial optimistic check to pick it up - page table retraction is blocked until the guard regions are installed, and the second VM_MAYBE_GUARD check then prevents the retraction.

Either way we're safe.

We refactor the retraction checks into a single helper, file_backed_vma_is_retractable(), as there does not seem to be any reason for the checks to remain separate.

Note that VM_MAYBE_GUARD being set atomically remains correct, as vma_needs_copy() is invoked with the mmap and VMA write locks held, excluding any race with madvise_guard_install().
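The serialisation argument can be compressed into a small userspace model (ad-hoc names, not kernel code - an atomic_bool stands in for VM_MAYBE_GUARD and a pthread mutex for the PTE page table lock): provided the flag is set before the installer takes the lock, a retractor that re-checks the flag under the lock can never retract a table that already holds guard markers.

  #include <assert.h>
  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>

  static atomic_bool maybe_guard;		/* stand-in for VM_MAYBE_GUARD */
  static pthread_mutex_t pt_lock = PTHREAD_MUTEX_INITIALIZER;	/* stand-in for PTE lock */
  static bool has_guard_markers;		/* page table state, protected by pt_lock */

  static void *install_guards(void *unused)
  {
  	/* Flag first, then lock - mirroring madvise_guard_install(). */
  	atomic_store(&maybe_guard, true);
  	pthread_mutex_lock(&pt_lock);
  	has_guard_markers = true;
  	pthread_mutex_unlock(&pt_lock);
  	return NULL;
  }

  static void *retract_tables(void *unused)
  {
  	bool retracted = false;

  	if (!atomic_load(&maybe_guard)) {	/* optimistic check */
  		pthread_mutex_lock(&pt_lock);
  		if (!atomic_load(&maybe_guard)) {
  			/* Re-check under the lock: markers cannot be present yet. */
  			assert(!has_guard_markers);
  			retracted = true;
  		}
  		pthread_mutex_unlock(&pt_lock);
  	}
  	printf("retracted=%d\n", retracted);
  	return NULL;
  }

  int main(void)
  {
  	pthread_t a, b;

  	pthread_create(&a, NULL, install_guards, NULL);
  	pthread_create(&b, NULL, retract_tables, NULL);
  	pthread_join(a, NULL);
  	pthread_join(b, NULL);
  	return 0;
  }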
Link: https://lkml.kernel.org/r/e9e9ce95b6ac17497de7f60fc110c7dd9e489e8d.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Pedro Falcato Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 71 ++++++++++++++++++++++++++++++++----------------- mm/madvise.c | 22 +++++++++------ 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f6a92958157d..1b5c2e942df9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1717,6 +1717,43 @@ drop_folio: return result; } +/* Can we retract page tables for this file-backed VMA? */ +static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) +{ + /* + * Check vma->anon_vma to exclude MAP_PRIVATE mappings that + * got written to. These VMAs are likely not worth removing + * page tables from, as PMD-mapping is likely to be split later. + */ + if (READ_ONCE(vma->anon_vma)) + return false; + + /* + * When a vma is registered with uffd-wp, we cannot recycle + * the page table because there may be pte markers installed. + * Other vmas can still have the same file mapped hugely, but + * skip this one: it will always be mapped in small page size + * for uffd-wp registered ranges. + */ + if (userfaultfd_wp(vma)) + return false; + + /* + * If the VMA contains guard regions then we can't collapse it. + * + * This is set atomically on guard marker installation under mmap/VMA + * read lock, and here we may not hold any VMA or mmap lock at all. + * + * This is therefore serialised on the PTE page table lock, which is + * obtained on guard region installation after the flag is set, so this + * check being performed under this lock excludes races. + */ + if (vma_flag_test_atomic(vma, VM_MAYBE_GUARD_BIT)) + return false; + + return true; +} + static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; @@ -1731,14 +1768,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spinlock_t *ptl; bool success = false; - /* - * Check vma->anon_vma to exclude MAP_PRIVATE mappings that - * got written to. These VMAs are likely not worth removing - * page tables from, as PMD-mapping is likely to be split later. - */ - if (READ_ONCE(vma->anon_vma)) - continue; - addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (addr & ~HPAGE_PMD_MASK || vma->vm_end < addr + HPAGE_PMD_SIZE) @@ -1750,14 +1779,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (hpage_collapse_test_exit(mm)) continue; - /* - * When a vma is registered with uffd-wp, we cannot recycle - * the page table because there may be pte markers installed. - * Other vmas can still have the same file mapped hugely, but - * skip this one: it will always be mapped in small page size - * for uffd-wp registered ranges. - */ - if (userfaultfd_wp(vma)) + + if (!file_backed_vma_is_retractable(vma)) continue; /* PTEs were notified when unmapped; but now for the PMD? 
*/ @@ -1784,15 +1807,15 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* - * Huge page lock is still held, so normally the page table - * must remain empty; and we have already skipped anon_vma - * and userfaultfd_wp() vmas. But since the mmap_lock is not - * held, it is still possible for a racing userfaultfd_ioctl() - * to have inserted ptes or markers. Now that we hold ptlock, - * repeating the anon_vma check protects from one category, - * and repeating the userfaultfd_wp() check from another. + * Huge page lock is still held, so normally the page table must + * remain empty; and we have already skipped anon_vma and + * userfaultfd_wp() vmas. But since the mmap_lock is not held, + * it is still possible for a racing userfaultfd_ioctl() or + * madvise() to have inserted ptes or markers. Now that we hold + * ptlock, repeating the retractable checks protects us from + * races against the prior checks. */ - if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) { + if (likely(file_backed_vma_is_retractable(vma))) { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); success = true; diff --git a/mm/madvise.c b/mm/madvise.c index 0b3280752bfb..5dbe40be7c65 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1141,15 +1141,21 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) return -EINVAL; /* - * If we install guard markers, then the range is no longer - * empty from a page table perspective and therefore it's - * appropriate to have an anon_vma. - * - * This ensures that on fork, we copy page tables correctly. + * Set atomically under read lock. All pertinent readers will need to + * acquire an mmap/VMA write lock to read it. All remaining readers may + * or may not see the flag set, but we don't care. */ - err = anon_vma_prepare(vma); - if (err) - return err; + vma_flag_set_atomic(vma, VM_MAYBE_GUARD_BIT); + + /* + * If anonymous and we are establishing page tables the VMA ought to + * have an anon_vma associated with it. + */ + if (vma_is_anonymous(vma)) { + err = anon_vma_prepare(vma); + if (err) + return err; + } /* * Optimistically try to install the guard marker pages first. If any From 29bef05e6d90b6123275159b52a1e520722243cb Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:49 +0000 Subject: [PATCH 188/321] tools/testing/vma: add VMA sticky userland tests Modify existing merge new/existing userland VMA tests to assert that sticky VMA flags behave as expected. We do so by generating every possible permutation of VMAs being manipulated being sticky/not sticky and asserting that VMA flags with this property retain are retained upon merge. 
Link: https://lkml.kernel.org/r/5e2c7244485867befd052f8afc8188be6a4be670.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Pedro Falcato Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/vma/vma.c | 89 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 79 insertions(+), 10 deletions(-) diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index fd37ce3b2628..be79ab2ea44b 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -48,6 +48,8 @@ static struct anon_vma dummy_anon_vma; #define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) #define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) +#define IS_SET(_val, _flags) ((_val & _flags) == _flags) + static struct task_struct __current; struct task_struct *get_current(void) @@ -442,7 +444,7 @@ static bool test_simple_shrink(void) return true; } -static bool test_merge_new(void) +static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, bool c_is_sticky) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; @@ -470,23 +472,32 @@ static bool test_merge_new(void) struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d; bool merged; + if (is_sticky) + vm_flags |= VM_STICKY; + /* * 0123456789abc * AA B CC */ vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); ASSERT_NE(vma_a, NULL); + if (a_is_sticky) + vm_flags_set(vma_a, VM_STICKY); /* We give each VMA a single avc so we can test anon_vma duplication. */ INIT_LIST_HEAD(&vma_a->anon_vma_chain); list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain); vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags); ASSERT_NE(vma_b, NULL); + if (b_is_sticky) + vm_flags_set(vma_b, VM_STICKY); INIT_LIST_HEAD(&vma_b->anon_vma_chain); list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain); vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, vm_flags); ASSERT_NE(vma_c, NULL); + if (c_is_sticky) + vm_flags_set(vma_c, VM_STICKY); INIT_LIST_HEAD(&vma_c->anon_vma_chain); list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain); @@ -521,6 +532,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky || a_is_sticky || b_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to PREVIOUS VMA. @@ -538,6 +551,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky || a_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to NEXT VMA. @@ -557,6 +572,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky) /* D uses is_sticky. */ + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge BOTH sides. 
@@ -575,6 +592,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (is_sticky || a_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to NEXT VMA. @@ -593,6 +612,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (is_sticky || c_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge BOTH sides. @@ -610,6 +631,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 1); + if (is_sticky || a_is_sticky || c_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Final state. @@ -638,6 +661,20 @@ static bool test_merge_new(void) return true; } +static bool test_merge_new(void) +{ + int i, j, k, l; + + /* Generate every possible permutation of sticky flags. */ + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + for (l = 0; l < 2; l++) + ASSERT_TRUE(__test_merge_new(i, j, k, l)); + + return true; +} + static bool test_vma_merge_special_flags(void) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; @@ -974,9 +1011,11 @@ static bool test_vma_merge_new_with_close(void) return true; } -static bool test_merge_existing(void) +static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bool next_is_sticky) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t prev_flags = vm_flags; + vm_flags_t next_flags = vm_flags; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma, *vma_prev, *vma_next; @@ -989,6 +1028,13 @@ static bool test_merge_existing(void) }; struct anon_vma_chain avc = {}; + if (prev_is_sticky) + prev_flags |= VM_STICKY; + if (middle_is_sticky) + vm_flags |= VM_STICKY; + if (next_is_sticky) + next_flags |= VM_STICKY; + /* * Merge right case - partial span. * @@ -1001,7 +1047,7 @@ static bool test_merge_existing(void) */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma); vmg.middle = vma; @@ -1019,6 +1065,8 @@ static bool test_merge_existing(void) ASSERT_TRUE(vma_write_started(vma)); ASSERT_TRUE(vma_write_started(vma_next)); ASSERT_EQ(mm.map_count, 2); + if (middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY)); /* Clear down and reset. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -1034,7 +1082,7 @@ static bool test_merge_existing(void) * NNNNNNN */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. 
*/ vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, vm_flags, &dummy_anon_vma); vmg.middle = vma; @@ -1047,6 +1095,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_next)); ASSERT_EQ(mm.map_count, 1); + if (middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted vma. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1061,7 +1111,7 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPV */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ @@ -1081,6 +1131,8 @@ static bool test_merge_existing(void) ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (prev_is_sticky || middle_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -1095,7 +1147,7 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPP */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); @@ -1110,6 +1162,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_EQ(mm.map_count, 1); + if (prev_is_sticky || middle_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted vma. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1124,10 +1178,10 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPPPPP */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, next_flags); vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; @@ -1140,6 +1194,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_EQ(mm.map_count, 1); + if (prev_is_sticky || middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted prev and next. 
*/ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1159,9 +1215,9 @@ static bool test_merge_existing(void) * PPPVVVVVNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, next_flags); vmg_set_range(&vmg, 0x4000, 0x5000, 4, vm_flags); vmg.prev = vma; @@ -1204,6 +1260,19 @@ static bool test_merge_existing(void) return true; } +static bool test_merge_existing(void) +{ + int i, j, k; + + /* Generate every possible permutation of sticky flags. */ + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + ASSERT_TRUE(__test_merge_existing(i, j, k)); + + return true; +} + static bool test_anon_vma_non_mergeable(void) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; From 89330ec89741d12970b513af94fd2c46997ae1db Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:50 +0000 Subject: [PATCH 189/321] tools/testing/selftests/mm: add MADV_COLLAPSE test case To ensure the retract_page_tables() logic functions correctly with the introduction of VM_MAYBE_GUARD, add a test to assert that madvise collapse fails when guard regions are established in the collapsed range in all cases. Unfortunately we cannot differentiate between e.g. CONFIG_READ_ONLY_THP_FOR_FS not being set vs. a file-backed VMA having collapse correctly disallowed, so in each instance we will get an assert pass here. We add an additional check to see whether guard regions are preserved across collapse in case of a bug causing the collapse to succeed, which will give us more data to debug with should this occur in future. Link: https://lkml.kernel.org/r/0748beeb864525b8ddfa51adad7128dd32eb3ac4.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Pedro Falcato Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/guard-regions.c | 65 ++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index 8dd81c0a4a5a..c549bcd6160b 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -2138,4 +2138,69 @@ TEST_F(guard_regions, pagemap_scan) ASSERT_EQ(munmap(ptr, 10 * page_size), 0); } +TEST_F(guard_regions, collapse) +{ + const unsigned long page_size = self->page_size; + const unsigned long size = 2 * HPAGE_SIZE; + const unsigned long num_pages = size / page_size; + char *ptr; + int i; + + /* Need file to be correct size for tests for non-anon. */ + if (variant->backing != ANON_BACKED) + ASSERT_EQ(ftruncate(self->fd, size), 0); + + /* + * We must close and re-open local-file backed as read-only for + * CONFIG_READ_ONLY_THP_FOR_FS to work. 
+ */ + if (variant->backing == LOCAL_FILE_BACKED) { + ASSERT_EQ(close(self->fd), 0); + + self->fd = open(self->path, O_RDONLY); + ASSERT_GE(self->fd, 0); + } + + ptr = mmap_(self, variant, NULL, size, PROT_READ, 0, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Prevent being faulted-in as huge. */ + ASSERT_EQ(madvise(ptr, size, MADV_NOHUGEPAGE), 0); + /* Fault in. */ + ASSERT_EQ(madvise(ptr, size, MADV_POPULATE_READ), 0); + + /* Install guard regions in ever other page. */ + for (i = 0; i < num_pages; i += 2) { + char *ptr_page = &ptr[i * page_size]; + + ASSERT_EQ(madvise(ptr_page, page_size, MADV_GUARD_INSTALL), 0); + /* Accesses should now fail. */ + ASSERT_FALSE(try_read_buf(ptr_page)); + } + + /* Allow huge page throughout region. */ + ASSERT_EQ(madvise(ptr, size, MADV_HUGEPAGE), 0); + + /* + * Now collapse the entire region. This should fail in all cases. + * + * The madvise() call will also fail if CONFIG_READ_ONLY_THP_FOR_FS is + * not set for the local file case, but we can't differentiate whether + * this occurred or if the collapse was rightly rejected. + */ + EXPECT_NE(madvise(ptr, size, MADV_COLLAPSE), 0); + + /* + * If we introduce a bug that causes the collapse to succeed, gather + * data on whether guard regions are at least preserved. The test will + * fail at this point in any case. + */ + for (i = 0; i < num_pages; i += 2) { + char *ptr_page = &ptr[i * page_size]; + + /* Accesses should still fail. */ + ASSERT_FALSE(try_read_buf(ptr_page)); + } +} + TEST_HARNESS_MAIN From c0ae966fac00bfd31490b6a9bf494d28865ff840 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:51 +0000 Subject: [PATCH 190/321] tools/testing/selftests/mm: add smaps visibility guard region test Assert that we observe guard regions appearing in /proc/$pid/smaps as expected, and when split/merge is performed too (with expected sticky behaviour). Also add handling for file systems which don't sanely handle mmap() VMA merging so we don't incorrectly encounter a test failure in this situation. Link: https://lkml.kernel.org/r/059e62b8c67e55e6d849878206a95ea1d7c1e885.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Pedro Falcato Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/guard-regions.c | 120 +++++++++++++++++++++ tools/testing/selftests/mm/vm_util.c | 5 + tools/testing/selftests/mm/vm_util.h | 1 + 3 files changed, 126 insertions(+) diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index c549bcd6160b..795bf3f39f44 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -94,6 +94,7 @@ static void *mmap_(FIXTURE_DATA(guard_regions) * self, case ANON_BACKED: flags |= MAP_PRIVATE | MAP_ANON; fd = -1; + offset = 0; break; case SHMEM_BACKED: case LOCAL_FILE_BACKED: @@ -260,6 +261,54 @@ static bool is_buf_eq(char *buf, size_t size, char chr) return true; } +/* + * Some file systems have issues with merging due to changing merge-sensitive + * parameters in the .mmap callback, and prior to .mmap_prepare being + * implemented everywhere this will now result in an unexpected failure to + * merge (e.g. 
- overlayfs). + * + * Perform a simple test to see if the local file system suffers from this, if + * it does then we can skip test logic that assumes local file system merging is + * sane. + */ +static bool local_fs_has_sane_mmap(FIXTURE_DATA(guard_regions) * self, + const FIXTURE_VARIANT(guard_regions) * variant) +{ + const unsigned long page_size = self->page_size; + char *ptr, *ptr2; + struct procmap_fd procmap; + + if (variant->backing != LOCAL_FILE_BACKED) + return true; + + /* Map 10 pages. */ + ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ | PROT_WRITE, 0, 0); + if (ptr == MAP_FAILED) + return false; + /* Unmap the middle. */ + munmap(&ptr[5 * page_size], page_size); + + /* Map again. */ + ptr2 = mmap_(self, variant, &ptr[5 * page_size], page_size, PROT_READ | PROT_WRITE, + MAP_FIXED, 5 * page_size); + + if (ptr2 == MAP_FAILED) + return false; + + /* Now make sure they all merged. */ + if (open_self_procmap(&procmap) != 0) + return false; + if (!find_vma_procmap(&procmap, ptr)) + return false; + if (procmap.query.vma_start != (unsigned long)ptr) + return false; + if (procmap.query.vma_end != (unsigned long)ptr + 10 * page_size) + return false; + close_procmap(&procmap); + + return true; +} + FIXTURE_SETUP(guard_regions) { self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); @@ -2203,4 +2252,75 @@ TEST_F(guard_regions, collapse) } } +TEST_F(guard_regions, smaps) +{ + const unsigned long page_size = self->page_size; + struct procmap_fd procmap; + char *ptr, *ptr2; + int i; + + /* Map a region. */ + ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ | PROT_WRITE, 0, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* We shouldn't yet see a guard flag. */ + ASSERT_FALSE(check_vmflag_guard(ptr)); + + /* Install a single guard region. */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0); + + /* Now we should see a guard flag. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); + + /* + * Removing the guard region should not change things because we simply + * cannot accurately track whether a given VMA has had all of its guard + * regions removed. + */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_REMOVE), 0); + ASSERT_TRUE(check_vmflag_guard(ptr)); + + /* Install guard regions throughout. */ + for (i = 0; i < 10; i++) { + ASSERT_EQ(madvise(&ptr[i * page_size], page_size, MADV_GUARD_INSTALL), 0); + /* We should always see the guard region flag. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); + } + + /* Split into two VMAs. */ + ASSERT_EQ(munmap(&ptr[4 * page_size], page_size), 0); + + /* Both VMAs should have the guard flag set. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); + ASSERT_TRUE(check_vmflag_guard(&ptr[5 * page_size])); + + /* + * If the local file system is unable to merge VMAs due to having + * unusual characteristics, there is no point in asserting merge + * behaviour. + */ + if (!local_fs_has_sane_mmap(self, variant)) { + TH_LOG("local filesystem does not support sane merging skipping merge test"); + return; + } + + /* Map a fresh VMA between the two split VMAs. */ + ptr2 = mmap_(self, variant, &ptr[4 * page_size], page_size, + PROT_READ | PROT_WRITE, MAP_FIXED, 4 * page_size); + ASSERT_NE(ptr2, MAP_FAILED); + + /* + * Check the procmap to ensure that this VMA merged with the adjacent + * two. The guard region flag is 'sticky' so should not preclude + * merging. 
+ */ + ASSERT_EQ(open_self_procmap(&procmap), 0); + ASSERT_TRUE(find_vma_procmap(&procmap, ptr)); + ASSERT_EQ(procmap.query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap.query.vma_end, (unsigned long)ptr + 10 * page_size); + ASSERT_EQ(close_procmap(&procmap), 0); + /* And, of course, this VMA should have the guard flag set. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index e33cda301dad..605cb58ea5c3 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -449,6 +449,11 @@ bool check_vmflag_pfnmap(void *addr) return check_vmflag(addr, "pf"); } +bool check_vmflag_guard(void *addr) +{ + return check_vmflag(addr, "gu"); +} + bool softdirty_supported(void) { char *addr; diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 26c30fdc0241..a8abdf414d46 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -98,6 +98,7 @@ int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, unsigned long get_free_hugepages(void); bool check_vmflag_io(void *addr); bool check_vmflag_pfnmap(void *addr); +bool check_vmflag_guard(void *addr); int open_procmap(pid_t pid, struct procmap_fd *procmap_out); int query_procmap(struct procmap_fd *procmap); bool find_vma_procmap(struct procmap_fd *procmap, void *address); From 05be0287955970b043a0742e85b6c285dea4f286 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 7 Nov 2025 17:55:36 +0800 Subject: [PATCH 191/321] mm: remove unnecessary __GFP_HIGHMEM in __p*d_alloc_one_*() __{pgd,p4d,pud,pmd,pte}_alloc_one_*() always allocate pages with GFP flag GFP_PGTABLE_KERNEL/GFP_PGTABLE_USER. These two macros are defined as follows: #define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO) #define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) There is no __GFP_HIGHMEM in them, so we needn't to clear __GFP_HIGHMEM explicitly. 
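As a sanity check, the redundancy is easy to demonstrate in isolation. The bit values below are stand-ins chosen for illustration, not the real gfp_types.h definitions; the point is simply that clearing a bit which is not part of the composite leaves it unchanged.

  #include <assert.h>

  /* Illustrative stand-in values only. */
  #define __GFP_ZERO	0x01UL
  #define __GFP_ACCOUNT	0x02UL
  #define __GFP_HIGHMEM	0x04UL
  #define GFP_KERNEL	0x08UL

  #define GFP_PGTABLE_KERNEL	(GFP_KERNEL | __GFP_ZERO)
  #define GFP_PGTABLE_USER	(GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)

  int main(void)
  {
  	/* Neither composite contains __GFP_HIGHMEM, so masking it out is a no-op. */
  	assert((GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM) == GFP_PGTABLE_KERNEL);
  	assert((GFP_PGTABLE_USER & ~__GFP_HIGHMEM) == GFP_PGTABLE_USER);
  	return 0;
  }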
Link: https://lkml.kernel.org/r/20251109021817.346181-1-chenhuacai@loongson.cn Link: https://lkml.kernel.org/r/20251107095536.3101371-1-chenhuacai@loongson.cn Signed-off-by: Huacai Chen Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Anshuman Khandual Reviewed-by: Kevin Brodsky Cc: Arnd Bergmann Cc: Jan Kara Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index b9d2a7c79b93..57137d3ac159 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -18,8 +18,7 @@ */ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) { - struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & - ~__GFP_HIGHMEM, 0); + struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL, 0); if (!ptdesc) return NULL; @@ -178,7 +177,6 @@ static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) @@ -236,7 +234,6 @@ static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) @@ -284,7 +281,6 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, order); if (!ptdesc) From bc8e51c05ad50a5a0b02114d3cc94d151a332595 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 7 Nov 2025 15:40:41 -0800 Subject: [PATCH 192/321] mm: memcg: dump memcg protection info on oom or alloc failures Currently kernel dumps memory state on oom and allocation failures. One of the question usually raised on those dumps is why the kernel has not reclaimed the reclaimable memory instead of triggering oom. One potential reason is the usage of memory protection provided by memcg. So, let's also dump the memory protected by the memcg in such reports to ease the debugging. 
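For reference, the new output takes the following shape, matching the pr_warn() format added below (the figures here are purely illustrative):

  Memory cgroup min protection 204800kB -- low protection 1048576kB

reflecting the aggregate min and low protection configured below the memcg under consideration, or below the root memcg for global allocation failures.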
Link: https://lkml.kernel.org/r/20251107234041.3632644-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 +++++ mm/memcontrol.c | 13 +++++++++++++ mm/oom_kill.c | 1 + mm/page_alloc.c | 1 + 4 files changed, 20 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8c0f15e5978f..966f7c1a0128 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1764,6 +1764,7 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); +void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg); #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1830,6 +1831,10 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) { return true; } + +static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) +{ +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 025da46d9959..bfc986da3289 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5635,3 +5635,16 @@ bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) { return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true; } + +void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + if (!memcg) + memcg = root_mem_cgroup; + + pr_warn("Memory cgroup min protection %lukB -- low protection %lukB", + K(atomic_long_read(&memcg->memory.children_min_usage)*PAGE_SIZE), + K(atomic_long_read(&memcg->memory.children_low_usage)*PAGE_SIZE)); +} diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c145b0feecc1..5eb11fbba704 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -472,6 +472,7 @@ static void dump_header(struct oom_control *oc) if (should_dump_unreclaim_slab()) dump_unreclaimable_slab(); } + mem_cgroup_show_protected_memory(oc->memcg); if (sysctl_oom_dump_tasks) dump_tasks(oc); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e4efda1158b2..26be5734253f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3977,6 +3977,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) filter &= ~SHOW_MEM_FILTER_NODES; __show_mem(filter, nodemask, gfp_zone(gfp_mask)); + mem_cgroup_show_protected_memory(NULL); } void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) From f4af67ff4fd8c4bcecb0d889652de93a75122f96 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 17:22:57 +0000 Subject: [PATCH 193/321] mm: rename walk_page_range_mm() Patch series "mm: perform guard region install/remove under VMA lock", v2. There is no reason why can't perform guard region operations under the VMA lock, as long we take proper precautions to ensure that we do so in a safe manner. This is fine, as VMA lock acquisition is always best-effort, so if we are unable to do so, we can simply fall back to using the mmap read lock. Doing so will reduce mmap lock contention for callers performing guard region operations and help establish a precedent of trying to use the VMA lock where possible. As part of this change we perform a trivial rename of page walk functions which bypass safety checks (i.e. 
whether or not mm_walk_ops->install_pte is specified) in order that we can keep naming consistent with the mm walk. This is because we need to expose a VMA-specific walk that still allows us to install PTE entries. This patch (of 2): Make it clear we're referencing an unsafe variant of this function explicitly. This is laying the foundation for exposing more such functions and maintaining a consistent naming scheme. As a part of this change, rename check_ops_valid() to check_ops_safe() for consistency. Link: https://lkml.kernel.org/r/cover.1762795245.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/c684d91464a438d6e31172c9450416a373f10649.1762795245.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Davidlohr Bueso Reviewed-by: Vlastimil Babka Acked-by: SeongJae Park Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- mm/madvise.c | 4 ++-- mm/pagewalk.c | 22 +++++++++++----------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 473b29ddf85d..ba471b8f36fc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1649,7 +1649,7 @@ static inline void accept_page(struct page *page) #endif /* CONFIG_UNACCEPTED_MEMORY */ /* pagewalk.c */ -int walk_page_range_mm(struct mm_struct *mm, unsigned long start, +int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); int walk_page_range_debug(struct mm_struct *mm, unsigned long start, diff --git a/mm/madvise.c b/mm/madvise.c index 5dbe40be7c65..7ed5bedb8f8e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1173,8 +1173,8 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ - err = walk_page_range_mm(vma->vm_mm, range->start, range->end, - &guard_install_walk_ops, &nr_pages); + err = walk_page_range_mm_unsafe(vma->vm_mm, range->start, + range->end, &guard_install_walk_ops, &nr_pages); if (err < 0) return err; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 9f91cf85a5be..30ea959bf38c 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -452,7 +452,7 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma, * We usually restrict the ability to install PTEs, but this functionality is * available to internal memory management code and provided in mm/internal.h. */ -int walk_page_range_mm(struct mm_struct *mm, unsigned long start, +int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { @@ -518,10 +518,10 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, * This check is performed on all functions which are parameterised by walk * operations and exposed in include/linux/pagewalk.h. * - * Internal memory management code can use the walk_page_range_mm() function to - * be able to use all page walking operations. + * Internal memory management code can use *_unsafe() functions to be able to + * use all page walking operations. 
*/ -static bool check_ops_valid(const struct mm_walk_ops *ops) +static bool check_ops_safe(const struct mm_walk_ops *ops) { /* * The installation of PTEs is solely under the control of memory @@ -579,10 +579,10 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { - if (!check_ops_valid(ops)) + if (!check_ops_safe(ops)) return -EINVAL; - return walk_page_range_mm(mm, start, end, ops, private); + return walk_page_range_mm_unsafe(mm, start, end, ops, private); } /** @@ -639,7 +639,7 @@ int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end if (start >= end) return -EINVAL; - if (!check_ops_valid(ops)) + if (!check_ops_safe(ops)) return -EINVAL; return walk_pgd_range(start, end, &walk); @@ -678,7 +678,7 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start, pgd, private); if (start >= end || !walk.mm) return -EINVAL; - if (!check_ops_valid(ops)) + if (!check_ops_safe(ops)) return -EINVAL; /* @@ -709,7 +709,7 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, return -EINVAL; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; - if (!check_ops_valid(ops)) + if (!check_ops_safe(ops)) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); @@ -729,7 +729,7 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, if (!walk.mm) return -EINVAL; - if (!check_ops_valid(ops)) + if (!check_ops_safe(ops)) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); @@ -780,7 +780,7 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, unsigned long start_addr, end_addr; int err = 0; - if (!check_ops_valid(ops)) + if (!check_ops_safe(ops)) return -EINVAL; lockdep_assert_held(&mapping->i_mmap_rwsem); From 2ab7f1bbafc927c69374d45578011c814c26ae2f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 17:22:58 +0000 Subject: [PATCH 194/321] mm/madvise: allow guard page install/remove under VMA lock We only need to keep the page table stable so we can perform this operation under the VMA lock. PTE installation is stabilised via the PTE lock. One caveat is that, if we prepare vma->anon_vma we must hold the mmap read lock. We can account for this by adapting the VMA locking logic to explicitly check for this case and prevent a VMA lock from being acquired should it be the case. This check is safe, as while we might be raced on anon_vma installation, this would simply make the check conservative, there's no way for us to see an anon_vma and then for it to be cleared, as doing so requires the mmap/VMA write lock. We abstract the VMA lock validity logic to is_vma_lock_sufficient() for this purpose, and add prepares_anon_vma() to abstract the anon_vma logic. In order to do this we need to have a way of installing page tables explicitly for an identified VMA, so we export walk_page_range_vma() in an unsafe variant - walk_page_range_vma_unsafe() and use this should the VMA read lock be taken. We additionally update the comments in madvise_guard_install() to more accurately reflect the cases in which the logic may be reattempted, specifically THP huge pages being present. 
Link: https://lkml.kernel.org/r/cca1edbd99cd1386ad20556d08ebdb356c45ef91.1762795245.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Davidlohr Bueso Reviewed-by: Vlastimil Babka Acked-by: SeongJae Park Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/internal.h | 3 ++ mm/madvise.c | 110 ++++++++++++++++++++++++++++++++++++-------------- mm/pagewalk.c | 17 +++++--- 3 files changed, 94 insertions(+), 36 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index ba471b8f36fc..2bad3971813b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1652,6 +1652,9 @@ static inline void accept_page(struct page *page) int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); int walk_page_range_debug(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); diff --git a/mm/madvise.c b/mm/madvise.c index 7ed5bedb8f8e..2a165e9beb5b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1122,18 +1122,17 @@ static int guard_install_set_pte(unsigned long addr, unsigned long next, return 0; } -static const struct mm_walk_ops guard_install_walk_ops = { - .pud_entry = guard_install_pud_entry, - .pmd_entry = guard_install_pmd_entry, - .pte_entry = guard_install_pte_entry, - .install_pte = guard_install_set_pte, - .walk_lock = PGWALK_RDLOCK, -}; - static long madvise_guard_install(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct madvise_behavior_range *range = &madv_behavior->range; + struct mm_walk_ops walk_ops = { + .pud_entry = guard_install_pud_entry, + .pmd_entry = guard_install_pmd_entry, + .pte_entry = guard_install_pte_entry, + .install_pte = guard_install_set_pte, + .walk_lock = get_walk_lock(madv_behavior->lock_mode), + }; long err; int i; @@ -1150,8 +1149,14 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) /* * If anonymous and we are establishing page tables the VMA ought to * have an anon_vma associated with it. + * + * We will hold an mmap read lock if this is necessary, this is checked + * as part of the VMA lock logic. */ if (vma_is_anonymous(vma)) { + VM_WARN_ON_ONCE(!vma->anon_vma && + madv_behavior->lock_mode != MADVISE_MMAP_READ_LOCK); + err = anon_vma_prepare(vma); if (err) return err; @@ -1159,12 +1164,14 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) /* * Optimistically try to install the guard marker pages first. If any - * non-guard pages are encountered, give up and zap the range before - * trying again. + * non-guard pages or THP huge pages are encountered, give up and zap + * the range before trying again. * * We try a few times before giving up and releasing back to userland to - * loop around, releasing locks in the process to avoid contention. This - * would only happen if there was a great many racing page faults. + * loop around, releasing locks in the process to avoid contention. + * + * This would only happen due to races with e.g. page faults or + * khugepaged. * * In most cases we should simply install the guard markers immediately * with no zap or looping. 
@@ -1173,8 +1180,13 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ - err = walk_page_range_mm_unsafe(vma->vm_mm, range->start, - range->end, &guard_install_walk_ops, &nr_pages); + if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) + err = walk_page_range_vma_unsafe(madv_behavior->vma, + range->start, range->end, &walk_ops, + &nr_pages); + else + err = walk_page_range_mm_unsafe(vma->vm_mm, range->start, + range->end, &walk_ops, &nr_pages); if (err < 0) return err; @@ -1195,8 +1207,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) } /* - * We were unable to install the guard pages due to being raced by page - * faults. This should not happen ordinarily. We return to userspace and + * We were unable to install the guard pages, return to userspace and * immediately retry, relieving lock contention. */ return restart_syscall(); @@ -1240,17 +1251,16 @@ static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, return 0; } -static const struct mm_walk_ops guard_remove_walk_ops = { - .pud_entry = guard_remove_pud_entry, - .pmd_entry = guard_remove_pmd_entry, - .pte_entry = guard_remove_pte_entry, - .walk_lock = PGWALK_RDLOCK, -}; - static long madvise_guard_remove(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct madvise_behavior_range *range = &madv_behavior->range; + struct mm_walk_ops wallk_ops = { + .pud_entry = guard_remove_pud_entry, + .pmd_entry = guard_remove_pmd_entry, + .pte_entry = guard_remove_pte_entry, + .walk_lock = get_walk_lock(madv_behavior->lock_mode), + }; /* * We're ok with removing guards in mlock()'d ranges, as this is a @@ -1260,7 +1270,7 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior) return -EINVAL; return walk_page_range_vma(vma, range->start, range->end, - &guard_remove_walk_ops, NULL); + &wallk_ops, NULL); } #ifdef CONFIG_64BIT @@ -1573,6 +1583,47 @@ static bool process_madvise_remote_valid(int behavior) } } +/* Does this operation invoke anon_vma_prepare()? */ +static bool prepares_anon_vma(int behavior) +{ + switch (behavior) { + case MADV_GUARD_INSTALL: + return true; + default: + return false; + } +} + +/* + * We have acquired a VMA read lock, is the VMA valid to be madvise'd under VMA + * read lock only now we have a VMA to examine? + */ +static bool is_vma_lock_sufficient(struct vm_area_struct *vma, + struct madvise_behavior *madv_behavior) +{ + /* Must span only a single VMA.*/ + if (madv_behavior->range.end > vma->vm_end) + return false; + /* Remote processes unsupported. */ + if (current->mm != vma->vm_mm) + return false; + /* Userfaultfd unsupported. */ + if (userfaultfd_armed(vma)) + return false; + /* + * anon_vma_prepare() explicitly requires an mmap lock for + * serialisation, so we cannot use a VMA lock in this case. + * + * Note we might race with anon_vma being set, however this makes this + * check overly paranoid which is safe. + */ + if (vma_is_anonymous(vma) && + prepares_anon_vma(madv_behavior->behavior) && !vma->anon_vma) + return false; + + return true; +} + /* * Try to acquire a VMA read lock if possible. * @@ -1594,15 +1645,12 @@ static bool try_vma_read_lock(struct madvise_behavior *madv_behavior) vma = lock_vma_under_rcu(mm, madv_behavior->range.start); if (!vma) goto take_mmap_read_lock; - /* - * Must span only a single VMA; uffd and remote processes are - * unsupported. 
- */ - if (madv_behavior->range.end > vma->vm_end || current->mm != mm || - userfaultfd_armed(vma)) { + + if (!is_vma_lock_sufficient(vma, madv_behavior)) { vma_end_read(vma); goto take_mmap_read_lock; } + madv_behavior->vma = vma; return true; @@ -1715,9 +1763,9 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: + return MADVISE_MMAP_READ_LOCK; case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: - return MADVISE_MMAP_READ_LOCK; case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 30ea959bf38c..8a29b7237bc6 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -694,9 +694,8 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start, return walk_pgd_range(start, end, &walk); } -int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, - unsigned long end, const struct mm_walk_ops *ops, - void *private) +int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, @@ -709,14 +708,22 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, return -EINVAL; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; - if (!check_ops_safe(ops)) - return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(start, end, &walk); } +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) +{ + if (!check_ops_safe(ops)) + return -EINVAL; + + return walk_page_range_vma_unsafe(vma, start, end, ops, private); +} + int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) { From 3a47e8771c43bc9775f667b8de35c873975aa42e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 10 Nov 2025 16:44:37 +0800 Subject: [PATCH 195/321] mm: vmstat: correct the comment above preempt_disable_nested() The comment explaining why these parts use preempt_disable_nested() is in __mod_zone_page_state(), not in __mod_node_page_state(), so we should see __mod_zone_page_state(). Just correct it. 
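[ Illustration only, not part of the diff below: the shape shared by the
  updaters touched here, for readers following the reference. The reasoning
  for the nested preempt disabling (on PREEMPT_RT it must be made explicit
  around the per-CPU read-modify-write) lives once, in
  __mod_zone_page_state(). Sketch only, simplified from mm/vmstat.c. ]

    s8 __percpu *p = pcp->vm_stat_diff + item;      /* per-CPU diff */
    s8 v;

    /* See __mod_zone_page_state() */
    preempt_disable_nested();

    v = __this_cpu_inc_return(*p);
    /* fold v into the global zone/node counter once the per-CPU
     * threshold is crossed */

    preempt_enable_nested();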
Link: https://lkml.kernel.org/r/20251110084437.46701-1-qi.zheng@linux.dev Signed-off-by: Qi Zheng Acked-by: Vlastimil Babka Acked-by: Harry Yoo Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmstat.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index b53b07aa29e8..65de88cdf40e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -392,7 +392,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, delta >>= PAGE_SHIFT; } - /* See __mod_node_page_state */ + /* See __mod_zone_page_state() */ preempt_disable_nested(); x = delta + __this_cpu_read(*p); @@ -438,7 +438,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; - /* See __mod_node_page_state */ + /* See __mod_zone_page_state() */ preempt_disable_nested(); v = __this_cpu_inc_return(*p); @@ -461,7 +461,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); - /* See __mod_node_page_state */ + /* See __mod_zone_page_state() */ preempt_disable_nested(); v = __this_cpu_inc_return(*p); @@ -494,7 +494,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; - /* See __mod_node_page_state */ + /* See __mod_zone_page_state() */ preempt_disable_nested(); v = __this_cpu_dec_return(*p); @@ -517,7 +517,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); - /* See __mod_node_page_state */ + /* See __mod_zone_page_state() */ preempt_disable_nested(); v = __this_cpu_dec_return(*p); From 2197bb60f89077603cc580ff752c5cf6388c1099 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 Nov 2025 20:32:01 +0000 Subject: [PATCH 196/321] mm: add vma_start_write_killable() Patch series "vma_start_write_killable"", v2. When we added the VMA lock, we made a major oversight in not adding a killable variant. That can run us into trouble where a thread takes the VMA lock for read (eg handling a page fault) and then goes out to lunch for an hour (eg doing reclaim). Another thread tries to modify the VMA, taking the mmap_lock for write, then attempts to lock the VMA for write. That blocks on the first thread, and ensures that every other page fault now tries to take the mmap_lock for read. Because everything's in an uninterruptible sleep, we can't kill the task, which makes me angry. This patchset just adds vma_start_write_killable() and converts one caller to use it. Most users are somewhat tricky to convert, so expect follow-up individual patches per call-site which need careful analysis to make sure we've done proper cleanup. This patch (of 2): The vma can be held read-locked for a substantial period of time, eg if memory allocation needs to go into reclaim. It's useful to be able to send fatal signals to threads which are waiting for the write lock. Link: https://lkml.kernel.org/r/20251110203204.1454057-1-willy@infradead.org Link: https://lkml.kernel.org/r/20251110203204.1454057-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. 
Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Cc: Chris Li Cc: Jann Horn Cc: Matthew Wilcox (Oracle) Cc: Shakeel Butt Signed-off-by: Andrew Morton --- Documentation/mm/process_addrs.rst | 9 +++++++- include/linux/mmap_lock.h | 30 ++++++++++++++++++++++++-- mm/mmap_lock.c | 34 ++++++++++++++++++++++-------- tools/testing/vma/vma_internal.h | 8 +++++++ 4 files changed, 69 insertions(+), 12 deletions(-) diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst index be49e2a269e4..7f2f3e87071d 100644 --- a/Documentation/mm/process_addrs.rst +++ b/Documentation/mm/process_addrs.rst @@ -48,7 +48,8 @@ Terminology * **VMA locks** - The VMA lock is at VMA granularity (of course) which behaves as a read/write semaphore in practice. A VMA read lock is obtained via :c:func:`!lock_vma_under_rcu` (and unlocked via :c:func:`!vma_end_read`) and a - write lock via :c:func:`!vma_start_write` (all VMA write locks are unlocked + write lock via vma_start_write() or vma_start_write_killable() + (all VMA write locks are unlocked automatically when the mmap write lock is released). To take a VMA write lock you **must** have already acquired an :c:func:`!mmap_write_lock`. * **rmap locks** - When trying to access VMAs through the reverse mapping via a @@ -907,3 +908,9 @@ Stack expansion Stack expansion throws up additional complexities in that we cannot permit there to be racing page faults, as a result we invoke :c:func:`!vma_start_write` to prevent this in :c:func:`!expand_downwards` or :c:func:`!expand_upwards`. + +------------------------ +Functions and structures +------------------------ + +.. kernel-doc:: include/linux/mmap_lock.h diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index e05da70dc0cb..d53f72dba7fe 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -195,7 +195,8 @@ static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned in return (vma->vm_lock_seq == *mm_lock_seq); } -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, + int state); /* * Begin writing to a VMA. @@ -209,7 +210,30 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - __vma_start_write(vma, mm_lock_seq); + __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE); +} + +/** + * vma_start_write_killable - Begin writing to a VMA. + * @vma: The VMA we are going to modify. + * + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. + * + * Context: May sleep while waiting for readers to drop the vma read lock. + * Caller must already hold the mmap_lock for write. + * + * Return: 0 for a successful acquisition. -EINTR if a fatal signal was + * received. 
+ */ +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + unsigned int mm_lock_seq; + + if (__is_vma_write_locked(vma, &mm_lock_seq)) + return 0; + return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -283,6 +307,8 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) { return 0; } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_assert_attached(struct vm_area_struct *vma) {} diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 0a0db5849b8e..39f341caf32c 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -45,8 +45,15 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released); #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK -static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) +/* + * Return value: 0 if vma detached, + * 1 if vma attached with no readers, + * -EINTR if signal received, + */ +static inline int __vma_enter_locked(struct vm_area_struct *vma, + bool detaching, int state) { + int err; unsigned int tgt_refcnt = VMA_LOCK_OFFSET; /* Additional refcnt if the vma is attached. */ @@ -58,15 +65,19 @@ static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). */ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) - return false; + return 0; rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); - rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, + err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, - TASK_UNINTERRUPTIBLE); + state); + if (err) { + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + return err; + } lock_acquired(&vma->vmlock_dep_map, _RET_IP_); - return true; + return 1; } static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) @@ -75,16 +86,19 @@ static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) rwsem_release(&vma->vmlock_dep_map, _RET_IP_); } -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, + int state) { - bool locked; + int locked; /* * __vma_enter_locked() returns false immediately if the vma is not * attached, otherwise it waits until refcnt is indicating that vma * is attached with no readers. */ - locked = __vma_enter_locked(vma, false); + locked = __vma_enter_locked(vma, false, state); + if (locked < 0) + return locked; /* * We should use WRITE_ONCE() here because we can have concurrent reads @@ -100,6 +114,8 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) __vma_exit_locked(vma, &detached); WARN_ON_ONCE(detached); /* vma should remain attached */ } + + return 0; } EXPORT_SYMBOL_GPL(__vma_start_write); @@ -118,7 +134,7 @@ void vma_mark_detached(struct vm_area_struct *vma) */ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { /* Wait until vma is detached with no readers. 
*/ - if (__vma_enter_locked(vma, true)) { + if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) { bool detached; __vma_exit_locked(vma, &detached); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 233819a9e7ee..73a899ba2686 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -952,6 +952,14 @@ static inline void vma_start_write(struct vm_area_struct *vma) vma->vm_lock_seq++; } +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + /* Used to indicate to tests that a write operation has begun. */ + vma->vm_lock_seq++; + return 0; +} + static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, From 7370f8e1b3a8b908b2a4a9d5d02970697e9aba62 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 Nov 2025 20:32:02 +0000 Subject: [PATCH 197/321] mm: use vma_start_write_killable() in dup_mmap() Allow waiting for the VMA write lock to be interrupted by fatal signals. The explicit check for fatal_signal_pending() can be removed as it is checked during vma_start_write_killable(). Improves the latency of killing the task as we do not wait for the reader to finish before checking for signals. Link: https://lkml.kernel.org/r/20251110203204.1454057-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Cc: Chris Li Cc: Jann Horn Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/mmap.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 4f51ca644903..dc51680824ec 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1747,7 +1747,9 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) for_each_vma(vmi, mpnt) { struct file *file; - vma_start_write(mpnt); + retval = vma_start_write_killable(mpnt); + if (retval < 0) + goto loop_out; if (mpnt->vm_flags & VM_DONTCOPY) { retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, mpnt->vm_end, GFP_KERNEL); @@ -1758,14 +1760,6 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) continue; } charge = 0; - /* - * Don't duplicate many vmas if we've been oom-killed (for - * example) - */ - if (fatal_signal_pending(current)) { - retval = -EINTR; - goto loop_out; - } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); From 37104286f9390a3da330c299b01cabfb4c98af7c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 11 Nov 2025 10:44:00 -0800 Subject: [PATCH 198/321] mm/damon/tests/core-kunit: remove dynamic allocs on damos_test_commit_filter() Patch series "mm/damon/tests: add more tests for online parameters commit". A DAMON feature called parameters "commit" allows DAMON API callers and ABI users to update nearly every DAMON parameter while DAMON is running. This is being used for flexible DAMON use cases such as taking a snapshot of the monitoring results with minimum overhead, or adjusting access-aware system operations (DAMOS) for user-space driven auto-tuning or investigations. Compared to the usefulness of the feature and size of the implementation, the test coverage is pretty small. Only the filter commit part has a single test case, namely damos_test_commit_filter(). Actually, we found and fixed a few bugs of the feature in the past. The single existing test was also added to avoid reintroduction of a found bug. Add more unit tests for the feature. 
First four patches (1-4) refactor and extend the existing test for DAMOS filter commit for multiple test cases. Next three patches (5-7) add tests for DAMOS quota commit. Next two patches (8 and 9) refactor damos_commit_dests() for ease of code reading and test writing, and implement a new unit test of the function that is being refactored in a test-friendly way. Final two patches (10 and 11) further add new unit tests for damos_commit() and damon_commit_target_regions(). This patch (of 11): damos_test_commit_filter() is dynamically allocating test-purpose DAMOS filters. Allocation failure checks are making the code longer, complicated, and difficult to extend for more test cases. Refactor the code to remove the dynamic allocation. Link: https://lkml.kernel.org/r/20251111184415.141757-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251111184415.141757-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 96a4cd489b39..ae97886137dc 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -499,23 +499,20 @@ static void damos_test_new_filter(struct kunit *test) static void damos_test_commit_filter(struct kunit *test) { - struct damos_filter *src_filter, *dst_filter; + struct damos_filter src_filter = { + .type = DAMOS_FILTER_TYPE_ANON, + .matching = true, + .allow = true}; + struct damos_filter dst_filter = { + .type = DAMOS_FILTER_TYPE_ACTIVE, + .matching = false, + .allow = false, + }; - src_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true); - if (!src_filter) - kunit_skip(test, "src filter alloc fail"); - dst_filter = damos_new_filter(DAMOS_FILTER_TYPE_ACTIVE, false, false); - if (!dst_filter) { - damos_destroy_filter(src_filter); - kunit_skip(test, "dst filter alloc fail"); - } - damos_commit_filter(dst_filter, src_filter); - KUNIT_EXPECT_EQ(test, dst_filter->type, src_filter->type); - KUNIT_EXPECT_EQ(test, dst_filter->matching, src_filter->matching); - KUNIT_EXPECT_EQ(test, dst_filter->allow, src_filter->allow); - - damos_destroy_filter(src_filter); - damos_destroy_filter(dst_filter); + damos_commit_filter(&dst_filter, &src_filter); + KUNIT_EXPECT_EQ(test, dst_filter.type, src_filter.type); + KUNIT_EXPECT_EQ(test, dst_filter.matching, src_filter.matching); + KUNIT_EXPECT_EQ(test, dst_filter.allow, src_filter.allow); } static void damos_test_filter_out(struct kunit *test) From 1968236f7517ffde240433f2cd84d902ecd11499 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 11 Nov 2025 10:44:01 -0800 Subject: [PATCH 199/321] mm/damon/tests/core-kunit: split out damos_test_commit_filter() core logic damos_test_commit_filter() is written for only a single test case. Split out the core logic of damos_test_commit_filter() as a general one so that it can be reused for multiple test cases. 
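[ Illustration only, not part of the diff below: what the split enables; the
  real multi-case callers are added two patches later in this series. ]

    /* With the *_for() helper, each further combination is one call: */
    damos_test_commit_filter_for(test, &dst,
            &(struct damos_filter){
                    .type = DAMOS_FILTER_TYPE_MEMCG,
                    .matching = false,
                    .allow = false,
                    .memcg_id = 123,
            });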
Link: https://lkml.kernel.org/r/20251111184415.141757-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index ae97886137dc..31f90cfcabf3 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -497,6 +497,15 @@ static void damos_test_new_filter(struct kunit *test) damos_destroy_filter(filter); } +static void damos_test_commit_filter_for(struct kunit *test, + struct damos_filter *dst, struct damos_filter *src) +{ + damos_commit_filter(dst, src); + KUNIT_EXPECT_EQ(test, dst->type, src->type); + KUNIT_EXPECT_EQ(test, dst->matching, src->matching); + KUNIT_EXPECT_EQ(test, dst->allow, src->allow); +} + static void damos_test_commit_filter(struct kunit *test) { struct damos_filter src_filter = { @@ -509,10 +518,7 @@ static void damos_test_commit_filter(struct kunit *test) .allow = false, }; - damos_commit_filter(&dst_filter, &src_filter); - KUNIT_EXPECT_EQ(test, dst_filter.type, src_filter.type); - KUNIT_EXPECT_EQ(test, dst_filter.matching, src_filter.matching); - KUNIT_EXPECT_EQ(test, dst_filter.allow, src_filter.allow); + damos_test_commit_filter_for(test, &dst_filter, &src_filter); } static void damos_test_filter_out(struct kunit *test) From 1b43b7950d5eb3bf6fcc9b206ef7e69c21228ce7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 11 Nov 2025 10:44:02 -0800 Subject: [PATCH 200/321] mm/damon/tests/core-kunit: extend damos_test_commit_filter_for() for union fields damos_commit_filter() also updates union fields of 'struct damos_filter'. Extend damos_test_commit_filter_for() to cover the expectations of the union fields. Link: https://lkml.kernel.org/r/20251111184415.141757-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 31f90cfcabf3..5052d8db9657 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -504,6 +504,26 @@ static void damos_test_commit_filter_for(struct kunit *test, KUNIT_EXPECT_EQ(test, dst->type, src->type); KUNIT_EXPECT_EQ(test, dst->matching, src->matching); KUNIT_EXPECT_EQ(test, dst->allow, src->allow); + switch (src->type) { + case DAMOS_FILTER_TYPE_MEMCG: + KUNIT_EXPECT_EQ(test, dst->memcg_id, src->memcg_id); + break; + case DAMOS_FILTER_TYPE_ADDR: + KUNIT_EXPECT_EQ(test, dst->addr_range.start, + src->addr_range.start); + KUNIT_EXPECT_EQ(test, dst->addr_range.end, + src->addr_range.end); + break; + case DAMOS_FILTER_TYPE_TARGET: + KUNIT_EXPECT_EQ(test, dst->target_idx, src->target_idx); + break; + case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: + KUNIT_EXPECT_EQ(test, dst->sz_range.min, src->sz_range.min); + KUNIT_EXPECT_EQ(test, dst->sz_range.max, src->sz_range.max); + break; + default: + break; + } } static void damos_test_commit_filter(struct kunit *test) From 3caf767e21652348235fcfa84858f32a8db60071 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 11 Nov 2025 10:44:03 -0800 Subject: [PATCH 201/321] mm/damon/tests/core-kunit: add test cases to damos_test_commit_filter() damos_test_commit_filter() is covering only a single test case. Extend it to cover multiple combinations of inputs. 
Link: https://lkml.kernel.org/r/20251111184415.141757-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 53 ++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 5052d8db9657..2770972b0a39 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -528,17 +528,58 @@ static void damos_test_commit_filter_for(struct kunit *test, static void damos_test_commit_filter(struct kunit *test) { - struct damos_filter src_filter = { - .type = DAMOS_FILTER_TYPE_ANON, - .matching = true, - .allow = true}; - struct damos_filter dst_filter = { + struct damos_filter dst = { .type = DAMOS_FILTER_TYPE_ACTIVE, .matching = false, .allow = false, }; - damos_test_commit_filter_for(test, &dst_filter, &src_filter); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_ANON, + .matching = true, + .allow = true, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_MEMCG, + .matching = false, + .allow = false, + .memcg_id = 123, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_YOUNG, + .matching = true, + .allow = true, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, + .matching = false, + .allow = false, + .sz_range = {.min = 234, .max = 345}, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_UNMAPPED, + .matching = true, + .allow = true, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_ADDR, + .matching = false, + .allow = false, + .addr_range = {.start = 456, .end = 567}, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_TARGET, + .matching = true, + .allow = true, + .target_idx = 6, + }); } static void damos_test_filter_out(struct kunit *test) From 99f89debafc572fb18872ebecb9e35fc917e5ab2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 11 Nov 2025 10:44:04 -0800 Subject: [PATCH 202/321] mm/damon/tests/core-kunit: add damos_commit_quota_goal() test Add a new unit test for damos_commit_quota_goal(). 
Link: https://lkml.kernel.org/r/20251111184415.141757-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 84 +++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 2770972b0a39..0fdf9c7eedc3 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -497,6 +497,89 @@ static void damos_test_new_filter(struct kunit *test) damos_destroy_filter(filter); } +static void damos_test_commit_quota_goal_for(struct kunit *test, + struct damos_quota_goal *dst, + struct damos_quota_goal *src) +{ + u64 dst_last_psi_total = 0; + + if (dst->metric == DAMOS_QUOTA_SOME_MEM_PSI_US) + dst_last_psi_total = dst->last_psi_total; + damos_commit_quota_goal(dst, src); + + KUNIT_EXPECT_EQ(test, dst->metric, src->metric); + KUNIT_EXPECT_EQ(test, dst->target_value, src->target_value); + if (src->metric == DAMOS_QUOTA_USER_INPUT) + KUNIT_EXPECT_EQ(test, dst->current_value, src->current_value); + if (dst_last_psi_total && src->metric == DAMOS_QUOTA_SOME_MEM_PSI_US) + KUNIT_EXPECT_EQ(test, dst->last_psi_total, dst_last_psi_total); + switch (dst->metric) { + case DAMOS_QUOTA_NODE_MEM_USED_BP: + case DAMOS_QUOTA_NODE_MEM_FREE_BP: + KUNIT_EXPECT_EQ(test, dst->nid, src->nid); + break; + case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: + KUNIT_EXPECT_EQ(test, dst->nid, src->nid); + KUNIT_EXPECT_EQ(test, dst->memcg_id, src->memcg_id); + break; + default: + break; + } +} + +static void damos_test_commit_quota_goal(struct kunit *test) +{ + struct damos_quota_goal dst = { + .metric = DAMOS_QUOTA_SOME_MEM_PSI_US, + .target_value = 1000, + .current_value = 123, + .last_psi_total = 456, + }; + + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal){ + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 789, + .current_value = 12}); + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal){ + .metric = DAMOS_QUOTA_NODE_MEM_FREE_BP, + .target_value = 345, + .current_value = 678, + .nid = 9, + }); + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal){ + .metric = DAMOS_QUOTA_NODE_MEM_USED_BP, + .target_value = 12, + .current_value = 345, + .nid = 6, + }); + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal){ + .metric = DAMOS_QUOTA_NODE_MEMCG_USED_BP, + .target_value = 456, + .current_value = 567, + .nid = 6, + .memcg_id = 7, + }); + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal){ + .metric = DAMOS_QUOTA_NODE_MEMCG_FREE_BP, + .target_value = 890, + .current_value = 901, + .nid = 10, + .memcg_id = 1, + }); + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal) { + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 789, + .current_value = 12, + }); +} + static void damos_test_commit_filter_for(struct kunit *test, struct damos_filter *dst, struct damos_filter *src) { @@ -782,6 +865,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_set_attrs), KUNIT_CASE(damon_test_moving_sum), KUNIT_CASE(damos_test_new_filter), + KUNIT_CASE(damos_test_commit_quota_goal), KUNIT_CASE(damos_test_commit_filter), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), From d9adfb8a28e70d71ee7812ff8561d7e82db0de96 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 11 Nov 2025 10:44:05 -0800 Subject: [PATCH 203/321] mm/damon/tests/core-kunit: add 
damos_commit_quota_goals() test Add a new unit test for damos_commit_quota_goals(). Link: https://lkml.kernel.org/r/20251111184415.141757-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 81 +++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 0fdf9c7eedc3..3abf31fb1074 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -580,6 +580,86 @@ static void damos_test_commit_quota_goal(struct kunit *test) }); } +static void damos_test_commit_quota_goals_for(struct kunit *test, + struct damos_quota_goal *dst_goals, int nr_dst_goals, + struct damos_quota_goal *src_goals, int nr_src_goals) +{ + struct damos_quota dst, src; + struct damos_quota_goal *goal, *next; + bool skip = true; + int i; + + INIT_LIST_HEAD(&dst.goals); + INIT_LIST_HEAD(&src.goals); + + for (i = 0; i < nr_dst_goals; i++) { + /* + * When nr_src_goals is smaller than dst_goals, + * damos_commit_quota_goals() will kfree() the dst goals. + * Make it kfree()-able. + */ + goal = damos_new_quota_goal(dst_goals[i].metric, + dst_goals[i].target_value); + if (!goal) + goto out; + damos_add_quota_goal(&dst, goal); + } + skip = false; + for (i = 0; i < nr_src_goals; i++) + damos_add_quota_goal(&src, &src_goals[i]); + + damos_commit_quota_goals(&dst, &src); + + i = 0; + damos_for_each_quota_goal(goal, (&dst)) { + KUNIT_EXPECT_EQ(test, goal->metric, src_goals[i].metric); + KUNIT_EXPECT_EQ(test, goal->target_value, + src_goals[i++].target_value); + } + KUNIT_EXPECT_EQ(test, i, nr_src_goals); + +out: + damos_for_each_quota_goal_safe(goal, next, (&dst)) + damos_destroy_quota_goal(goal); + if (skip) + kunit_skip(test, "goal alloc fail"); +} + +static void damos_test_commit_quota_goals(struct kunit *test) +{ + damos_test_commit_quota_goals_for(test, + (struct damos_quota_goal[]){}, 0, + (struct damos_quota_goal[]){ + { + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 123, + }, + }, 1); + damos_test_commit_quota_goals_for(test, + (struct damos_quota_goal[]){ + { + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 234, + }, + + }, 1, + (struct damos_quota_goal[]){ + { + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 345, + }, + }, 1); + damos_test_commit_quota_goals_for(test, + (struct damos_quota_goal[]){ + { + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 456, + }, + + }, 1, + (struct damos_quota_goal[]){}, 0); +} + static void damos_test_commit_filter_for(struct kunit *test, struct damos_filter *dst, struct damos_filter *src) { @@ -866,6 +946,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_moving_sum), KUNIT_CASE(damos_test_new_filter), KUNIT_CASE(damos_test_commit_quota_goal), + KUNIT_CASE(damos_test_commit_quota_goals), KUNIT_CASE(damos_test_commit_filter), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), From 11bb980d41682df3af06e36f9baf89e6d459fa4f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 11 Nov 2025 10:44:06 -0800 Subject: [PATCH 204/321] mm/damon/tests/core-kunit: add damos_commit_quota() test Add a new unit test for damos_commit_quota(). 
Link: https://lkml.kernel.org/r/20251111184415.141757-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 3abf31fb1074..546e1a09d801 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -660,6 +660,38 @@ static void damos_test_commit_quota_goals(struct kunit *test) (struct damos_quota_goal[]){}, 0); } +static void damos_test_commit_quota(struct kunit *test) +{ + struct damos_quota dst = { + .reset_interval = 1, + .ms = 2, + .sz = 3, + .weight_sz = 4, + .weight_nr_accesses = 5, + .weight_age = 6, + }; + struct damos_quota src = { + .reset_interval = 7, + .ms = 8, + .sz = 9, + .weight_sz = 10, + .weight_nr_accesses = 11, + .weight_age = 12, + }; + + INIT_LIST_HEAD(&dst.goals); + INIT_LIST_HEAD(&src.goals); + + damos_commit_quota(&dst, &src); + + KUNIT_EXPECT_EQ(test, dst.reset_interval, src.reset_interval); + KUNIT_EXPECT_EQ(test, dst.ms, src.ms); + KUNIT_EXPECT_EQ(test, dst.sz, src.sz); + KUNIT_EXPECT_EQ(test, dst.weight_sz, src.weight_sz); + KUNIT_EXPECT_EQ(test, dst.weight_nr_accesses, src.weight_nr_accesses); + KUNIT_EXPECT_EQ(test, dst.weight_age, src.weight_age); +} + static void damos_test_commit_filter_for(struct kunit *test, struct damos_filter *dst, struct damos_filter *src) { @@ -947,6 +979,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damos_test_new_filter), KUNIT_CASE(damos_test_commit_quota_goal), KUNIT_CASE(damos_test_commit_quota_goals), + KUNIT_CASE(damos_test_commit_quota), KUNIT_CASE(damos_test_commit_filter), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), From c1cefda77668ddaed56d2f44020e217bd6476951 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 11 Nov 2025 10:44:07 -0800 Subject: [PATCH 205/321] mm/damon/core: pass migrate_dests to damos_commit_dests() damos_commit_dests() receives 'struct damos' pointers, while it uses only their ->migrate_dests fields. This makes code unnecessarily difficult to read. It also makes unit tests writing complicated. Refactor the function to receive pointers to the ->migrate_dests fields. 
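[ Illustration only, not part of the diff below: why the narrower signature
  helps test writing; the kunit test built on top of it is added by the next
  patch in this series. ]

    /* A test can now drive the function with bare dests structs instead
     * of fully constructed struct damos schemes: */
    struct damos_migrate_dests dst = {}, src = {};
    int err;

    /* populate src (and optionally dst) node_id_arr/weight_arr/nr_dests */
    err = damos_commit_dests(&dst, &src);
    /* on success, dst mirrors src's nr_dests and both arrays */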
Link: https://lkml.kernel.org/r/20251111184415.141757-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/core.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 06ad359024ad..a14cc73c2cab 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1000,36 +1000,32 @@ static void damos_set_filters_default_reject(struct damos *s) damos_filters_default_reject(&s->ops_filters); } -static int damos_commit_dests(struct damos *dst, struct damos *src) +static int damos_commit_dests(struct damos_migrate_dests *dst, + struct damos_migrate_dests *src) { - struct damos_migrate_dests *dst_dests, *src_dests; + if (dst->nr_dests != src->nr_dests) { + kfree(dst->node_id_arr); + kfree(dst->weight_arr); - dst_dests = &dst->migrate_dests; - src_dests = &src->migrate_dests; - - if (dst_dests->nr_dests != src_dests->nr_dests) { - kfree(dst_dests->node_id_arr); - kfree(dst_dests->weight_arr); - - dst_dests->node_id_arr = kmalloc_array(src_dests->nr_dests, - sizeof(*dst_dests->node_id_arr), GFP_KERNEL); - if (!dst_dests->node_id_arr) { - dst_dests->weight_arr = NULL; + dst->node_id_arr = kmalloc_array(src->nr_dests, + sizeof(*dst->node_id_arr), GFP_KERNEL); + if (!dst->node_id_arr) { + dst->weight_arr = NULL; return -ENOMEM; } - dst_dests->weight_arr = kmalloc_array(src_dests->nr_dests, - sizeof(*dst_dests->weight_arr), GFP_KERNEL); - if (!dst_dests->weight_arr) { + dst->weight_arr = kmalloc_array(src->nr_dests, + sizeof(*dst->weight_arr), GFP_KERNEL); + if (!dst->weight_arr) { /* ->node_id_arr will be freed by scheme destruction */ return -ENOMEM; } } - dst_dests->nr_dests = src_dests->nr_dests; - for (int i = 0; i < src_dests->nr_dests; i++) { - dst_dests->node_id_arr[i] = src_dests->node_id_arr[i]; - dst_dests->weight_arr[i] = src_dests->weight_arr[i]; + dst->nr_dests = src->nr_dests; + for (int i = 0; i < src->nr_dests; i++) { + dst->node_id_arr[i] = src->node_id_arr[i]; + dst->weight_arr[i] = src->weight_arr[i]; } return 0; @@ -1076,7 +1072,7 @@ static int damos_commit(struct damos *dst, struct damos *src) dst->wmarks = src->wmarks; dst->target_nid = src->target_nid; - err = damos_commit_dests(dst, src); + err = damos_commit_dests(&dst->migrate_dests, &src->migrate_dests); if (err) return err; From eec573b8dd659e4565df8909d4a4f2262e3dde3d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 11 Nov 2025 10:44:08 -0800 Subject: [PATCH 206/321] mm/damon/tests/core-kunit: add damos_commit_dests() test Add a new unit test for damos_commit_dests(). 
Link: https://lkml.kernel.org/r/20251111184415.141757-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 97 +++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 546e1a09d801..10f2aefc71ff 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -692,6 +692,102 @@ static void damos_test_commit_quota(struct kunit *test) KUNIT_EXPECT_EQ(test, dst.weight_age, src.weight_age); } +static int damos_test_help_dests_setup(struct damos_migrate_dests *dests, + unsigned int *node_id_arr, unsigned int *weight_arr, + size_t nr_dests) +{ + size_t i; + + dests->node_id_arr = kmalloc_array(nr_dests, + sizeof(*dests->node_id_arr), GFP_KERNEL); + if (!dests->node_id_arr) + return -ENOMEM; + dests->weight_arr = kmalloc_array(nr_dests, + sizeof(*dests->weight_arr), GFP_KERNEL); + if (!dests->weight_arr) { + kfree(dests->node_id_arr); + dests->node_id_arr = NULL; + return -ENOMEM; + } + + for (i = 0; i < nr_dests; i++) { + dests->node_id_arr[i] = node_id_arr[i]; + dests->weight_arr[i] = weight_arr[i]; + } + dests->nr_dests = nr_dests; + return 0; +} + +static void damos_test_help_dests_free(struct damos_migrate_dests *dests) +{ + kfree(dests->node_id_arr); + kfree(dests->weight_arr); +} + +static void damos_test_commit_dests_for(struct kunit *test, + unsigned int *dst_node_id_arr, unsigned int *dst_weight_arr, + size_t dst_nr_dests, + unsigned int *src_node_id_arr, unsigned int *src_weight_arr, + size_t src_nr_dests) +{ + struct damos_migrate_dests dst = {}, src = {}; + int i, err; + bool skip = true; + + err = damos_test_help_dests_setup(&dst, dst_node_id_arr, + dst_weight_arr, dst_nr_dests); + if (err) + kunit_skip(test, "dests setup fail"); + err = damos_test_help_dests_setup(&src, src_node_id_arr, + src_weight_arr, src_nr_dests); + if (err) { + damos_test_help_dests_free(&dst); + kunit_skip(test, "src setup fail"); + } + err = damos_commit_dests(&dst, &src); + if (err) + goto out; + skip = false; + + KUNIT_EXPECT_EQ(test, dst.nr_dests, src_nr_dests); + for (i = 0; i < dst.nr_dests; i++) { + KUNIT_EXPECT_EQ(test, dst.node_id_arr[i], src_node_id_arr[i]); + KUNIT_EXPECT_EQ(test, dst.weight_arr[i], src_weight_arr[i]); + } + +out: + damos_test_help_dests_free(&dst); + damos_test_help_dests_free(&src); + if (skip) + kunit_skip(test, "skip"); +} + +static void damos_test_commit_dests(struct kunit *test) +{ + damos_test_commit_dests_for(test, + (unsigned int[]){1, 2, 3}, (unsigned int[]){2, 3, 4}, + 3, + (unsigned int[]){4, 5, 6}, (unsigned int[]){5, 6, 7}, + 3); + damos_test_commit_dests_for(test, + (unsigned int[]){1, 2}, (unsigned int[]){2, 3}, + 2, + (unsigned int[]){4, 5, 6}, (unsigned int[]){5, 6, 7}, + 3); + damos_test_commit_dests_for(test, + NULL, NULL, 0, + (unsigned int[]){4, 5, 6}, (unsigned int[]){5, 6, 7}, + 3); + damos_test_commit_dests_for(test, + (unsigned int[]){1, 2, 3}, (unsigned int[]){2, 3, 4}, + 3, + (unsigned int[]){4, 5}, (unsigned int[]){5, 6}, 2); + damos_test_commit_dests_for(test, + (unsigned int[]){1, 2, 3}, (unsigned int[]){2, 3, 4}, + 3, + NULL, NULL, 0); +} + static void damos_test_commit_filter_for(struct kunit *test, struct damos_filter *dst, struct damos_filter *src) { @@ -980,6 +1076,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damos_test_commit_quota_goal), KUNIT_CASE(damos_test_commit_quota_goals), KUNIT_CASE(damos_test_commit_quota), + 
KUNIT_CASE(damos_test_commit_dests), KUNIT_CASE(damos_test_commit_filter), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), From 299a88f6ec131b54712167a527e4cb9de6013935 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 11 Nov 2025 10:44:09 -0800 Subject: [PATCH 207/321] mm/damon/tests/core-kunit: add damos_commit() test Add a new unit test for damos_commit(). Link: https://lkml.kernel.org/r/20251111184415.141757-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 92 +++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 10f2aefc71ff..c71c10f9e059 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -873,6 +873,97 @@ static void damos_test_commit_filter(struct kunit *test) }); } +static void damos_test_help_initailize_scheme(struct damos *scheme) +{ + INIT_LIST_HEAD(&scheme->quota.goals); + INIT_LIST_HEAD(&scheme->filters); + INIT_LIST_HEAD(&scheme->ops_filters); +} + +static void damos_test_commit_for(struct kunit *test, struct damos *dst, + struct damos *src) +{ + int err; + + damos_test_help_initailize_scheme(dst); + damos_test_help_initailize_scheme(src); + + err = damos_commit(dst, src); + if (err) + kunit_skip(test, "damos_commit fail"); + + KUNIT_EXPECT_EQ(test, dst->pattern.min_sz_region, + src->pattern.min_sz_region); + KUNIT_EXPECT_EQ(test, dst->pattern.max_sz_region, + src->pattern.max_sz_region); + KUNIT_EXPECT_EQ(test, dst->pattern.min_nr_accesses, + src->pattern.min_nr_accesses); + KUNIT_EXPECT_EQ(test, dst->pattern.max_nr_accesses, + src->pattern.max_nr_accesses); + KUNIT_EXPECT_EQ(test, dst->pattern.min_age_region, + src->pattern.min_age_region); + KUNIT_EXPECT_EQ(test, dst->pattern.max_age_region, + src->pattern.max_age_region); + + KUNIT_EXPECT_EQ(test, dst->action, src->action); + KUNIT_EXPECT_EQ(test, dst->apply_interval_us, src->apply_interval_us); + + KUNIT_EXPECT_EQ(test, dst->wmarks.metric, src->wmarks.metric); + KUNIT_EXPECT_EQ(test, dst->wmarks.interval, src->wmarks.interval); + KUNIT_EXPECT_EQ(test, dst->wmarks.high, src->wmarks.high); + KUNIT_EXPECT_EQ(test, dst->wmarks.mid, src->wmarks.mid); + KUNIT_EXPECT_EQ(test, dst->wmarks.low, src->wmarks.low); + + switch (src->action) { + case DAMOS_MIGRATE_COLD: + case DAMOS_MIGRATE_HOT: + KUNIT_EXPECT_EQ(test, dst->target_nid, src->target_nid); + break; + default: + break; + } +} + +static void damos_test_commit(struct kunit *test) +{ + damos_test_commit_for(test, + &(struct damos){ + .pattern = (struct damos_access_pattern){ + 1, 2, 3, 4, 5, 6}, + .action = DAMOS_PAGEOUT, + .apply_interval_us = 1000000, + .wmarks = (struct damos_watermarks){ + DAMOS_WMARK_FREE_MEM_RATE, + 900, 100, 50}, + }, + &(struct damos){ + .pattern = (struct damos_access_pattern){ + 2, 3, 4, 5, 6, 7}, + .action = DAMOS_PAGEOUT, + .apply_interval_us = 2000000, + .wmarks = (struct damos_watermarks){ + DAMOS_WMARK_FREE_MEM_RATE, + 800, 50, 30}, + }); + damos_test_commit_for(test, + &(struct damos){ + .pattern = (struct damos_access_pattern){ + 1, 2, 3, 4, 5, 6}, + .action = DAMOS_PAGEOUT, + .apply_interval_us = 1000000, + .wmarks = (struct damos_watermarks){ + DAMOS_WMARK_FREE_MEM_RATE, + 900, 100, 50}, + }, + &(struct damos){ + .pattern = (struct damos_access_pattern){ + 2, 3, 4, 5, 6, 7}, + .action = DAMOS_MIGRATE_HOT, + .apply_interval_us = 2000000, + .target_nid = 5, + }); +} + static void 
damos_test_filter_out(struct kunit *test) { struct damon_target *t; @@ -1078,6 +1169,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damos_test_commit_quota), KUNIT_CASE(damos_test_commit_dests), KUNIT_CASE(damos_test_commit_filter), + KUNIT_CASE(damos_test_commit), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), KUNIT_CASE(damon_test_set_filters_default_reject), From 603f67eb91e05a41c5d9da0fdc7145a57ce0ca27 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 11 Nov 2025 10:44:10 -0800 Subject: [PATCH 208/321] mm/damon/tests/core-kunit: add damon_commit_target_regions() test Add a new test for damon_commit_target_regions(). Link: https://lkml.kernel.org/r/20251111184415.141757-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 65 +++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index c71c10f9e059..0d2d8cda8631 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -964,6 +964,70 @@ static void damos_test_commit(struct kunit *test) }); } +static struct damon_target *damon_test_help_setup_target( + unsigned long region_start_end[][2], int nr_regions) +{ + struct damon_target *t; + struct damon_region *r; + int i; + + t = damon_new_target(); + if (!t) + return NULL; + for (i = 0; i < nr_regions; i++) { + r = damon_new_region(region_start_end[i][0], + region_start_end[i][1]); + if (!r) { + damon_free_target(t); + return NULL; + } + damon_add_region(r, t); + } + return t; +} + +static void damon_test_commit_target_regions_for(struct kunit *test, + unsigned long dst_start_end[][2], int nr_dst_regions, + unsigned long src_start_end[][2], int nr_src_regions, + unsigned long expect_start_end[][2], int nr_expect_regions) +{ + struct damon_target *dst_target, *src_target; + struct damon_region *r; + int i; + + dst_target = damon_test_help_setup_target(dst_start_end, nr_dst_regions); + if (!dst_target) + kunit_skip(test, "dst target setup fail"); + src_target = damon_test_help_setup_target(src_start_end, nr_src_regions); + if (!src_target) { + damon_free_target(dst_target); + kunit_skip(test, "src target setup fail"); + } + damon_commit_target_regions(dst_target, src_target, 1); + i = 0; + damon_for_each_region(r, dst_target) { + KUNIT_EXPECT_EQ(test, r->ar.start, expect_start_end[i][0]); + KUNIT_EXPECT_EQ(test, r->ar.end, expect_start_end[i][1]); + i++; + } + KUNIT_EXPECT_EQ(test, damon_nr_regions(dst_target), nr_expect_regions); + KUNIT_EXPECT_EQ(test, i, nr_expect_regions); + damon_free_target(dst_target); + damon_free_target(src_target); +} + +static void damon_test_commit_target_regions(struct kunit *test) +{ + damon_test_commit_target_regions_for(test, + (unsigned long[][2]) {{3, 8}, {8, 10}}, 2, + (unsigned long[][2]) {{4, 6}}, 1, + (unsigned long[][2]) {{4, 6}}, 1); + damon_test_commit_target_regions_for(test, + (unsigned long[][2]) {{3, 8}, {8, 10}}, 2, + (unsigned long[][2]) {}, 0, + (unsigned long[][2]) {{3, 8}, {8, 10}}, 2); +} + static void damos_test_filter_out(struct kunit *test) { struct damon_target *t; @@ -1170,6 +1234,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damos_test_commit_dests), KUNIT_CASE(damos_test_commit_filter), KUNIT_CASE(damos_test_commit), + KUNIT_CASE(damon_test_commit_target_regions), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), 
KUNIT_CASE(damon_test_set_filters_default_reject), From 1ec5d5810b6f17d5c247f1ffcaed0ed3e8e39609 Mon Sep 17 00:00:00 2001 From: Mehdi Ben Hadj Khelifa Date: Tue, 11 Nov 2025 21:54:27 +0100 Subject: [PATCH 209/321] selftests/mm/uffd: remove static address usage in shmem_allocate_area() The current shmem_allocate_area() implementation uses a hardcoded virtual base address (BASE_PMD_ADDR) as a hint for mmap() when creating shmem-backed test areas. This approach is fragile and may fail on systems with ASLR or different virtual memory layouts, where the chosen address is unavailable. Replace the static base address with a dynamically reserved address range obtained via mmap(NULL, ..., PROT_NONE). The memfd-backed areas and their alias are then mapped into that reserved region using MAP_FIXED, preserving the original layout and aliasing semantics while avoiding collisions with unrelated mappings. This change improves robustness and portability of the test suite without altering its behavior or coverage. [mehdi.benhadjkhelifa@gmail.com: make cleanup code more clear, per Mike] Link: https://lkml.kernel.org/r/20251113142050.108638-1-mehdi.benhadjkhelifa@gmail.com Link: https://lkml.kernel.org/r/20251111205739.420009-1-mehdi.benhadjkhelifa@gmail.com Signed-off-by: Mehdi Ben Hadj Khelifa Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport (Microsoft) Cc: David Hildenbrand Cc: David Hunter Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 994fe8c03923..edd02328f77b 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -10,7 +10,6 @@ uffd_test_ops_t *uffd_test_ops; uffd_test_case_ops_t *uffd_test_case_ops; -#define BASE_PMD_ADDR ((void *)(1UL << 30)) /* pthread_mutex_t starts at page offset 0 */ pthread_mutex_t *area_mutex(char *area, unsigned long nr, uffd_global_test_opts_t *gopts) @@ -142,30 +141,37 @@ static int shmem_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area unsigned long offset = is_src ? 0 : bytes; char *p = NULL, *p_alias = NULL; int mem_fd = uffd_mem_fd_create(bytes * 2, false); + size_t region_size = bytes * 2 + hpage_size; - /* TODO: clean this up. 
Use a static addr is ugly */ - p = BASE_PMD_ADDR; - if (!is_src) - /* src map + alias + interleaved hpages */ - p += 2 * (bytes + hpage_size); + void *reserve = mmap(NULL, region_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0); + if (reserve == MAP_FAILED) { + close(mem_fd); + return -errno; + } + + p = reserve; p_alias = p; p_alias += bytes; p_alias += hpage_size; /* Prevent src/dst VMA merge */ - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, mem_fd, offset); if (*alloc_area == MAP_FAILED) { *alloc_area = NULL; + munmap(reserve, region_size); + close(mem_fd); return -errno; } if (*alloc_area != p) err("mmap of memfd failed at %p", p); - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, mem_fd, offset); if (area_alias == MAP_FAILED) { - munmap(*alloc_area, bytes); *alloc_area = NULL; + munmap(reserve, region_size); + close(mem_fd); return -errno; } if (area_alias != p_alias) From 8b02baf37311754518dfe78073583db03fbb0c07 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:04 -0800 Subject: [PATCH 210/321] mm/damon: rename damos core filter helpers to have word core Patch series "mm/damon: misc cleanups". Yet another batch of misc cleanups and refactoring for DAMON code, tests, and documents. First two patches (1and 2) rename DAMOS core filters related code for readability. Three following patches (3-5) refactor page table walk callback functions in DAMON, as suggested by Hugh and David, and I promised. Next two patches (6 and 7) refactor DAMON core layer kunit test and sysfs interface selftest to be simple and deduplicated. Final two patches (8 and 9) fix up sphinx and grammatical errors on documents. This patch (of 9): DAMOS filters handled by the core layer are called core filters, while those handled by the ops layer are called ops filters. They share the same type but are managed in different places since core filters are evaluated before the ops filters. They also have different helper functions that depend on their managed places. The helper functions for ops filters have '_ops_' keyword on their name, so it is easy to know they are for ops filters. Meanwhile, the helper functions for core filters are not having the 'core' keyword on their name. This makes it easy to be mistakenly used for ops filters. Actually there was such a bug. To avoid future mistakes from similar confusions, rename DAMOS core filters helper functions to have a keyword 'core' on their names. 
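[ Illustration only, not part of the diff below: the resulting naming
  convention, sketched; handle() is a placeholder, not a real helper. ]

    struct damos_filter *f;

    /* core-layer filters, evaluated by the core before the ops layer */
    damos_for_each_core_filter(f, scheme)
            handle(f);

    /* ops-layer filters, evaluated by the operations set implementation */
    damos_for_each_ops_filter(f, scheme)
            handle(f);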
Link: https://lkml.kernel.org/r/20251112154114.66053-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251112154114.66053-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Hugh Dickins Signed-off-by: Andrew Morton --- .clang-format | 4 ++-- include/linux/damon.h | 4 ++-- mm/damon/core.c | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.clang-format b/.clang-format index f371a13b4d19..748efbe791ad 100644 --- a/.clang-format +++ b/.clang-format @@ -140,8 +140,8 @@ ForEachMacros: - 'damon_for_each_scheme_safe' - 'damon_for_each_target' - 'damon_for_each_target_safe' - - 'damos_for_each_filter' - - 'damos_for_each_filter_safe' + - 'damos_for_each_core_filter' + - 'damos_for_each_core_filter_safe' - 'damos_for_each_ops_filter' - 'damos_for_each_ops_filter_safe' - 'damos_for_each_quota_goal' diff --git a/include/linux/damon.h b/include/linux/damon.h index f3566b978cdf..6e3db165fe60 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -871,10 +871,10 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damos_for_each_quota_goal_safe(goal, next, quota) \ list_for_each_entry_safe(goal, next, &(quota)->goals, list) -#define damos_for_each_filter(f, scheme) \ +#define damos_for_each_core_filter(f, scheme) \ list_for_each_entry(f, &(scheme)->filters, list) -#define damos_for_each_filter_safe(f, next, scheme) \ +#define damos_for_each_core_filter_safe(f, next, scheme) \ list_for_each_entry_safe(f, next, &(scheme)->filters, list) #define damos_for_each_ops_filter(f, scheme) \ diff --git a/mm/damon/core.c b/mm/damon/core.c index a14cc73c2cab..d4cb11ced13f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -450,7 +450,7 @@ void damon_destroy_scheme(struct damos *s) damos_for_each_quota_goal_safe(g, g_next, &s->quota) damos_destroy_quota_goal(g); - damos_for_each_filter_safe(f, next, s) + damos_for_each_core_filter_safe(f, next, s) damos_destroy_filter(f); damos_for_each_ops_filter_safe(f, next, s) @@ -864,12 +864,12 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src) return 0; } -static struct damos_filter *damos_nth_filter(int n, struct damos *s) +static struct damos_filter *damos_nth_core_filter(int n, struct damos *s) { struct damos_filter *filter; int i = 0; - damos_for_each_filter(filter, s) { + damos_for_each_core_filter(filter, s) { if (i++ == n) return filter; } @@ -923,15 +923,15 @@ static int damos_commit_core_filters(struct damos *dst, struct damos *src) struct damos_filter *dst_filter, *next, *src_filter, *new_filter; int i = 0, j = 0; - damos_for_each_filter_safe(dst_filter, next, dst) { - src_filter = damos_nth_filter(i++, src); + damos_for_each_core_filter_safe(dst_filter, next, dst) { + src_filter = damos_nth_core_filter(i++, src); if (src_filter) damos_commit_filter(dst_filter, src_filter); else damos_destroy_filter(dst_filter); } - damos_for_each_filter_safe(src_filter, next, src) { + damos_for_each_core_filter_safe(src_filter, next, src) { if (j++ < i) continue; @@ -1767,7 +1767,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, struct damos_filter *filter; s->core_filters_allowed = false; - damos_for_each_filter(filter, s) { + damos_for_each_core_filter(filter, s) { if 
(damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) { if (filter->allow) s->core_filters_allowed = true; From 53298afe456e62ad2c2dc8bc7aa54bb86a67ba2f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:05 -0800 Subject: [PATCH 211/321] mm/damon: rename damos->filters to damos->core_filters DAMOS filters that are handled by the ops layer are linked to damos->ops_filters. Owing to the ops_ prefix on the name, it is easy to understand it is for ops layer handled filters. The other types of filters, which are handled by the core layer, are linked to damos->filters. Because of the name, it is easy to confuse the list is there for not only core layer handled ones but all filters. Avoid such confusions by renaming the field to core_filters. Link: https://lkml.kernel.org/r/20251112154114.66053-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 10 +++++----- mm/damon/core.c | 6 +++--- mm/damon/tests/core-kunit.h | 4 ++-- .../testing/selftests/damon/drgn_dump_damon_status.py | 8 ++++---- tools/testing/selftests/damon/sysfs.py | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 6e3db165fe60..3813373a9200 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -492,7 +492,7 @@ struct damos_migrate_dests { * @wmarks: Watermarks for automated (in)activation of this scheme. * @migrate_dests: Destination nodes if @action is "migrate_{hot,cold}". * @target_nid: Destination node if @action is "migrate_{hot,cold}". - * @filters: Additional set of &struct damos_filter for &action. + * @core_filters: Additional set of &struct damos_filter for &action. * @ops_filters: ops layer handling &struct damos_filter objects list. * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. @@ -518,7 +518,7 @@ struct damos_migrate_dests { * * Before applying the &action to a memory region, &struct damon_operations * implementation could check pages of the region and skip &action to respect - * &filters + * &core_filters * * The minimum entity that @action can be applied depends on the underlying * &struct damon_operations. 
Since it may not be aligned with the core layer @@ -562,7 +562,7 @@ struct damos { struct damos_migrate_dests migrate_dests; }; }; - struct list_head filters; + struct list_head core_filters; struct list_head ops_filters; void *last_applied; struct damos_stat stat; @@ -872,10 +872,10 @@ static inline unsigned long damon_sz_region(struct damon_region *r) list_for_each_entry_safe(goal, next, &(quota)->goals, list) #define damos_for_each_core_filter(f, scheme) \ - list_for_each_entry(f, &(scheme)->filters, list) + list_for_each_entry(f, &(scheme)->core_filters, list) #define damos_for_each_core_filter_safe(f, next, scheme) \ - list_for_each_entry_safe(f, next, &(scheme)->filters, list) + list_for_each_entry_safe(f, next, &(scheme)->core_filters, list) #define damos_for_each_ops_filter(f, scheme) \ list_for_each_entry(f, &(scheme)->ops_filters, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index d4cb11ced13f..aedb315b075a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -306,7 +306,7 @@ void damos_add_filter(struct damos *s, struct damos_filter *f) if (damos_filter_for_ops(f->type)) list_add_tail(&f->list, &s->ops_filters); else - list_add_tail(&f->list, &s->filters); + list_add_tail(&f->list, &s->core_filters); } static void damos_del_filter(struct damos_filter *f) @@ -397,7 +397,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, */ scheme->next_apply_sis = 0; scheme->walk_completed = false; - INIT_LIST_HEAD(&scheme->filters); + INIT_LIST_HEAD(&scheme->core_filters); INIT_LIST_HEAD(&scheme->ops_filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -995,7 +995,7 @@ static void damos_set_filters_default_reject(struct damos *s) s->core_filters_default_reject = false; else s->core_filters_default_reject = - damos_filters_default_reject(&s->filters); + damos_filters_default_reject(&s->core_filters); s->ops_filters_default_reject = damos_filters_default_reject(&s->ops_filters); } diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 0d2d8cda8631..4380d0312d24 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -876,7 +876,7 @@ static void damos_test_commit_filter(struct kunit *test) static void damos_test_help_initailize_scheme(struct damos *scheme) { INIT_LIST_HEAD(&scheme->quota.goals); - INIT_LIST_HEAD(&scheme->filters); + INIT_LIST_HEAD(&scheme->core_filters); INIT_LIST_HEAD(&scheme->ops_filters); } @@ -1140,7 +1140,7 @@ static void damon_test_set_filters_default_reject(struct kunit *test) struct damos scheme; struct damos_filter *target_filter, *anon_filter; - INIT_LIST_HEAD(&scheme.filters); + INIT_LIST_HEAD(&scheme.core_filters); INIT_LIST_HEAD(&scheme.ops_filters); damos_set_filters_default_reject(&scheme); diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py index cb4fdbe68acb..5374d18d1fa8 100755 --- a/tools/testing/selftests/damon/drgn_dump_damon_status.py +++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py @@ -175,11 +175,11 @@ def scheme_to_dict(scheme): ['target_nid', int], ['migrate_dests', damos_migrate_dests_to_dict], ]) - filters = [] + core_filters = [] for f in list_for_each_entry( - 'struct damos_filter', scheme.filters.address_of_(), 'list'): - filters.append(damos_filter_to_dict(f)) - dict_['filters'] = filters + 'struct damos_filter', scheme.core_filters.address_of_(), 'list'): + core_filters.append(damos_filter_to_dict(f)) + dict_['core_filters'] = core_filters ops_filters = [] for 
f in list_for_each_entry( 'struct damos_filter', scheme.ops_filters.address_of_(), 'list'): diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index b34aea0a6775..b4c5ef5c4d69 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -132,7 +132,7 @@ def assert_scheme_committed(scheme, dump): assert_watermarks_committed(scheme.watermarks, dump['wmarks']) # TODO: test filters directory for idx, f in enumerate(scheme.core_filters.filters): - assert_filter_committed(f, dump['filters'][idx]) + assert_filter_committed(f, dump['core_filters'][idx]) for idx, f in enumerate(scheme.ops_filters.filters): assert_filter_committed(f, dump['ops_filters'][idx]) From 96549d56b89744bc4e9e221bd8abf089c9004d29 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:06 -0800 Subject: [PATCH 212/321] mm/damon/vaddr: cleanup using pmd_trans_huge_lock() Three pmd walk functions in vaddr.c are using pmd_trans_huge() and pmd_lock() to handle THPs. Simplify the code by replacing the two function calls with a single pmd_trans_huge_lock() call. Note that this cleanup is not only reducing the lines of code, but also simplifies code execution flows for migration entries case, as kindly explained [1] by Hugh, who suggested this cleanup. [sj@kernel.org: provide lvalue to pmd_present()] Link: https://lkml.kernel.org/r/20251117154415.11041-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251112154114.66053-4-sj@kernel.org Link: https://lore.kernel.org/296c2b3f-6748-158f-b85d-2952165c0588@google.com [1] Signed-off-by: SeongJae Park Suggested-by: Hugh Dickins Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 48 +++++++++++++----------------------------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 7e834467b2d8..83d9b09c86a8 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -307,24 +307,16 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; - pmd_t pmde; spinlock_t *ptl; - if (pmd_trans_huge(pmdp_get(pmd))) { - ptl = pmd_lock(walk->mm, pmd); - pmde = pmdp_get(pmd); + ptl = pmd_trans_huge_lock(pmd, walk->vma); + if (ptl) { + pmd_t pmde = pmdp_get(pmd); - if (!pmd_present(pmde)) { - spin_unlock(ptl); - return 0; - } - - if (pmd_trans_huge(pmde)) { + if (pmd_present(pmde)) damon_pmdp_mkold(pmd, walk->vma, addr); - spin_unlock(ptl); - return 0; - } spin_unlock(ptl); + return 0; } pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); @@ -446,21 +438,12 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge(pmdp_get(pmd))) { - pmd_t pmde; + ptl = pmd_trans_huge_lock(pmd, walk->vma); + if (ptl) { + pmd_t pmde = pmdp_get(pmd); - ptl = pmd_lock(walk->mm, pmd); - pmde = pmdp_get(pmd); - - if (!pmd_present(pmde)) { - spin_unlock(ptl); - return 0; - } - - if (!pmd_trans_huge(pmde)) { - spin_unlock(ptl); - goto regular_page; - } + if (!pmd_present(pmde)) + goto huge_out; folio = damon_get_folio(pmd_pfn(pmde)); if (!folio) goto huge_out; @@ -474,8 +457,6 @@ huge_out: 
spin_unlock(ptl); return 0; } - -regular_page: #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); @@ -910,13 +891,10 @@ static int damos_va_stat_pmd_entry(pmd_t *pmd, unsigned long addr, int nr; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge(*pmd)) { - pmd_t pmde; + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + pmd_t pmde = pmdp_get(pmd); - ptl = pmd_trans_huge_lock(pmd, vma); - if (!ptl) - return 0; - pmde = pmdp_get(pmd); if (!pmd_present(pmde)) goto huge_unlock; From f0eb046cd3cca429dda9ea8c68a527a461a063b9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:07 -0800 Subject: [PATCH 213/321] mm/damon/vaddr: use vm_normal_folio{,_pmd}() instead of damon_get_folio() A few page table walk entry callback functions in vaddr.c uses damon_get_folio() with p{te,md}_pfn() to get the folio, and then put_folio(). Simplify and drop unnecessary folio get/put by using vm_normal_folio() and its friends instead. Note that this cleanup was suggested by David Hildenbrand during a review of another patch series [1] and the patch was updated following the suggestion. This patch further applies the cleanup to DAMON code that merged before the patch. Link: https://lkml.kernel.org/r/20251112154114.66053-5-sj@kernel.org Link: https://lore.kernel.org/0cb3d5a5-683b-4dba-90a8-b45ab83eec53@redhat.com [1] Signed-off-by: SeongJae Park Suggested-by: David Hildenbrand Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 83d9b09c86a8..b9f0c9e3f684 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -444,7 +444,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, if (!pmd_present(pmde)) goto huge_out; - folio = damon_get_folio(pmd_pfn(pmde)); + folio = vm_normal_folio_pmd(walk->vma, addr, pmde); if (!folio) goto huge_out; if (pmd_young(pmde) || !folio_test_idle(folio) || @@ -452,7 +452,6 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, addr)) priv->young = true; *priv->folio_sz = HPAGE_PMD_SIZE; - folio_put(folio); huge_out: spin_unlock(ptl); return 0; @@ -465,14 +464,13 @@ huge_out: ptent = ptep_get(pte); if (!pte_present(ptent)) goto out; - folio = damon_get_folio(pte_pfn(ptent)); + folio = vm_normal_folio(walk->vma, addr, ptent); if (!folio) goto out; if (pte_young(ptent) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; *priv->folio_sz = folio_size(folio); - folio_put(folio); out: pte_unmap_unlock(pte, ptl); return 0; @@ -720,18 +718,16 @@ static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr, /* Tell page walk code to not split the PMD */ walk->action = ACTION_CONTINUE; - folio = damon_get_folio(pmd_pfn(pmde)); + folio = vm_normal_folio_pmd(walk->vma, addr, pmde); if (!folio) goto unlock; if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd)) - goto put_folio; + goto unlock; damos_va_migrate_dests_add(folio, walk->vma, addr, dests, migration_lists); -put_folio: - folio_put(folio); unlock: spin_unlock(ptl); return 0; @@ -754,18 +750,15 @@ static int damos_va_migrate_pte_entry(pte_t *pte, unsigned long addr, if (pte_none(ptent) || 
!pte_present(ptent)) return 0; - folio = damon_get_folio(pte_pfn(ptent)); + folio = vm_normal_folio(walk->vma, addr, ptent); if (!folio) return 0; if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL)) - goto put_folio; + return 0; damos_va_migrate_dests_add(folio, walk->vma, addr, dests, migration_lists); - -put_folio: - folio_put(folio); return 0; } From 09efc56a3b1cfda995586ef27ed8d6f8f92ed917 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:08 -0800 Subject: [PATCH 214/321] mm/damon/vaddr: consistently use only pmd_entry for damos_migrate For page table walks, it is usual [1] to have only one pmd entry function. The vaddr.c code for DAMOS_MIGRATE_{HOT,COLD} is not following the pattern. Instead, it uses both pmd and pte entry functions without a special reason. Refactor it to use only the pmd entry function, to make the code under mm/ more consistent. Link: https://lkml.kernel.org/r/20251112154114.66053-6-sj@kernel.org Signed-off-by: SeongJae Park Suggested-by: David Hildenbrand Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 86 +++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 48 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index b9f0c9e3f684..2750c88e7225 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -697,7 +697,6 @@ isolate: list_add(&folio->lru, &migration_lists[i]); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { @@ -707,58 +706,49 @@ static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr, struct damos_migrate_dests *dests = &s->migrate_dests; struct folio *folio; spinlock_t *ptl; - pmd_t pmde; + pte_t *start_pte, *pte, ptent; + int nr; - ptl = pmd_lock(walk->mm, pmd); - pmde = pmdp_get(pmd); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ptl = pmd_trans_huge_lock(pmd, walk->vma); + if (ptl) { + pmd_t pmde = pmdp_get(pmd); - if (!pmd_present(pmde) || !pmd_trans_huge(pmde)) - goto unlock; + if (!pmd_present(pmde)) + goto huge_out; + folio = vm_normal_folio_pmd(walk->vma, addr, pmde); + if (!folio) + goto huge_out; + if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd)) + goto huge_out; + damos_va_migrate_dests_add(folio, walk->vma, addr, dests, + migration_lists); +huge_out: + spin_unlock(ptl); + return 0; + } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - /* Tell page walk code to not split the PMD */ - walk->action = ACTION_CONTINUE; - - folio = vm_normal_folio_pmd(walk->vma, addr, pmde); - if (!folio) - goto unlock; - - if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd)) - goto unlock; - - damos_va_migrate_dests_add(folio, walk->vma, addr, dests, - migration_lists); - -unlock: - spin_unlock(ptl); - return 0; -} -#else -#define damos_va_migrate_pmd_entry NULL -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -static int damos_va_migrate_pte_entry(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - struct damos_va_migrate_private *priv = walk->private; - struct list_head *migration_lists = priv->migration_lists; - struct damos *s = priv->scheme; - struct damos_migrate_dests *dests = &s->migrate_dests; - struct folio *folio; - pte_t ptent; - - ptent = ptep_get(pte); - if 
(pte_none(ptent) || !pte_present(ptent)) + start_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte) return 0; - folio = vm_normal_folio(walk->vma, addr, ptent); - if (!folio) - return 0; + for (; addr < next; pte += nr, addr += nr * PAGE_SIZE) { + nr = 1; + ptent = ptep_get(pte); - if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL)) - return 0; - - damos_va_migrate_dests_add(folio, walk->vma, addr, dests, - migration_lists); + if (pte_none(ptent) || !pte_present(ptent)) + continue; + folio = vm_normal_folio(walk->vma, addr, ptent); + if (!folio) + continue; + if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL)) + return 0; + damos_va_migrate_dests_add(folio, walk->vma, addr, dests, + migration_lists); + nr = folio_nr_pages(folio); + } + pte_unmap_unlock(start_pte, ptl); return 0; } @@ -824,7 +814,7 @@ static unsigned long damos_va_migrate(struct damon_target *target, struct damos_migrate_dests *dests = &s->migrate_dests; struct mm_walk_ops walk_ops = { .pmd_entry = damos_va_migrate_pmd_entry, - .pte_entry = damos_va_migrate_pte_entry, + .pte_entry = NULL, .walk_lock = PGWALK_RDLOCK, }; From 10e8c7ba64bb692c32a2fc26b30a664ea21d6a8e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:09 -0800 Subject: [PATCH 215/321] mm/damon/tests/core-kunit: remove DAMON_MIN_REGION redefinition A few DAMON core functions including damon_set_regions() were hard-coded to use DAMON_MIN_REGION as their regions management granularity. For simple and human-readable unit tests' expectations, DAMON core layer kunit test re-defines DAMON_MIN_REGION to '1'. A previous patch series [1] has removed the hard-coded part but kept the redefinition and updated related function calls to explicitly use DAMON_MIN_REGION. Remove the unnecessary redefinition and update relevant function calls to pass literals (number '1') instead of the DAMON_MIN_REGION. 
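As a minimal illustration of the resulting test style (mirroring the hunks below), the management granularity is now passed as an explicit literal rather than relying on a redefined macro:

    /* Before: only readable because DAMON_MIN_REGION was redefined to 1. */
    damon_set_regions(t, &range, 1, DAMON_MIN_REGION);

    /* After: the minimum region size is simply the literal granularity. */
    damon_set_regions(t, &range, 1, 1);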
Link: https://lkml.kernel.org/r/20251112154114.66053-7-sj@kernel.org Link: https://lore.kernel.org/20250828171242.59810-1-sj@kernel.org [1] Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/core.c | 5 ---- mm/damon/tests/core-kunit.h | 55 ++++++++++++++++++------------------- 2 files changed, 26 insertions(+), 34 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index aedb315b075a..f9fc0375890a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -20,11 +20,6 @@ #define CREATE_TRACE_POINTS #include -#ifdef CONFIG_DAMON_KUNIT_TEST -#undef DAMON_MIN_REGION -#define DAMON_MIN_REGION 1 -#endif - static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; static bool running_exclusive_ctxs; diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 4380d0312d24..a1eff023e928 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -279,7 +279,7 @@ static void damon_test_split_regions_of(struct kunit *test) kunit_skip(test, "region alloc fail"); } damon_add_region(r, t); - damon_split_regions_of(t, 2, DAMON_MIN_REGION); + damon_split_regions_of(t, 2, 1); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); @@ -292,7 +292,7 @@ static void damon_test_split_regions_of(struct kunit *test) kunit_skip(test, "second region alloc fail"); } damon_add_region(r, t); - damon_split_regions_of(t, 4, DAMON_MIN_REGION); + damon_split_regions_of(t, 4, 1); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); } @@ -373,7 +373,7 @@ static void damon_test_set_regions(struct kunit *test) damon_add_region(r1, t); damon_add_region(r2, t); - damon_set_regions(t, &range, 1, DAMON_MIN_REGION); + damon_set_regions(t, &range, 1, 1); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); damon_for_each_region(r, t) { @@ -1037,15 +1037,14 @@ static void damos_test_filter_out(struct kunit *test) f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false); if (!f) kunit_skip(test, "filter alloc fail"); - f->addr_range = (struct damon_addr_range){ - .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6}; + f->addr_range = (struct damon_addr_range){.start = 2, .end = 6}; t = damon_new_target(); if (!t) { damos_destroy_filter(f); kunit_skip(test, "target alloc fail"); } - r = damon_new_region(DAMON_MIN_REGION * 3, DAMON_MIN_REGION * 5); + r = damon_new_region(3, 5); if (!r) { damos_destroy_filter(f); damon_free_target(t); @@ -1054,50 +1053,48 @@ static void damos_test_filter_out(struct kunit *test) damon_add_region(r, t); /* region in the range */ - KUNIT_EXPECT_TRUE(test, - damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); + KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f, 1)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region before the range */ - r->ar.start = DAMON_MIN_REGION * 1; - r->ar.end = DAMON_MIN_REGION * 2; + r->ar.start = 1; + r->ar.end = 2; KUNIT_EXPECT_FALSE(test, - damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); + damos_filter_match(NULL, t, r, f, 1)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region after the range */ - r->ar.start = DAMON_MIN_REGION * 6; - r->ar.end = DAMON_MIN_REGION * 8; + r->ar.start = 6; + r->ar.end = 8; KUNIT_EXPECT_FALSE(test, - damos_filter_match(NULL, t, r, f, 
DAMON_MIN_REGION)); + damos_filter_match(NULL, t, r, f, 1)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region started before the range */ - r->ar.start = DAMON_MIN_REGION * 1; - r->ar.end = DAMON_MIN_REGION * 4; - KUNIT_EXPECT_FALSE(test, - damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); + r->ar.start = 1; + r->ar.end = 4; + KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f, 1)); /* filter should have split the region */ - KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); - KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); + KUNIT_EXPECT_EQ(test, r->ar.start, 1); + KUNIT_EXPECT_EQ(test, r->ar.end, 2); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2); r2 = damon_next_region(r); - KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 2); - KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 4); + KUNIT_EXPECT_EQ(test, r2->ar.start, 2); + KUNIT_EXPECT_EQ(test, r2->ar.end, 4); damon_destroy_region(r2, t); /* region started in the range */ - r->ar.start = DAMON_MIN_REGION * 2; - r->ar.end = DAMON_MIN_REGION * 8; + r->ar.start = 2; + r->ar.end = 8; KUNIT_EXPECT_TRUE(test, - damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); + damos_filter_match(NULL, t, r, f, 1)); /* filter should have split the region */ - KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); - KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); + KUNIT_EXPECT_EQ(test, r->ar.start, 2); + KUNIT_EXPECT_EQ(test, r->ar.end, 6); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2); r2 = damon_next_region(r); - KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 6); - KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 8); + KUNIT_EXPECT_EQ(test, r2->ar.start, 6); + KUNIT_EXPECT_EQ(test, r2->ar.end, 8); damon_destroy_region(r2, t); damon_free_target(t); From 675774adbe800b350714ce46e184f48fa101512d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:10 -0800 Subject: [PATCH 216/321] selftests/damon/sysfs.py: merge DAMON status dumping into commitment assertion For each test case, sysfs.py makes changes to DAMON, dumps DAMON internal status and asserts the expectation is met. The dumping part should be the same for all cases, so it is duplicated for each test case. Which means it is easy to make mistakes. Actually a few of those duplicates are not turning DAMON off in case of the dumping failure. It makes following selftests that need to turn DAMON on fails with -EBUSY. Merge the status dumping into commitment assertion with proper dumping failure handling, to deduplicate and avoid the unnecessary following tests failures. 
Link: https://lkml.kernel.org/r/20251112154114.66053-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.py | 43 ++++++++------------------ 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index b4c5ef5c4d69..9cca71eb0325 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -185,7 +185,15 @@ def assert_ctx_committed(ctx, dump): assert_monitoring_targets_committed(ctx.targets, dump['adaptive_targets']) assert_schemes_committed(ctx.schemes, dump['schemes']) -def assert_ctxs_committed(ctxs, dump): +def assert_ctxs_committed(kdamonds): + status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) + if err is not None: + print(err) + kdamonds.stop() + exit(1) + + ctxs = kdamonds.kdamonds[0].contexts + dump = status['contexts'] assert_true(len(ctxs) == len(dump), 'ctxs length', dump) for idx, ctx in enumerate(ctxs): assert_ctx_committed(ctx, dump[idx]) @@ -202,13 +210,7 @@ def main(): print('kdamond start failed: %s' % err) exit(1) - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - kdamonds.stop() - exit(1) - - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + assert_ctxs_committed(kdamonds) context = _damon_sysfs.DamonCtx( monitoring_attrs=_damon_sysfs.DamonAttrs( @@ -256,12 +258,7 @@ def main(): kdamonds.kdamonds[0].contexts = [context] kdamonds.kdamonds[0].commit() - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - exit(1) - - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + assert_ctxs_committed(kdamonds) # test online commitment of minimum context. context = _damon_sysfs.DamonCtx() @@ -270,12 +267,7 @@ def main(): kdamonds.kdamonds[0].contexts = [context] kdamonds.kdamonds[0].commit() - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - exit(1) - - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + assert_ctxs_committed(kdamonds) kdamonds.stop() @@ -303,17 +295,8 @@ def main(): exit(1) kdamonds.kdamonds[0].contexts[0].targets[1].obsolete = True kdamonds.kdamonds[0].commit() - - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - kdamonds.stop() - exit(1) - del kdamonds.kdamonds[0].contexts[0].targets[1] - - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) - + assert_ctxs_committed(kdamonds) kdamonds.stop() if __name__ == '__main__': From 7ad58e009dd159d7004592d826749003197f4083 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:11 -0800 Subject: [PATCH 217/321] Docs/mm/damon/maintainer-profile: fix a typo on mm-untable link Commit 0b473f9e6eac ("Docs/mm/damon/maintainer-profile: update for mm-new tree") mistakenly forgot putting a space between a link and the next word. Fix it. 
Link: https://lkml.kernel.org/r/20251112154114.66053-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/maintainer-profile.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst index 58a3fb3c5762..f1aed6e55d31 100644 --- a/Documentation/mm/damon/maintainer-profile.rst +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -57,7 +57,7 @@ Key cycle dates Patches can be sent anytime. Key cycle dates of the `mm-new `_, `mm-unstable -`_and `mm-stable +`_ and `mm-stable `_ trees depend on the memory management subsystem maintainer. From 6e57c1ce81e0c14e4d46add2b8eb27a4b75d7b26 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:12 -0800 Subject: [PATCH 218/321] Docs/mm/damon/maintainer-profile: fix grammatical errors Fix a few grammatical errors on DAMON maintainer-profile. Link: https://lkml.kernel.org/r/20251112154114.66053-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/maintainer-profile.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst index f1aed6e55d31..e761edada1e9 100644 --- a/Documentation/mm/damon/maintainer-profile.rst +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -27,8 +27,8 @@ maintainer. Note again the patches for `mm-new tree `_ are queued by the memory management -subsystem maintainer. If the patches requires some patches in `damon/next tree -`_ which not yet merged in mm-new, +subsystem maintainer. If the patches require some patches in `damon/next tree +`_ which have not yet merged in mm-new, please make sure the requirement is clearly specified. Submit checklist addendum @@ -99,5 +99,5 @@ Schedules and reservation status are available at the Google `doc `_. There is also a public Google `calendar `_ -that has the events. Anyone can subscribe it. DAMON maintainer will also -provide periodic reminder to the mailing list (damon@lists.linux.dev). +that has the events. Anyone can subscribe to it. DAMON maintainer will also +provide periodic reminders to the mailing list (damon@lists.linux.dev). From 6707915e030a3258868355f989b80140c1a45bbe Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 17 Nov 2025 17:33:38 +0000 Subject: [PATCH 219/321] mm: propagate VM_SOFTDIRTY on merge Patch series "make VM_SOFTDIRTY a sticky VMA flag", v2. Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by establishing a new VMA, or via merge) as implemented in __mmap_complete() and do_brk_flags(). However, when performing a merge of existing mappings such as when performing mprotect(), we may lose the VM_SOFTDIRTY flag. 
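The semantics this series moves VM_SOFTDIRTY under can be sketched as follows; this is a simplified illustration of the 'sticky' flag rules, not the actual mm/vma.c merge code, and the helper names are placeholders:

    /* Sticky flags never block a merge and are propagated to the result. */
    static bool vma_flags_mergeable(vm_flags_t a, vm_flags_t b)
    {
        return (a & ~VM_IGNORE_MERGE) == (b & ~VM_IGNORE_MERGE);
    }

    static vm_flags_t vma_merged_flags(vm_flags_t a, vm_flags_t b)
    {
        return a | ((a | b) & VM_STICKY);
    }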
Now we have the concept of making VMA flags 'sticky', that is that they both don't prevent merge and, importantly, are propagated to merged VMAs, this seems a sensible alternative to the existing special-casing of VM_SOFTDIRTY. We additionally add a self-test that demonstrates that this logic behaves as expected. This patch (of 2): Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by establishing a new VMA, or via merge) as implemented in __mmap_complete() and do_brk_flags(). However, when performing a merge of existing mappings such as when performing mprotect(), we may lose the VM_SOFTDIRTY flag. This is because currently we simply ignore VM_SOFTDIRTY for the purposes of merge, so one VMA may possess the flag and another not, and whichever happens to be the target VMA will be the one upon which the merge is performed which may or may not have VM_SOFTDIRTY set. Now we have the concept of 'sticky' VMA flags, let's make VM_SOFTDIRTY one which solves this issue. Additionally update VMA userland tests to propagate changes. [akpm@linux-foundation.org: update comments, per Lorenzo] Link: https://lkml.kernel.org/r/0019e0b8-ee1e-4359-b5ee-94225cbe5588@lucifer.local Link: https://lkml.kernel.org/r/cover.1763399675.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/955478b5170715c895d1ef3b7f68e0cd77f76868.1763399675.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Suggested-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Pedro Falcato Acked-by: Andrey Vagin Reviewed-by: Vlastimil Babka Acked-by: Cyrill Gorcunov Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 +++++++-------- tools/testing/vma/vma_internal.h | 18 ++++++------------ 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index af2904aeb163..bf660d5b6e97 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -532,28 +532,27 @@ extern unsigned int kobjsize(const void *objp); * possesses it but the other does not, the merged VMA should nonetheless have * applied to it: * + * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any merged VMA + * should be considered soft-dirty also as it operates at a VMA + * granularity. + * * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that * mapped page tables may contain metadata not described by the * VMA and thus any merged VMA may also contain this metadata, * and thus we must make this flag sticky. */ -#define VM_STICKY VM_MAYBE_GUARD +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) /* * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one * of these flags and the other not does not preclude a merge. * - * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but - * dirty bit -- the caller should mark merged VMA as dirty. If - * dirty bit won't be excluded from comparison, we increase - * pressure on the memory system forcing the kernel to generate - * new VMAs when old one could be extended instead. - * * VM_STICKY - When merging VMAs, VMA flags must match, unless they are * 'sticky'. If any sticky flags exist in either VMA, we simply * set all of them on the merged VMA. */ -#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +#define VM_IGNORE_MERGE VM_STICKY /* * Flags which should result in page tables being copied on fork. 
These are diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 73a899ba2686..81b501f51948 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -122,28 +122,22 @@ extern unsigned long dac_mmap_min_addr; * possesses it but the other does not, the merged VMA should nonetheless have * applied to it: * - * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that - * mapped page tables may contain metadata not described by the - * VMA and thus any merged VMA may also contain this metadata, - * and thus we must make this flag sticky. + * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any merged VMA + * should be considered soft-dirty also as it operates at a VMA + * granularity. */ -#define VM_STICKY VM_MAYBE_GUARD +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) /* * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one * of these flags and the other not does not preclude a merge. * - * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but - * dirty bit -- the caller should mark merged VMA as dirty. If - * dirty bit won't be excluded from comparison, we increase - * pressure on the memory system forcing the kernel to generate - * new VMAs when old one could be extended instead. - * * VM_STICKY - When merging VMAs, VMA flags must match, unless they are * 'sticky'. If any sticky flags exist in either VMA, we simply * set all of them on the merged VMA. */ -#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +#define VM_IGNORE_MERGE VM_STICKY /* * Flags which should result in page tables being copied on fork. These are From c7ba92bcfea34f6b4afc744c3b65c8f7420fefe0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 17 Nov 2025 17:33:39 +0000 Subject: [PATCH 220/321] testing/selftests/mm: add soft-dirty merge self-test Assert that we correctly merge VMAs containing VM_SOFTDIRTY flags now that we correctly handle these as sticky. In order to do so, we have to account for the fact the pagemap interface checks soft dirty PTEs and additionally that newly merged VMAs are marked VM_SOFTDIRTY. We do this by using use unfaulted anon VMAs, establishing one and clearing references on that one, before establishing another and merging the two before checking that soft-dirty is propagated as expected. We check that this functions correctly with mremap() and mprotect() as sample cases, because VMA merge of adjacent newly mapped VMAs will automatically be made soft-dirty due to existing logic which does so. We are therefore exercising other means of merging VMAs. 
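The procfs plumbing the test relies on is the standard soft-dirty interface: writing "4" to /proc/<pid>/clear_refs clears the soft-dirty bits (and VM_SOFTDIRTY), and bit 55 of a pagemap entry reports the per-page soft-dirty state. A minimal userspace sketch of the two primitives (the selftest itself uses the clear_softdirty() and pagemap_is_softdirty() helpers visible below; the function names here are placeholders):

    #include <fcntl.h>
    #include <stdint.h>
    #include <sys/types.h>
    #include <unistd.h>

    static void clear_softdirty_bits(void)
    {
        int fd = open("/proc/self/clear_refs", O_WRONLY);

        /* "4" clears soft-dirty bits and VM_SOFTDIRTY on all VMAs. */
        write(fd, "4", 1);
        close(fd);
    }

    static int page_is_softdirty(int pagemap_fd, void *addr, long pagesize)
    {
        uint64_t entry;
        off_t off = (uintptr_t)addr / pagesize * sizeof(entry);

        pread(pagemap_fd, &entry, sizeof(entry), off);
        return (entry >> 55) & 1;    /* bit 55: soft-dirty */
    }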
Link: https://lkml.kernel.org/r/d5a0f735783fb4f30a604f570ede02ccc5e29be9.1763399675.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Andrey Vagin Cc: David Hildenbrand (Red Hat) Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/soft-dirty.c | 127 +++++++++++++++++++++++- 1 file changed, 126 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index 4ee4db3750c1..c3a9585de98c 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -184,6 +184,130 @@ static void test_mprotect(int pagemap_fd, int pagesize, bool anon) close(test_fd); } +static void test_merge(int pagemap_fd, int pagesize) +{ + char *reserved, *map, *map2; + + /* + * Reserve space for tests: + * + * ---padding to --- + * | avoid adj. | + * v merge v + * |---|---|---|---|---| + * | | 1 | 2 | 3 | | + * |---|---|---|---|---| + */ + reserved = mmap(NULL, 5 * pagesize, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); + if (reserved == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + munmap(reserved, 4 * pagesize); + + /* + * Establish initial VMA: + * + * S/D + * |---|---|---|---|---| + * | | 1 | | | | + * |---|---|---|---|---| + */ + map = mmap(&reserved[pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* This will clear VM_SOFTDIRTY too. */ + clear_softdirty(); + + /* + * Now place a new mapping which will be marked VM_SOFTDIRTY. Away from + * map: + * + * - S/D + * |---|---|---|---|---| + * | | 1 | | 2 | | + * |---|---|---|---|---| + */ + map2 = mmap(&reserved[3 * pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* + * Now remap it immediately adjacent to map, if the merge correctly + * propagates VM_SOFTDIRTY, we should then observe the VMA as a whole + * being marked soft-dirty: + * + * merge + * S/D + * |---|-------|---|---| + * | | 1 | | | + * |---|-------|---|---| + */ + map2 = mremap(map2, pagesize, pagesize, MREMAP_FIXED | MREMAP_MAYMOVE, + &reserved[2 * pagesize]); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mremap failed\n"); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-anon soft-dirty after remap merge 1st pg\n", + __func__); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s-anon soft-dirty after remap merge 2nd pg\n", + __func__); + + munmap(map, 2 * pagesize); + + /* + * Now establish another VMA: + * + * S/D + * |---|---|---|---|---| + * | | 1 | | | | + * |---|---|---|---|---| + */ + map = mmap(&reserved[pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* Clear VM_SOFTDIRTY... 
*/ + clear_softdirty(); + /* ...and establish incompatible adjacent VMA: + * + * - S/D + * |---|---|---|---|---| + * | | 1 | 2 | | | + * |---|---|---|---|---| + */ + map2 = mmap(&reserved[2 * pagesize], pagesize, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* + * Now mprotect() VMA 1 so it's compatible with 2 and therefore merges: + * + * merge + * S/D + * |---|-------|---|---| + * | | 1 | | | + * |---|-------|---|---| + */ + if (mprotect(map, pagesize, PROT_READ | PROT_WRITE | PROT_EXEC)) + ksft_exit_fail_msg("mprotect failed\n"); + + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-anon soft-dirty after mprotect merge 1st pg\n", + __func__); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s-anon soft-dirty after mprotect merge 2nd pg\n", + __func__); + + munmap(map, 2 * pagesize); +} + static void test_mprotect_anon(int pagemap_fd, int pagesize) { test_mprotect(pagemap_fd, pagesize, true); @@ -204,7 +328,7 @@ int main(int argc, char **argv) if (!softdirty_supported()) ksft_exit_skip("soft-dirty is not support\n"); - ksft_set_plan(15); + ksft_set_plan(19); pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); if (pagemap_fd < 0) ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH); @@ -216,6 +340,7 @@ int main(int argc, char **argv) test_hugepage(pagemap_fd, pagesize); test_mprotect_anon(pagemap_fd, pagesize); test_mprotect_file(pagemap_fd, pagesize); + test_merge(pagemap_fd, pagesize); close(pagemap_fd); From 1452468447da77e846bda51c3a298ce843395d9c Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 28 Oct 2025 14:01:50 +0100 Subject: [PATCH 221/321] KVM: s390: fix missing present bit for gmap puds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For hugetlbs, gmap puds have the present bit set. For normal puds (which point to ptes), the bit is not set. This is in contrast to the normal userspace puds, which always have the bit set for present pmds. This causes issues when ___pte_offset_map() is modified to only check for the present bit. The solution to the problem is simply to always set the present bit for present gmap pmds. 
Link: https://lkml.kernel.org/r/20251028130150.57379-2-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Link: https://lore.kernel.org/lkml/20251017144924.10034-1-borntraeger@linux.ibm.com/ Tested-by: Christian Borntraeger Acked-by: Christian Borntraeger Acked-by: Balbir Singh Cc: Alexander Gordeev Cc: Alistair Popple Cc: Baolin Wang Cc: Barry Song Cc: Byungchul Park Cc: Danilo Krummrich Cc: Dave Airlie Cc: David Hildenbrand Cc: Dev Jain Cc: Francois Dugast Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Lyude Cc: Mathew Brost Cc: Mika Penttilä Cc: Nico Pache Cc: Oscar Salvador Cc: Rakie Kim Cc: Ralph Campbell Cc: Ryan Roberts Cc: Vasily Gorbik Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/s390/mm/gmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 8ff6bba107e8..22c448b32340 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -599,8 +599,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) | _SEGMENT_ENTRY_GMAP_UC | _SEGMENT_ENTRY; } else - *table = pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS; + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS) + | _SEGMENT_ENTRY; } } else if (*table & _SEGMENT_ENTRY_PROTECT && !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { From d245f9b4ab806733a77e51a218ca7b8bc3135cd9 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:52 +1000 Subject: [PATCH 222/321] mm/zone_device: support large zone device private folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: support device-private THP", v7. This patch series introduces support for Transparent Huge Page (THP) migration in zone device-private memory. The implementation enables efficient migration of large folios between system memory and device-private memory Background Current zone device-private memory implementation only supports PAGE_SIZE granularity, leading to: - Increased TLB pressure - Inefficient migration between CPU and device memory This series extends the existing zone device-private infrastructure to support THP, leading to: - Reduced page table overhead - Improved memory bandwidth utilization - Seamless fallback to base pages when needed In my local testing (using lib/test_hmm) and a throughput test, the series shows a 350% improvement in data transfer throughput and a 80% improvement in latency These patches build on the earlier posts by Ralph Campbell [1] Two new flags are added in vma_migration to select and mark compound pages. migrate_vma_setup(), migrate_vma_pages() and migrate_vma_finalize() support migration of these pages when MIGRATE_VMA_SELECT_COMPOUND is passed in as arguments. The series also adds zone device awareness to (m)THP pages along with fault handling of large zone device private pages. page vma walk and the rmap code is also zone device aware. Support has also been added for folios that might need to be split in the middle of migration (when the src and dst do not agree on MIGRATE_PFN_COMPOUND), that occurs when src side of the migration can migrate large pages, but the destination has not been able to allocate large pages. The code supported and used folio_split() when migrating THP pages, this is used when MIGRATE_VMA_SELECT_COMPOUND is not passed as an argument to migrate_vma_setup(). The test infrastructure lib/test_hmm.c has been enhanced to support THP migration. 
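To make the driver-facing interface described above concrete, a hypothetical migration setup could look like the following; everything except the new MIGRATE_VMA_SELECT_COMPOUND flag is the existing migrate_vma API, and the owner pointer and PFN arrays are placeholders:

    struct migrate_vma args = {
        .vma         = vma,
        .start       = start,
        .end         = end,
        .src         = src_pfns,
        .dst         = dst_pfns,
        .pgmap_owner = dev_private_owner,    /* placeholder owner */
        .flags       = MIGRATE_VMA_SELECT_SYSTEM |
                       MIGRATE_VMA_SELECT_COMPOUND,
    };

    if (migrate_vma_setup(&args))
        return -EFAULT;
    /* ... allocate device pages and fill args.dst here ... */
    migrate_vma_pages(&args);
    migrate_vma_finalize(&args);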
A new ioctl to emulate failure of large page allocations has been added to test the folio split code path. hmm-tests.c has new test cases for huge page migration and for the folio split path. A new throughput test has been added as well. The nouveau dmem code has been enhanced to use the new THP migration capability. mTHP support: The patches hard-code HPAGE_PMD_NR in a few places, but the code has been kept generic to support various order sizes. With additional refactoring of the code, support for different order sizes should be possible. The future plan is to post enhancements to support mTHP with a rough design as follows: 1. Add the notion of allowable THP orders to the HMM-based test driver 2. For non-PMD-based THP paths in migrate_device.c, check to see if a suitable order is found and supported by the driver 3. Iterate across orders to check the highest supported order for migration 4. Migrate and finalize The mTHP patches can be built on top of this series; the key design elements that need to be worked out are infrastructure and driver support for multiple-order pages and their migration. HMM support for large folios was added in 10b9feee2d0d ("mm/hmm: populate PFNs from PMD swap entry"). This patch (of 16): Add routines to support allocation of large-order zone device folios, along with helper functions for zone device folios to check whether a folio is device private and to set zone device data. When large folios are used, the existing page_free() callback in pgmap is called when the folio is freed; this is true for both PAGE_SIZE and higher-order pages. Zone device private large folios do not support deferred split and scan like normal THP folios. Link: https://lkml.kernel.org/r/20251001065707.920170-1-balbirs@nvidia.com Link: https://lkml.kernel.org/r/20251001065707.920170-2-balbirs@nvidia.com Link: https://lore.kernel.org/linux-mm/20201106005147.20113-1-rcampbell@nvidia.com/ [1] Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. 

Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Cc: Madhavan Srinivasan Cc: Christophe Leroy Cc: Felix Kuehling Cc: Alex Deucher Cc: "Christian König" Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/drm_pagemap.c | 2 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +- include/linux/memremap.h | 10 ++++++++- lib/test_hmm.c | 2 +- mm/memremap.c | 26 ++++++++++++++---------- mm/rmap.c | 6 +++++- 8 files changed, 34 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 03f8c34fa0a2..91f763410673 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -723,7 +723,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); dpage->zone_device_data = pvt; - zone_device_page_init(dpage); + zone_device_page_init(dpage, 0); return dpage; out_clear: spin_lock(&kvmppc_uvmem_bitmap_lock); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 59a5a3fea65d..f6198e66dc5a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -218,7 +218,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - zone_device_page_init(page); + zone_device_page_init(page, 0); } static void diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index 22c44807e3fe..46a8edb279dc 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -196,7 +196,7 @@ static void drm_pagemap_get_devmem_page(struct page *page, struct drm_pagemap_zdd *zdd) { page->zone_device_data = drm_pagemap_zdd_get(zdd); - zone_device_page_init(page); + zone_device_page_init(page, 0); } /** diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index ca4932a150e3..53cc1926b9da 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -318,7 +318,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) return NULL; } - zone_device_page_init(page); + zone_device_page_init(page, 0); return page; } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e5951ba12a28..d2487a19cba2 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -206,7 +206,7 @@ static inline bool is_fsdax_page(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE -void zone_device_page_init(struct page *page); +void zone_device_page_init(struct page *page, unsigned int order); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -215,6 +215,14 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); + +static inline void zone_device_folio_init(struct folio *folio, unsigned int order) +{ + zone_device_page_init(&folio->page, order); + if (order) + folio_set_large_rmappable(folio); +} + #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) 
diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 83e3d8208a54..24d82121cde8 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -627,7 +627,7 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) goto error; } - zone_device_page_init(dpage); + zone_device_page_init(dpage, 0); dpage->zone_device_data = rpage; return dpage; diff --git a/mm/memremap.c b/mm/memremap.c index 46cb1b0b6f72..e45dfb568710 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -416,20 +416,19 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); void free_zone_device_folio(struct folio *folio) { struct dev_pagemap *pgmap = folio->pgmap; + unsigned long nr = folio_nr_pages(folio); + int i; if (WARN_ON_ONCE(!pgmap)) return; mem_cgroup_uncharge(folio); - /* - * Note: we don't expect anonymous compound pages yet. Once supported - * and we could PTE-map them similar to THP, we'd have to clear - * PG_anon_exclusive on all tail pages. - */ if (folio_test_anon(folio)) { - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - __ClearPageAnonExclusive(folio_page(folio, 0)); + for (i = 0; i < nr; i++) + __ClearPageAnonExclusive(folio_page(folio, i)); + } else { + VM_WARN_ON_ONCE(folio_test_large(folio)); } /* @@ -456,8 +455,8 @@ void free_zone_device_folio(struct folio *folio) case MEMORY_DEVICE_COHERENT: if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) break; - pgmap->ops->page_free(folio_page(folio, 0)); - put_dev_pagemap(pgmap); + pgmap->ops->page_free(&folio->page); + percpu_ref_put_many(&folio->pgmap->ref, nr); break; case MEMORY_DEVICE_GENERIC: @@ -480,14 +479,19 @@ void free_zone_device_folio(struct folio *folio) } } -void zone_device_page_init(struct page *page) +void zone_device_page_init(struct page *page, unsigned int order) { + VM_WARN_ON_ONCE(order > MAX_ORDER_NR_PAGES); + /* * Drivers shouldn't be allocating pages after calling * memunmap_pages(). */ - WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref)); + WARN_ON_ONCE(!percpu_ref_tryget_many(&page_pgmap(page)->ref, 1 << order)); set_page_count(page, 1); lock_page(page); + + if (order) + prep_compound_page(page, order); } EXPORT_SYMBOL_GPL(zone_device_page_init); diff --git a/mm/rmap.c b/mm/rmap.c index 3c3cf3efa5f6..eaed5dfbb9b7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1733,9 +1733,13 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, * the folio is unmapped and at least one page is still mapped. * * Check partially_mapped first to ensure it is a large folio. + * + * Device private folios do not support deferred splitting and + * shrinker based scanning of the folios to free. */ if (partially_mapped && folio_test_anon(folio) && - !folio_test_partially_mapped(folio)) + !folio_test_partially_mapped(folio) && + !folio_is_device_private(folio)) deferred_split_folio(folio, true); __folio_mod_stat(folio, -nr, -nr_pmdmapped); From 3a5a06554566fcc9f7de7327cfc365ed384d396c Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:53 +1000 Subject: [PATCH 223/321] mm/zone_device: rename page_free callback to folio_free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change page_free to folio_free to make the folio support for zone device-private more consistent. The PCI P2PDMA callback has also been updated and changed to folio_free() as a result. For drivers that do not support folios (yet), the folio is converted back into page via &folio->page and the page is used as is, in the current callback implementation. 
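A minimal sketch of the converted callback shape for a hypothetical driver (the mydrv_* names are placeholders; the pattern mirrors the conversions below):

    static void mydrv_folio_free(struct folio *folio)
    {
        /*
         * A driver that has not been converted to folios yet can keep
         * operating on the head page, as the drivers below do.
         */
        struct page *page = &folio->page;

        mydrv_put_backing_store(page->zone_device_data);    /* placeholder */
    }

    static const struct dev_pagemap_ops mydrv_pagemap_ops = {
        .folio_free = mydrv_folio_free,
    };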
Link: https://lkml.kernel.org/r/20251001065707.920170-3-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Cc: Madhavan Srinivasan Cc: Christophe Leroy Cc: Felix Kuehling Cc: Alex Deucher Cc: "Christian König" Signed-off-by: Andrew Morton --- Documentation/mm/memory-model.rst | 2 +- arch/powerpc/kvm/book3s_hv_uvmem.c | 5 +++-- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 5 +++-- drivers/gpu/drm/drm_pagemap.c | 10 +++++----- drivers/gpu/drm/nouveau/nouveau_dmem.c | 5 +++-- drivers/pci/p2pdma.c | 5 +++-- include/linux/memremap.h | 6 +++--- lib/test_hmm.c | 5 +++-- mm/memremap.c | 16 ++++++++-------- 9 files changed, 32 insertions(+), 27 deletions(-) diff --git a/Documentation/mm/memory-model.rst b/Documentation/mm/memory-model.rst index 5f3eafbbc520..7957122039e8 100644 --- a/Documentation/mm/memory-model.rst +++ b/Documentation/mm/memory-model.rst @@ -165,7 +165,7 @@ The users of `ZONE_DEVICE` are: * pmem: Map platform persistent memory to be used as a direct-I/O target via DAX mappings. -* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()` +* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->folio_free()` event callbacks to allow a device-driver to coordinate memory management events related to device-memory, typically GPU memory. See Documentation/mm/hmm.rst. diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 91f763410673..e5000bef90f2 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -1014,8 +1014,9 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf) * to a normal PFN during H_SVM_PAGE_OUT. * Gets called with kvm->arch.uvmem_lock held. */ -static void kvmppc_uvmem_page_free(struct page *page) +static void kvmppc_uvmem_folio_free(struct folio *folio) { + struct page *page = &folio->page; unsigned long pfn = page_to_pfn(page) - (kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT); struct kvmppc_uvmem_page_pvt *pvt; @@ -1034,7 +1035,7 @@ static void kvmppc_uvmem_page_free(struct page *page) } static const struct dev_pagemap_ops kvmppc_uvmem_ops = { - .page_free = kvmppc_uvmem_page_free, + .folio_free = kvmppc_uvmem_folio_free, .migrate_to_ram = kvmppc_uvmem_migrate_to_ram, }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index f6198e66dc5a..6f1617436f4b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -568,8 +568,9 @@ out: return r < 0 ? 
r : 0; } -static void svm_migrate_page_free(struct page *page) +static void svm_migrate_folio_free(struct folio *folio) { + struct page *page = &folio->page; struct svm_range_bo *svm_bo = page->zone_device_data; if (svm_bo) { @@ -1009,7 +1010,7 @@ out_mmput: } static const struct dev_pagemap_ops svm_migrate_pgmap_ops = { - .page_free = svm_migrate_page_free, + .folio_free = svm_migrate_folio_free, .migrate_to_ram = svm_migrate_to_ram, }; diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index 46a8edb279dc..37d7cfbbb3e8 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -752,15 +752,15 @@ err_out: } /** - * drm_pagemap_page_free() - Put GPU SVM zone device data associated with a page - * @page: Pointer to the page + * drm_pagemap_folio_free() - Put GPU SVM zone device data associated with a folio + * @folio: Pointer to the folio * * This function is a callback used to put the GPU SVM zone device data * associated with a page when it is being released. */ -static void drm_pagemap_page_free(struct page *page) +static void drm_pagemap_folio_free(struct folio *folio) { - drm_pagemap_zdd_put(page->zone_device_data); + drm_pagemap_zdd_put(folio->page.zone_device_data); } /** @@ -788,7 +788,7 @@ static vm_fault_t drm_pagemap_migrate_to_ram(struct vm_fault *vmf) } static const struct dev_pagemap_ops drm_pagemap_pagemap_ops = { - .page_free = drm_pagemap_page_free, + .folio_free = drm_pagemap_folio_free, .migrate_to_ram = drm_pagemap_migrate_to_ram, }; diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 53cc1926b9da..d34288ebe7d2 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -108,8 +108,9 @@ unsigned long nouveau_dmem_page_addr(struct page *page) return chunk->bo->offset + off; } -static void nouveau_dmem_page_free(struct page *page) +static void nouveau_dmem_folio_free(struct folio *folio) { + struct page *page = &folio->page; struct nouveau_dmem_chunk *chunk = nouveau_page_to_chunk(page); struct nouveau_dmem *dmem = chunk->drm->dmem; @@ -220,7 +221,7 @@ done: } static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = { - .page_free = nouveau_dmem_page_free, + .folio_free = nouveau_dmem_folio_free, .migrate_to_ram = nouveau_dmem_migrate_to_ram, }; diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 78e108e47254..ee74b75d3e1f 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -200,8 +200,9 @@ static const struct attribute_group p2pmem_group = { .name = "p2pmem", }; -static void p2pdma_page_free(struct page *page) +static void p2pdma_folio_free(struct folio *folio) { + struct page *page = &folio->page; struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page)); /* safe to dereference while a reference is held to the percpu ref */ struct pci_p2pdma *p2pdma = @@ -214,7 +215,7 @@ static void p2pdma_page_free(struct page *page) } static const struct dev_pagemap_ops p2pdma_pgmap_ops = { - .page_free = p2pdma_page_free, + .folio_free = p2pdma_folio_free, }; static void pci_p2pdma_release(void *data) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index d2487a19cba2..cd28d1666801 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -77,11 +77,11 @@ enum memory_type { struct dev_pagemap_ops { /* - * Called once the page refcount reaches 0. The reference count will be + * Called once the folio refcount reaches 0. 
The reference count will be * reset to one by the core code after the method is called to prepare - * for handing out the page again. + * for handing out the folio again. */ - void (*page_free)(struct page *page); + void (*folio_free)(struct folio *folio); /* * Used for private (un-addressable) device memory only. Must migrate diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 24d82121cde8..9dbf265d1036 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1374,8 +1374,9 @@ static const struct file_operations dmirror_fops = { .owner = THIS_MODULE, }; -static void dmirror_devmem_free(struct page *page) +static void dmirror_devmem_free(struct folio *folio) { + struct page *page = &folio->page; struct page *rpage = BACKING_PAGE(page); struct dmirror_device *mdevice; @@ -1438,7 +1439,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) } static const struct dev_pagemap_ops dmirror_devmem_ops = { - .page_free = dmirror_devmem_free, + .folio_free = dmirror_devmem_free, .migrate_to_ram = dmirror_devmem_fault, }; diff --git a/mm/memremap.c b/mm/memremap.c index e45dfb568710..4c2e0d68eb27 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -289,8 +289,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN(1, "Missing migrate_to_ram method\n"); return ERR_PTR(-EINVAL); } - if (!pgmap->ops->page_free) { - WARN(1, "Missing page_free method\n"); + if (!pgmap->ops->folio_free) { + WARN(1, "Missing folio_free method\n"); return ERR_PTR(-EINVAL); } if (!pgmap->owner) { @@ -299,8 +299,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } break; case MEMORY_DEVICE_COHERENT: - if (!pgmap->ops->page_free) { - WARN(1, "Missing page_free method\n"); + if (!pgmap->ops->folio_free) { + WARN(1, "Missing folio_free method\n"); return ERR_PTR(-EINVAL); } if (!pgmap->owner) { @@ -453,9 +453,9 @@ void free_zone_device_folio(struct folio *folio) switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: case MEMORY_DEVICE_COHERENT: - if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) + if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free)) break; - pgmap->ops->page_free(&folio->page); + pgmap->ops->folio_free(folio); percpu_ref_put_many(&folio->pgmap->ref, nr); break; @@ -472,9 +472,9 @@ void free_zone_device_folio(struct folio *folio) break; case MEMORY_DEVICE_PCI_P2PDMA: - if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) + if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free)) break; - pgmap->ops->page_free(folio_page(folio, 0)); + pgmap->ops->folio_free(folio); break; } } From 368076f52ebeecd33e10a9f80905d7508b6b6149 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:54 +1000 Subject: [PATCH 224/321] mm/huge_memory: add device-private THP support to PMD operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend core huge page management functions to handle device-private THP entries. This enables proper handling of large device-private folios in fundamental MM operations. 
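In rough sketch form, the common change is that a non-present PMD may now carry a device-private entry and not only a migration entry; this is a simplified illustration, the real hunks are below:

	if (unlikely(is_swap_pmd(pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		if (is_device_private_entry(entry)) {
			struct folio *folio = pfn_swap_entry_folio(entry);

			/*
			 * New: handle the device-private THP (folio reference,
			 * anon rmap, mm counters) instead of assuming that any
			 * swap PMD is a migration entry.
			 */
		} else {
			/* existing PMD migration entry handling */
		}
	}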
The following functions have been updated: - copy_huge_pmd(): Handle device-private entries during fork/clone - zap_huge_pmd(): Properly free device-private THP during munmap - change_huge_pmd(): Support protection changes on device-private THP - __pte_offset_map(): Add device-private entry awareness Link: https://lkml.kernel.org/r/20251001065707.920170-4-balbirs@nvidia.com Signed-off-by: Matthew Brost Signed-off-by: Balbir Singh Acked-by: Zi Yan Cc: David Hildenbrand Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/swapops.h | 32 +++++++++++++++++++++++ mm/huge_memory.c | 56 ++++++++++++++++++++++++++++++++++------- mm/pgtable-generic.c | 2 +- 3 files changed, 80 insertions(+), 10 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 64ea151a7ae3..2687928a8146 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -594,10 +594,42 @@ static inline int is_pmd_migration_entry(pmd_t pmd) } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) + +/** + * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry + * @pmd: The PMD to check + * + * Returns true if the PMD contains a swap entry that represents a device private + * page mapping. This is used for zone device private pages that have been + * swapped out but still need special handling during various memory management + * operations. 
+ * + * Return: 1 if PMD contains device private entry, 0 otherwise + */ +static inline int is_pmd_device_private_entry(pmd_t pmd) +{ + return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd)); +} + +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +static inline int is_pmd_device_private_entry(pmd_t pmd) +{ + return 0; +} + +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } +static inline int is_pmd_non_present_folio_entry(pmd_t pmd) +{ + return is_pmd_migration_entry(pmd) || is_pmd_device_private_entry(pmd); +} + #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3ae16b4a82de..19f0ee7373ae 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1704,17 +1704,45 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(is_swap_pmd(pmd))) { swp_entry_t entry = pmd_to_swp_entry(pmd); - VM_BUG_ON(!is_pmd_migration_entry(pmd)); - if (!is_readable_migration_entry(entry)) { - entry = make_readable_migration_entry( - swp_offset(entry)); + VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd)); + + if (is_writable_migration_entry(entry) || + is_readable_exclusive_migration_entry(entry)) { + entry = make_readable_migration_entry(swp_offset(entry)); pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) pmd = pmd_swp_mksoft_dirty(pmd); if (pmd_swp_uffd_wp(*src_pmd)) pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); + } else if (is_device_private_entry(entry)) { + /* + * For device private entries, since there are no + * read exclusive entries, writable = !readable + */ + if (is_writable_device_private_entry(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + pmd = swp_entry_to_pmd(entry); + + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + if (pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + + src_folio = pfn_swap_entry_folio(entry); + VM_WARN_ON(!folio_test_large(src_folio)); + + folio_get(src_folio); + /* + * folio_try_dup_anon_rmap_pmd does not fail for + * device private entries. 
+ */ + folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page, + dst_vma, src_vma); } + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); @@ -2212,15 +2240,16 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_remove_rmap_pmd(folio, page, vma); WARN_ON_ONCE(folio_mapcount(folio) < 0); VM_BUG_ON_PAGE(!PageHead(page), page); - } else if (thp_migration_supported()) { + } else if (is_pmd_non_present_folio_entry(orig_pmd)) { swp_entry_t entry; - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); folio = pfn_swap_entry_folio(entry); flush_needed = 0; - } else - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + + if (!thp_migration_supported()) + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + } if (folio_test_anon(folio)) { zap_deposited_table(tlb->mm, pmd); @@ -2240,6 +2269,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_mark_accessed(folio); } + if (folio_is_device_private(folio)) { + folio_remove_rmap_pmd(folio, &folio->page, vma); + WARN_ON_ONCE(folio_mapcount(folio) < 0); + folio_put(folio); + } + spin_unlock(ptl); if (flush_needed) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); @@ -2368,7 +2403,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio = pfn_swap_entry_folio(entry); pmd_t newpmd; - VM_BUG_ON(!is_pmd_migration_entry(*pmd)); + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd)); if (is_writable_migration_entry(entry)) { /* * A protection check is difficult so @@ -2381,6 +2416,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); + } else if (is_writable_device_private_entry(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + newpmd = swp_entry_to_pmd(entry); } else { newpmd = *pmd; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e46f0cf2159c..d3aec7a9926a 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -292,7 +292,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) if (pmdvalp) *pmdvalp = pmdval; - if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) + if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval))) goto nomap; if (unlikely(pmd_trans_huge(pmdval))) goto nomap; From 65edfda6f3f2e58f757485a056e4f1775a1404a8 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:55 +1000 Subject: [PATCH 225/321] mm/rmap: extend rmap and migration support device-private entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add device-private THP support to reverse mapping infrastructure, enabling proper handling during migration and walk operations. The key changes are: - add_migration_pmd()/remove_migration_pmd(): Handle device-private entries during folio migration and splitting - page_vma_mapped_walk(): Recognize device-private THP entries during VMA traversal operations This change supports folio splitting and migration operations on device-private entries. 
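The recurring pattern in these paths is to compute the pfn whether or not the PMD is present, for example (condensed from the hunks below; pmdp is the pmd_t * the walker already holds):

	pmd_t pmdval = pmdp_get(pmdp);
	unsigned long pfn;

	if (likely(pmd_present(pmdval)))
		pfn = pmd_pfn(pmdval);
	else	/* migration or device-private entry */
		pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval));
	/* the folio is then looked up from pfn as before */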
[balbirs@nvidia.com: fix override of entry in remove_migration_pmd] Link: https://lkml.kernel.org/r/20251114012153.2634497-2-balbirs@nvidia.com [balbirs@nvidia.com: follow pattern used in remove_migration_pte()] Link: https://lkml.kernel.org/r/20251115002835.3515194-1-balbirs@nvidia.com Link: https://lkml.kernel.org/r/20251001065707.920170-5-balbirs@nvidia.com Signed-off-by: Balbir Singh Reviewed-by: SeongJae Park Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 20 +++++++++++++++++--- mm/huge_memory.c | 23 ++++++++++++++++++++++- mm/page_idle.c | 7 +++++-- mm/page_vma_mapped.c | 7 +++++++ mm/rmap.c | 24 ++++++++++++++++++++---- 5 files changed, 71 insertions(+), 10 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index f61d6dde13dc..971df8a16ba4 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -75,12 +75,24 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - struct folio *folio = damon_get_folio(pmd_pfn(pmdp_get(pmd))); + pmd_t pmdval = pmdp_get(pmd); + struct folio *folio; + bool young = false; + unsigned long pfn; + if (likely(pmd_present(pmdval))) + pfn = pmd_pfn(pmdval); + else + pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval)); + + folio = damon_get_folio(pfn); if (!folio) return; - if (pmdp_clear_young_notify(vma, addr, pmd)) + if (likely(pmd_present(pmdval))) + young |= pmdp_clear_young_notify(vma, addr, pmd); + young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + HPAGE_PMD_SIZE); + if (young) folio_set_young(folio); folio_set_idle(folio); @@ -199,7 +211,9 @@ static bool damon_folio_young_one(struct folio *folio, mmu_notifier_test_young(vma->vm_mm, addr); } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - *accessed = pmd_young(pmdp_get(pvmw.pmd)) || + pmd_t pmd = pmdp_get(pvmw.pmd); + + *accessed = (pmd_present(pmd) && pmd_young(pmd)) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); #else diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 19f0ee7373ae..f9b6d3d0f643 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4583,7 +4583,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, return 0; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); - pmdval = pmdp_invalidate(vma, address, pvmw->pmd); + if (unlikely(!pmd_present(*pvmw->pmd))) + pmdval = pmdp_huge_get_and_clear(vma->vm_mm, address, pvmw->pmd); + else + pmdval = pmdp_invalidate(vma, address, pvmw->pmd); /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. 
*/ anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); @@ -4633,6 +4636,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) entry = pmd_to_swp_entry(*pvmw->pmd); folio_get(folio); pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); + if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) @@ -4645,6 +4649,23 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pmde = pmd_mkdirty(pmde); + if (folio_is_device_private(folio)) { + swp_entry_t entry; + + if (pmd_write(pmde)) + entry = make_writable_device_private_entry( + page_to_pfn(new)); + else + entry = make_readable_device_private_entry( + page_to_pfn(new)); + pmde = swp_entry_to_pmd(entry); + + if (pmd_swp_soft_dirty(*pvmw->pmd)) + pmde = pmd_swp_mksoft_dirty(pmde); + if (pmd_swp_uffd_wp(*pvmw->pmd)) + pmde = pmd_swp_mkuffd_wp(pmde); + } + if (folio_test_anon(folio)) { rmap_t rmap_flags = RMAP_NONE; diff --git a/mm/page_idle.c b/mm/page_idle.c index 9bf573d22e87..96bb94c7b6c3 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -71,8 +71,11 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio, referenced |= ptep_test_and_clear_young(vma, addr, pvmw.pte); referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE); } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - if (pmdp_clear_young_notify(vma, addr, pvmw.pmd)) - referenced = true; + pmd_t pmdval = pmdp_get(pvmw.pmd); + + if (likely(pmd_present(pmdval))) + referenced |= pmdp_clear_young_notify(vma, addr, pvmw.pmd); + referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE); } else { /* unexpected pmd-mapped page? */ WARN_ON_ONCE(1); diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index c498a91b6706..137ce27ff68c 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -277,6 +277,13 @@ restart: * cannot return prematurely, while zap_huge_pmd() has * cleared *pmd but not decremented compound_mapcount(). */ + swp_entry_t entry = pmd_to_swp_entry(pmde); + + if (is_device_private_entry(entry)) { + pvmw->ptl = pmd_lock(mm, pvmw->pmd); + return true; + } + if ((pvmw->flags & PVMW_SYNC) && thp_vma_suitable_order(vma, pvmw->address, PMD_ORDER) && diff --git a/mm/rmap.c b/mm/rmap.c index eaed5dfbb9b7..1954c538a991 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1022,9 +1022,16 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t *pmd = pvmw->pmd; - pmd_t entry; + pmd_t entry = pmdp_get(pmd); - if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) + /* + * Please see the comment above (!pte_present). + * A non present PMD is not writable from a CPU + * perspective. 
+ */ + if (!pmd_present(entry)) + continue; + if (!pmd_dirty(entry) && !pmd_write(entry)) continue; flush_cache_range(vma, address, @@ -2319,6 +2326,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { /* PMD-mapped THP migration entry */ if (!pvmw.pte) { + __maybe_unused unsigned long pfn; + __maybe_unused pmd_t pmdval; + if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, true); @@ -2327,8 +2337,14 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, break; } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - subpage = folio_page(folio, - pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); + pmdval = pmdp_get(pvmw.pmd); + if (likely(pmd_present(pmdval))) + pfn = pmd_pfn(pmdval); + else + pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval)); + + subpage = folio_page(folio, pfn - folio_pfn(folio)); + VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); From 1462872900233e58fb2f9fc8babc24a0d5c03fd9 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:56 +1000 Subject: [PATCH 226/321] mm/huge_memory: implement device-private THP splitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for splitting device-private THP folios, enabling fallback to smaller page sizes when large page allocation or migration fails. Key changes: - split_huge_pmd(): Handle device-private PMD entries during splitting - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they don't support shared zero page semantics Link: https://lkml.kernel.org/r/20251001065707.920170-6-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. 
Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- mm/huge_memory.c | 87 +++++++++++++++++++++++++++++++++++++++++------- mm/migrate.c | 1 + 2 files changed, 76 insertions(+), 12 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9b6d3d0f643..23db562cde07 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2842,16 +2842,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t old_pmd, _pmd; - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; + bool soft_dirty, uffd_wp = false, young = false, write = false; bool anon_exclusive = false, dirty = false; unsigned long addr; pte_t *pte; int i; + swp_entry_t entry; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); + + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -2899,20 +2901,51 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } - pmd_migration = is_pmd_migration_entry(*pmd); - if (unlikely(pmd_migration)) { - swp_entry_t entry; + if (is_pmd_migration_entry(*pmd)) { old_pmd = *pmd; entry = pmd_to_swp_entry(old_pmd); page = pfn_swap_entry_to_page(entry); + folio = page_folio(page); + + soft_dirty = pmd_swp_soft_dirty(old_pmd); + uffd_wp = pmd_swp_uffd_wp(old_pmd); + write = is_writable_migration_entry(entry); if (PageAnon(page)) anon_exclusive = is_readable_exclusive_migration_entry(entry); young = is_migration_entry_young(entry); dirty = is_migration_entry_dirty(entry); + } else if (is_pmd_device_private_entry(*pmd)) { + old_pmd = *pmd; + entry = pmd_to_swp_entry(old_pmd); + page = pfn_swap_entry_to_page(entry); + folio = page_folio(page); + soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); + + write = is_writable_device_private_entry(entry); + anon_exclusive = PageAnonExclusive(page); + + /* + * Device private THP should be treated the same as regular + * folios w.r.t anon exclusive handling. See the comments for + * folio handling and anon_exclusive below. + */ + if (freeze && anon_exclusive && + folio_try_share_anon_rmap_pmd(folio, page)) + freeze = false; + if (!freeze) { + rmap_t rmap_flags = RMAP_NONE; + + folio_ref_add(folio, HPAGE_PMD_NR - 1); + if (anon_exclusive) + rmap_flags |= RMAP_EXCLUSIVE; + + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, + vma, haddr, rmap_flags); + } } else { /* * Up to this point the pmd is present and huge and userland has @@ -2996,11 +3029,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * Note that NUMA hinting access restrictions are not transferred to * avoid any possibility of altering permissions across VMAs. 
*/ - if (freeze || pmd_migration) { - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { - pte_t entry; - swp_entry_t swp_entry; + if (freeze || is_pmd_migration_entry(old_pmd)) { + pte_t entry; + swp_entry_t swp_entry; + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { if (write) swp_entry = make_writable_migration_entry( page_to_pfn(page + i)); @@ -3019,7 +3052,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + set_pte_at(mm, addr, pte + i, entry); + } + } else if (is_pmd_device_private_entry(old_pmd)) { + pte_t entry; + swp_entry_t swp_entry; + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + /* + * anon_exclusive was already propagated to the relevant + * pages corresponding to the pte entries when freeze + * is false. + */ + if (write) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page + i)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page + i)); + /* + * Young and dirty bits are not progated via swp_entry + */ + entry = swp_entry_to_pte(swp_entry); + if (soft_dirty) + entry = pte_swp_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_swp_mkuffd_wp(entry); VM_WARN_ON(!pte_none(ptep_get(pte + i))); set_pte_at(mm, addr, pte + i, entry); } @@ -3046,7 +3105,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } pte_unmap(pte); - if (!pmd_migration) + if (!is_pmd_migration_entry(*pmd)) folio_remove_rmap_pmd(folio, page, vma); if (freeze) put_page(page); @@ -3059,7 +3118,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze) { VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) + if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd)) __split_huge_pmd_locked(vma, pmd, address, freeze); } @@ -3238,6 +3297,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); lockdep_assert_held(&lruvec->lru_lock); + if (folio_is_device_private(folio)) + return; + if (list) { /* page reclaim is reclaiming a huge page */ VM_WARN_ON(folio_test_lru(folio)); @@ -3842,8 +3904,9 @@ fail: if (nr_shmem_dropped) shmem_uncharge(mapping->host, nr_shmem_dropped); - if (!ret && is_anon) + if (!ret && is_anon && !folio_is_device_private(folio)) remap_flags = RMP_USE_SHARED_ZEROPAGE; + remap_page(folio, 1 << order, remap_flags); /* diff --git a/mm/migrate.c b/mm/migrate.c index 08d034dbeb98..d8f6cd14cdb7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -307,6 +307,7 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(pte_present(old_pte), page); + VM_WARN_ON_ONCE_FOLIO(folio_is_device_private(folio), folio); if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) || mm_forbids_zeropage(pvmw->vma->vm_mm)) From 022a12deda53c983755c08e073a3c028a6850a23 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:57 +1000 Subject: [PATCH 227/321] mm/migrate_device: handle partially mapped folios during collection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend migrate_vma_collect_pmd() to handle partially mapped large folios that require 
splitting before migration can proceed. During PTE walk in the collection phase, if a large folio is only partially mapped in the migration range, it must be split to ensure the folio is correctly migrated. [matthew.brost@intel.com: handle partially mapped folios during split] Link: https://lkml.kernel.org/r/20251120230825.181072-1-matthew.brost@intel.com Link: https://lkml.kernel.org/r/20251001065707.920170-7-balbirs@nvidia.com Signed-off-by: Balbir Singh Signed-off-by: Matthew Brost Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Francois Dugast Signed-off-by: Andrew Morton --- mm/migrate_device.c | 70 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index abd9f6850db6..e6bcd6dc5129 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -54,6 +54,55 @@ static int migrate_vma_collect_hole(unsigned long start, return 0; } +/** + * migrate_vma_split_folio() - Helper function to split a THP folio + * @folio: the folio to split + * @fault_page: struct page associated with the fault if any + * + * Returns 0 on success + */ +static int migrate_vma_split_folio(struct folio *folio, + struct page *fault_page) +{ + int ret; + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL; + struct folio *new_fault_folio = NULL; + + if (folio != fault_folio) { + folio_get(folio); + folio_lock(folio); + } + + ret = split_folio(folio); + if (ret) { + if (folio != fault_folio) { + folio_unlock(folio); + folio_put(folio); + } + return ret; + } + + new_fault_folio = fault_page ? page_folio(fault_page) : NULL; + + /* + * Ensure the lock is held on the correct + * folio after the split + */ + if (!new_fault_folio) { + folio_unlock(folio); + folio_put(folio); + } else if (folio != new_fault_folio) { + if (new_fault_folio != fault_folio) { + folio_get(new_fault_folio); + folio_lock(new_fault_folio); + } + folio_unlock(folio); + folio_put(folio); + } + + return 0; +} + static int migrate_vma_collect_pmd(pmd_t *pmdp, unsigned long start, unsigned long end, @@ -107,10 +156,11 @@ again: } } - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + ptep = pte_offset_map_lock(mm, pmdp, start, &ptl); if (!ptep) goto again; arch_enter_lazy_mmu_mode(); + ptep += (addr - start) / PAGE_SIZE; for (; addr < end; addr += PAGE_SIZE, ptep++) { struct dev_pagemap *pgmap; @@ -171,6 +221,24 @@ again: pgmap->owner != migrate->pgmap_owner) goto next; } + folio = page ? page_folio(page) : NULL; + if (folio && folio_test_large(folio)) { + int ret; + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep, ptl); + ret = migrate_vma_split_folio(folio, + migrate->fault_page); + + if (ret) { + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return migrate_vma_collect_skip(addr, end, walk); + } + + goto again; + } mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? 
MIGRATE_PFN_WRITE : 0; } From a30b48bf1b244f11bf9b6d20cdccfe0c2264130c Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:58 +1000 Subject: [PATCH 228/321] mm/migrate_device: implement THP migration of zone device pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIGRATE_VMA_SELECT_COMPOUND will be used to select THP pages during migrate_vma_setup() and MIGRATE_PFN_COMPOUND will make migrating device pages as compound pages during device pfn migration. migrate_device code paths go through the collect, setup and finalize phases of migration. The entries in src and dst arrays passed to these functions still remain at a PAGE_SIZE granularity. When a compound page is passed, the first entry has the PFN along with MIGRATE_PFN_COMPOUND and other flags set (MIGRATE_PFN_MIGRATE, MIGRATE_PFN_VALID), the remaining entries (HPAGE_PMD_NR - 1) are filled with 0's. This representation allows for the compound page to be split into smaller page sizes. migrate_vma_collect_hole(), migrate_vma_collect_pmd() are now THP page aware. Two new helper functions migrate_vma_collect_huge_pmd() and migrate_vma_insert_huge_pmd_page() have been added. migrate_vma_collect_huge_pmd() can collect THP pages, but if for some reason this fails, there is fallback support to split the folio and migrate it. migrate_vma_insert_huge_pmd_page() closely follows the logic of migrate_vma_insert_page() Support for splitting pages as needed for migration will follow in later patches in this series. Link: https://lkml.kernel.org/r/20251001065707.920170-8-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. 
Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/migrate.h | 2 + mm/migrate_device.c | 471 ++++++++++++++++++++++++++++++++++------ 2 files changed, 409 insertions(+), 64 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 1f0ac122c3bf..41b4cc05a450 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -125,6 +125,7 @@ static inline int migrate_misplaced_folio(struct folio *folio, int node) #define MIGRATE_PFN_VALID (1UL << 0) #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_WRITE (1UL << 3) +#define MIGRATE_PFN_COMPOUND (1UL << 4) #define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) @@ -143,6 +144,7 @@ enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2, + MIGRATE_VMA_SELECT_COMPOUND = 1 << 3, }; struct migrate_vma { diff --git a/mm/migrate_device.c b/mm/migrate_device.c index e6bcd6dc5129..a0a315f3572a 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "internal.h" @@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start, if (!vma_is_anonymous(walk->vma)) return migrate_vma_collect_skip(start, end, walk); + if (thp_migration_supported() && + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE | + MIGRATE_PFN_COMPOUND; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + migrate->cpages++; + + /* + * Collect the remaining entries as holes, in case we + * need to split later + */ + return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); + } + for (addr = start; addr < end; addr += PAGE_SIZE) { migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; @@ -103,57 +121,151 @@ static int migrate_vma_split_folio(struct folio *folio, return 0; } +/** migrate_vma_collect_huge_pmd - collect THP pages without splitting the + * folio for device private pages. 
+ * @pmdp: pointer to pmd entry + * @start: start address of the range for migration + * @end: end address of the range for migration + * @walk: mm_walk callback structure + * @fault_folio: folio associated with the fault if any + * + * Collect the huge pmd entry at @pmdp for migration and set the + * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that + * migration will occur at HPAGE_PMD granularity + */ +static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, + unsigned long end, struct mm_walk *walk, + struct folio *fault_folio) +{ + struct mm_struct *mm = walk->mm; + struct folio *folio; + struct migrate_vma *migrate = walk->private; + spinlock_t *ptl; + swp_entry_t entry; + int ret; + unsigned long write = 0; + + ptl = pmd_lock(mm, pmdp); + if (pmd_none(*pmdp)) { + spin_unlock(ptl); + return migrate_vma_collect_hole(start, end, -1, walk); + } + + if (pmd_trans_huge(*pmdp)) { + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { + spin_unlock(ptl); + return migrate_vma_collect_skip(start, end, walk); + } + + folio = pmd_folio(*pmdp); + if (is_huge_zero_folio(folio)) { + spin_unlock(ptl); + return migrate_vma_collect_hole(start, end, -1, walk); + } + if (pmd_write(*pmdp)) + write = MIGRATE_PFN_WRITE; + } else if (!pmd_present(*pmdp)) { + entry = pmd_to_swp_entry(*pmdp); + folio = pfn_swap_entry_folio(entry); + + if (!is_device_private_entry(entry) || + !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || + (folio->pgmap->owner != migrate->pgmap_owner)) { + spin_unlock(ptl); + return migrate_vma_collect_skip(start, end, walk); + } + + if (is_migration_entry(entry)) { + migration_entry_wait_on_locked(entry, ptl); + spin_unlock(ptl); + return -EAGAIN; + } + + if (is_writable_device_private_entry(entry)) + write = MIGRATE_PFN_WRITE; + } else { + spin_unlock(ptl); + return -EAGAIN; + } + + folio_get(folio); + if (folio != fault_folio && unlikely(!folio_trylock(folio))) { + spin_unlock(ptl); + folio_put(folio); + return migrate_vma_collect_skip(start, end, walk); + } + + if (thp_migration_supported() && + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { + + struct page_vma_mapped_walk pvmw = { + .ptl = ptl, + .address = start, + .pmd = pmdp, + .vma = walk->vma, + }; + + unsigned long pfn = page_to_pfn(folio_page(folio, 0)); + + migrate->src[migrate->npages] = migrate_pfn(pfn) | write + | MIGRATE_PFN_MIGRATE + | MIGRATE_PFN_COMPOUND; + migrate->dst[migrate->npages++] = 0; + migrate->cpages++; + ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0)); + if (ret) { + migrate->npages--; + migrate->cpages--; + migrate->src[migrate->npages] = 0; + migrate->dst[migrate->npages] = 0; + goto fallback; + } + migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); + spin_unlock(ptl); + return 0; + } + +fallback: + spin_unlock(ptl); + if (!folio_test_large(folio)) + goto done; + ret = split_folio(folio); + if (fault_folio != folio) + folio_unlock(folio); + folio_put(folio); + if (ret) + return migrate_vma_collect_skip(start, end, walk); + if (pmd_none(pmdp_get_lockless(pmdp))) + return migrate_vma_collect_hole(start, end, -1, walk); + +done: + return -ENOENT; +} + static int migrate_vma_collect_pmd(pmd_t *pmdp, unsigned long start, unsigned long end, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; - struct folio *fault_folio = migrate->fault_page ? 
- page_folio(migrate->fault_page) : NULL; struct vm_area_struct *vma = walk->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = start, unmapped = 0; spinlock_t *ptl; + struct folio *fault_folio = migrate->fault_page ? + page_folio(migrate->fault_page) : NULL; pte_t *ptep; again: - if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, -1, walk); + if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) { + int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio); - if (pmd_trans_huge(*pmdp)) { - struct folio *folio; - - ptl = pmd_lock(mm, pmdp); - if (unlikely(!pmd_trans_huge(*pmdp))) { - spin_unlock(ptl); + if (ret == -EAGAIN) goto again; - } - - folio = pmd_folio(*pmdp); - if (is_huge_zero_folio(folio)) { - spin_unlock(ptl); - split_huge_pmd(vma, pmdp, addr); - } else { - int ret; - - folio_get(folio); - spin_unlock(ptl); - /* FIXME: we don't expect THP for fault_folio */ - if (WARN_ON_ONCE(fault_folio == folio)) - return migrate_vma_collect_skip(start, end, - walk); - if (unlikely(!folio_trylock(folio))) - return migrate_vma_collect_skip(start, end, - walk); - ret = split_folio(folio); - if (fault_folio != folio) - folio_unlock(folio); - folio_put(folio); - if (ret) - return migrate_vma_collect_skip(start, end, - walk); - } + if (ret == 0) + return 0; } ptep = pte_offset_map_lock(mm, pmdp, start, &ptl); @@ -243,8 +355,7 @@ again: mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } - /* FIXME support THP */ - if (!page || !page->mapping || PageTransCompound(page)) { + if (!page || !page->mapping) { mpfn = 0; goto next; } @@ -415,14 +526,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) */ int extra = 1 + (page == fault_page); - /* - * FIXME support THP (transparent huge page), it is bit more complex to - * check them than regular pages, because they can be mapped with a pmd - * or with a pte (split pte mapping). - */ - if (folio_test_large(folio)) - return false; - /* Page from ZONE_DEVICE have one extra reference */ if (folio_is_zone_device(folio)) extra++; @@ -453,17 +556,24 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, lru_add_drain(); - for (i = 0; i < npages; i++) { + for (i = 0; i < npages; ) { struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; + unsigned int nr = 1; if (!page) { if (src_pfns[i] & MIGRATE_PFN_MIGRATE) unmapped++; - continue; + goto next; } folio = page_folio(page); + nr = folio_nr_pages(folio); + + if (nr > 1) + src_pfns[i] |= MIGRATE_PFN_COMPOUND; + + /* ZONE_DEVICE folios are not on LRU */ if (!folio_is_zone_device(folio)) { if (!folio_test_lru(folio) && allow_drain) { @@ -475,7 +585,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, if (!folio_isolate_lru(folio)) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; - continue; + goto next; } /* Drop the reference we took in collect */ @@ -494,10 +604,12 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; - continue; + goto next; } unmapped++; +next: + i += nr; } for (i = 0; i < npages && restore; i++) { @@ -643,6 +755,160 @@ int migrate_vma_setup(struct migrate_vma *args) } EXPORT_SYMBOL(migrate_vma_setup); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +/** + * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm + * at @addr. folio is already allocated as a part of the migration process with + * large page. + * + * @page needs to be initialized and setup after it's allocated. 
The code bits + * here follow closely the code in __do_huge_pmd_anonymous_page(). This API does + * not support THP zero pages. + * + * @migrate: migrate_vma arguments + * @addr: address where the folio will be inserted + * @page: page to be inserted at @addr + * @src: src pfn which is being migrated + * @pmdp: pointer to the pmd + */ +static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src, + pmd_t *pmdp) +{ + struct vm_area_struct *vma = migrate->vma; + gfp_t gfp = vma_thp_gfp_mask(vma); + struct folio *folio = page_folio(page); + int ret; + vm_fault_t csa_ret; + spinlock_t *ptl; + pgtable_t pgtable; + pmd_t entry; + bool flush = false; + unsigned long i; + + VM_WARN_ON_FOLIO(!folio, folio); + VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp)); + + if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER)) + return -EINVAL; + + ret = anon_vma_prepare(vma); + if (ret) + return ret; + + folio_set_order(folio, HPAGE_PMD_ORDER); + folio_set_large_rmappable(folio); + + if (mem_cgroup_charge(folio, migrate->vma->vm_mm, gfp)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + ret = -ENOMEM; + goto abort; + } + + __folio_mark_uptodate(folio); + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) + goto abort; + + if (folio_is_device_private(folio)) { + swp_entry_t swp_entry; + + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); + entry = swp_entry_to_pmd(swp_entry); + } else { + if (folio_is_zone_device(folio) && + !folio_is_device_coherent(folio)) { + goto abort; + } + entry = folio_mk_pmd(folio, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pmd_mkwrite(pmd_mkdirty(entry), vma); + } + + ptl = pmd_lock(vma->vm_mm, pmdp); + csa_ret = check_stable_address_space(vma->vm_mm); + if (csa_ret) + goto abort; + + /* + * Check for userfaultfd but do not deliver the fault. Instead, + * just back off. 
+ */ + if (userfaultfd_missing(vma)) + goto unlock_abort; + + if (!pmd_none(*pmdp)) { + if (!is_huge_zero_pmd(*pmdp)) + goto unlock_abort; + flush = true; + } else if (!pmd_none(*pmdp)) + goto unlock_abort; + + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); + if (!folio_is_zone_device(folio)) + folio_add_lru_vma(folio, vma); + folio_get(folio); + + if (flush) { + pte_free(vma->vm_mm, pgtable); + flush_cache_page(vma, addr, addr + HPAGE_PMD_SIZE); + pmdp_invalidate(vma, addr, pmdp); + } else { + pgtable_trans_huge_deposit(vma->vm_mm, pmdp, pgtable); + mm_inc_nr_ptes(vma->vm_mm); + } + set_pmd_at(vma->vm_mm, addr, pmdp, entry); + update_mmu_cache_pmd(vma, addr, pmdp); + + spin_unlock(ptl); + + count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + + return 0; + +unlock_abort: + spin_unlock(ptl); +abort: + for (i = 0; i < HPAGE_PMD_NR; i++) + src[i] &= ~MIGRATE_PFN_MIGRATE; + return 0; +} +#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ +static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src, + pmd_t *pmdp) +{ + return 0; +} +#endif + +static unsigned long migrate_vma_nr_pages(unsigned long *src) +{ + unsigned long nr = 1; +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (*src & MIGRATE_PFN_COMPOUND) + nr = HPAGE_PMD_NR; +#else + if (*src & MIGRATE_PFN_COMPOUND) + VM_WARN_ON_ONCE(true); +#endif + return nr; +} + /* * This code closely matches the code in: * __handle_mm_fault() @@ -653,9 +919,10 @@ EXPORT_SYMBOL(migrate_vma_setup); */ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, - struct page *page, + unsigned long *dst, unsigned long *src) { + struct page *page = migrate_pfn_to_page(*dst); struct folio *folio = page_folio(page); struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; @@ -683,8 +950,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, pmdp = pmd_alloc(mm, pudp, addr); if (!pmdp) goto abort; - if (pmd_trans_huge(*pmdp)) - goto abort; + + if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) { + int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page, + src, pmdp); + if (ret) + goto abort; + return; + } + + if (!pmd_none(*pmdp)) { + if (pmd_trans_huge(*pmdp)) { + if (!is_huge_zero_pmd(*pmdp)) + goto abort; + split_huge_pmd(vma, pmdp, addr); + } else if (pmd_leaf(*pmdp)) + goto abort; + } + if (pte_alloc(mm, pmdp)) goto abort; if (unlikely(anon_vma_prepare(vma))) @@ -775,23 +1058,24 @@ static void __migrate_device_pages(unsigned long *src_pfns, unsigned long i; bool notified = false; - for (i = 0; i < npages; i++) { + for (i = 0; i < npages; ) { struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); struct page *page = migrate_pfn_to_page(src_pfns[i]); struct address_space *mapping; struct folio *newfolio, *folio; int r, extra_cnt = 0; + unsigned long nr = 1; if (!newpage) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - continue; + goto next; } if (!page) { unsigned long addr; if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) - continue; + goto next; /* * The only time there is no vma is when called from @@ -809,15 +1093,47 @@ static void __migrate_device_pages(unsigned long *src_pfns, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); } - migrate_vma_insert_page(migrate, addr, newpage, + + if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) && + 
(!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) { + nr = migrate_vma_nr_pages(&src_pfns[i]); + src_pfns[i] &= ~MIGRATE_PFN_COMPOUND; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + goto next; + } + + migrate_vma_insert_page(migrate, addr, &dst_pfns[i], &src_pfns[i]); - continue; + goto next; } newfolio = page_folio(newpage); folio = page_folio(page); mapping = folio_mapping(folio); + /* + * If THP migration is enabled, check if both src and dst + * can migrate large pages + */ + if (thp_migration_supported()) { + if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && + (src_pfns[i] & MIGRATE_PFN_COMPOUND) && + !(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) { + + if (!migrate) { + src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE | + MIGRATE_PFN_COMPOUND); + goto next; + } + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && + (dst_pfns[i] & MIGRATE_PFN_COMPOUND) && + !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) { + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + } + } + + if (folio_is_device_private(newfolio) || folio_is_device_coherent(newfolio)) { if (mapping) { @@ -830,7 +1146,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (!folio_test_anon(folio) || !folio_free_swap(folio)) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - continue; + goto next; } } } else if (folio_is_zone_device(newfolio)) { @@ -838,7 +1154,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, * Other types of ZONE_DEVICE page are not supported. */ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - continue; + goto next; } BUG_ON(folio_test_writeback(folio)); @@ -850,6 +1166,8 @@ static void __migrate_device_pages(unsigned long *src_pfns, src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; else folio_migrate_flags(newfolio, folio); +next: + i += nr; } if (notified) @@ -1011,10 +1329,23 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn) int migrate_device_range(unsigned long *src_pfns, unsigned long start, unsigned long npages) { - unsigned long i, pfn; + unsigned long i, j, pfn; + + for (pfn = start, i = 0; i < npages; pfn++, i++) { + struct page *page = pfn_to_page(pfn); + struct folio *folio = page_folio(page); + unsigned int nr = 1; - for (pfn = start, i = 0; i < npages; pfn++, i++) src_pfns[i] = migrate_device_pfn_lock(pfn); + nr = folio_nr_pages(folio); + if (nr > 1) { + src_pfns[i] |= MIGRATE_PFN_COMPOUND; + for (j = 1; j < nr; j++) + src_pfns[i+j] = 0; + i += j - 1; + pfn += j - 1; + } + } migrate_device_unmap(src_pfns, npages, NULL); @@ -1032,10 +1363,22 @@ EXPORT_SYMBOL(migrate_device_range); */ int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages) { - unsigned long i; + unsigned long i, j; + + for (i = 0; i < npages; i++) { + struct page *page = pfn_to_page(src_pfns[i]); + struct folio *folio = page_folio(page); + unsigned int nr = 1; - for (i = 0; i < npages; i++) src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]); + nr = folio_nr_pages(folio); + if (nr > 1) { + src_pfns[i] |= MIGRATE_PFN_COMPOUND; + for (j = 1; j < nr; j++) + src_pfns[i+j] = 0; + i += j - 1; + } + } migrate_device_unmap(src_pfns, npages, NULL); From 4964099163d0524a769d039ffa886bb4515136d0 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:59 +1000 Subject: [PATCH 229/321] mm/memory/fault: add THP fault handling for zone device private pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement CPU fault handling for zone device THP entries through do_huge_pmd_device_private(), enabling transparent migration of device-private large pages back to system memory on CPU 
access. When the CPU accesses a zone device THP entry, the fault handler calls the device driver's migrate_to_ram() callback to migrate the entire large page back to system memory. Link: https://lkml.kernel.org/r/20251001065707.920170-9-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 7 +++++++ mm/huge_memory.c | 38 ++++++++++++++++++++++++++++++++++++++ mm/memory.c | 5 +++-- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fee4cf7fa300..82408c90b396 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -481,6 +481,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); +vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf); + extern struct folio *huge_zero_folio; extern unsigned long huge_zero_pfn; @@ -662,6 +664,11 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) return 0; } +static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) +{ + return 0; +} + static inline bool is_huge_zero_folio(const struct folio *folio) { return false; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 23db562cde07..ded707a50af8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1288,6 +1288,44 @@ release: } +vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = 0; + spinlock_t *ptl; + swp_entry_t swp_entry; + struct page *page; + struct folio *folio; + + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + + ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) { + spin_unlock(ptl); + return 0; + } + + swp_entry = pmd_to_swp_entry(vmf->orig_pmd); + page = pfn_swap_entry_to_page(swp_entry); + folio = page_folio(page); + vmf->page = page; + vmf->pte = NULL; + if (folio_trylock(folio)) { + folio_get(folio); + spin_unlock(ptl); + ret = page_pgmap(page)->ops->migrate_to_ram(vmf); + folio_unlock(folio); + folio_put(folio); + } else { + spin_unlock(ptl); + } + + return ret; +} + /* * always: directly stall for all thp allocations * defer: wake kswapd and fail if not immediately available diff --git a/mm/memory.c b/mm/memory.c index 27bc457b32c2..732414852570 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6345,8 +6345,9 @@ retry_pud: vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); if (unlikely(is_swap_pmd(vmf.orig_pmd))) { - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(vmf.orig_pmd)); + if (is_pmd_device_private_entry(vmf.orig_pmd)) + return do_huge_pmd_device_private(&vmf); + if (is_pmd_migration_entry(vmf.orig_pmd)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; From 775465fd26a325359887f9c3129444fcc76c6298 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:00 +1000 Subject: [PATCH 230/321] lib/test_hmm: add zone device private THP test infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhance the hmm test 
driver (lib/test_hmm) with support for THP pages. A new pool of free_folios() has now been added to the dmirror device, which can be allocated when a request for a THP zone device private page is made. Add compound page awareness to the allocation function during normal migration and fault based migration. These routines also copy folio_nr_pages() when moving data between system memory and device memory. args.src and args.dst used to hold migration entries are now dynamically allocated (as they need to hold HPAGE_PMD_NR entries or more). Split and migrate support will be added in future patches in this series. Link: https://lkml.kernel.org/r/20251001065707.920170-10-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/memremap.h | 12 ++ lib/test_hmm.c | 368 +++++++++++++++++++++++++++++++-------- 2 files changed, 304 insertions(+), 76 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index cd28d1666801..7df4dd037b69 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -177,6 +177,18 @@ static inline bool folio_is_pci_p2pdma(const struct folio *folio) folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } +static inline void *folio_zone_device_data(const struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); + return folio->page.zone_device_data; +} + +static inline void folio_set_zone_device_data(struct folio *folio, void *data) +{ + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); + folio->page.zone_device_data = data; +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 9dbf265d1036..32d402e80bcc 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -119,6 +119,7 @@ struct dmirror_device { unsigned long calloc; unsigned long cfree; struct page *free_pages; + struct folio *free_folios; spinlock_t lock; /* protects the above */ }; @@ -492,7 +493,7 @@ fini: } static int dmirror_allocate_chunk(struct dmirror_device *mdevice, - struct page **ppage) + struct page **ppage, bool is_large) { struct dmirror_chunk *devmem; struct resource *res = NULL; @@ -572,20 +573,45 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, pfn_first, pfn_last); spin_lock(&mdevice->lock); - for (pfn = pfn_first; pfn < pfn_last; pfn++) { + for (pfn = pfn_first; pfn < pfn_last; ) { struct page *page = pfn_to_page(pfn); + if (is_large && IS_ALIGNED(pfn, HPAGE_PMD_NR) + && (pfn + HPAGE_PMD_NR <= pfn_last)) { + page->zone_device_data = mdevice->free_folios; + mdevice->free_folios = page_folio(page); + pfn += HPAGE_PMD_NR; + continue; + } + page->zone_device_data = mdevice->free_pages; mdevice->free_pages = page; + pfn++; } + + ret = 0; if (ppage) { - *ppage = mdevice->free_pages; - mdevice->free_pages = (*ppage)->zone_device_data; - mdevice->calloc++; + if (is_large) { + if (!mdevice->free_folios) { + ret = -ENOMEM; + goto err_unlock; + } + *ppage = folio_page(mdevice->free_folios, 0); + mdevice->free_folios = (*ppage)->zone_device_data; + mdevice->calloc += HPAGE_PMD_NR; + 
} else if (mdevice->free_pages) { + *ppage = mdevice->free_pages; + mdevice->free_pages = (*ppage)->zone_device_data; + mdevice->calloc++; + } else { + ret = -ENOMEM; + goto err_unlock; + } } +err_unlock: spin_unlock(&mdevice->lock); - return 0; + return ret; err_release: mutex_unlock(&mdevice->devmem_lock); @@ -598,10 +624,13 @@ err_devmem: return ret; } -static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) +static struct page *dmirror_devmem_alloc_page(struct dmirror *dmirror, + bool is_large) { struct page *dpage = NULL; struct page *rpage = NULL; + unsigned int order = is_large ? HPAGE_PMD_ORDER : 0; + struct dmirror_device *mdevice = dmirror->mdevice; /* * For ZONE_DEVICE private type, this is a fake device so we allocate @@ -610,49 +639,55 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) * data and ignore rpage. */ if (dmirror_is_private_zone(mdevice)) { - rpage = alloc_page(GFP_HIGHUSER); + rpage = folio_page(folio_alloc(GFP_HIGHUSER, order), 0); if (!rpage) return NULL; } spin_lock(&mdevice->lock); - if (mdevice->free_pages) { + if (is_large && mdevice->free_folios) { + dpage = folio_page(mdevice->free_folios, 0); + mdevice->free_folios = dpage->zone_device_data; + mdevice->calloc += 1 << order; + spin_unlock(&mdevice->lock); + } else if (!is_large && mdevice->free_pages) { dpage = mdevice->free_pages; mdevice->free_pages = dpage->zone_device_data; mdevice->calloc++; spin_unlock(&mdevice->lock); } else { spin_unlock(&mdevice->lock); - if (dmirror_allocate_chunk(mdevice, &dpage)) + if (dmirror_allocate_chunk(mdevice, &dpage, is_large)) goto error; } - zone_device_page_init(dpage, 0); + zone_device_folio_init(page_folio(dpage), order); dpage->zone_device_data = rpage; return dpage; error: if (rpage) - __free_page(rpage); + __free_pages(rpage, order); return NULL; } static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, struct dmirror *dmirror) { - struct dmirror_device *mdevice = dmirror->mdevice; const unsigned long *src = args->src; unsigned long *dst = args->dst; unsigned long addr; - for (addr = args->start; addr < args->end; addr += PAGE_SIZE, - src++, dst++) { + for (addr = args->start; addr < args->end; ) { struct page *spage; struct page *dpage; struct page *rpage; + bool is_large = *src & MIGRATE_PFN_COMPOUND; + int write = (*src & MIGRATE_PFN_WRITE) ? 
MIGRATE_PFN_WRITE : 0; + unsigned long nr = 1; if (!(*src & MIGRATE_PFN_MIGRATE)) - continue; + goto next; /* * Note that spage might be NULL which is OK since it is an @@ -662,17 +697,45 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, if (WARN(spage && is_zone_device_page(spage), "page already in device spage pfn: 0x%lx\n", page_to_pfn(spage))) - continue; + goto next; - dpage = dmirror_devmem_alloc_page(mdevice); - if (!dpage) + dpage = dmirror_devmem_alloc_page(dmirror, is_large); + if (!dpage) { + struct folio *folio; + unsigned long i; + unsigned long spfn = *src >> MIGRATE_PFN_SHIFT; + struct page *src_page; + + if (!is_large) + goto next; + + if (!spage && is_large) { + nr = HPAGE_PMD_NR; + } else { + folio = page_folio(spage); + nr = folio_nr_pages(folio); + } + + for (i = 0; i < nr && addr < args->end; i++) { + dpage = dmirror_devmem_alloc_page(dmirror, false); + rpage = BACKING_PAGE(dpage); + rpage->zone_device_data = dmirror; + + *dst = migrate_pfn(page_to_pfn(dpage)) | write; + src_page = pfn_to_page(spfn + i); + + if (spage) + copy_highpage(rpage, src_page); + else + clear_highpage(rpage); + src++; + dst++; + addr += PAGE_SIZE; + } continue; + } rpage = BACKING_PAGE(dpage); - if (spage) - copy_highpage(rpage, spage); - else - clear_highpage(rpage); /* * Normally, a device would use the page->zone_device_data to @@ -684,10 +747,42 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", page_to_pfn(spage), page_to_pfn(dpage)); - *dst = migrate_pfn(page_to_pfn(dpage)); - if ((*src & MIGRATE_PFN_WRITE) || - (!spage && args->vma->vm_flags & VM_WRITE)) - *dst |= MIGRATE_PFN_WRITE; + + *dst = migrate_pfn(page_to_pfn(dpage)) | write; + + if (is_large) { + int i; + struct folio *folio = page_folio(dpage); + *dst |= MIGRATE_PFN_COMPOUND; + + if (folio_test_large(folio)) { + for (i = 0; i < folio_nr_pages(folio); i++) { + struct page *dst_page = + pfn_to_page(page_to_pfn(rpage) + i); + struct page *src_page = + pfn_to_page(page_to_pfn(spage) + i); + + if (spage) + copy_highpage(dst_page, src_page); + else + clear_highpage(dst_page); + src++; + dst++; + addr += PAGE_SIZE; + } + continue; + } + } + + if (spage) + copy_highpage(rpage, spage); + else + clear_highpage(rpage); + +next: + src++; + dst++; + addr += PAGE_SIZE; } } @@ -734,14 +829,17 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, const unsigned long *src = args->src; const unsigned long *dst = args->dst; unsigned long pfn; + const unsigned long start_pfn = start >> PAGE_SHIFT; + const unsigned long end_pfn = end >> PAGE_SHIFT; /* Map the migrated pages into the device's page tables. 
*/ mutex_lock(&dmirror->mutex); - for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, - src++, dst++) { + for (pfn = start_pfn; pfn < end_pfn; pfn++, src++, dst++) { struct page *dpage; void *entry; + int nr, i; + struct page *rpage; if (!(*src & MIGRATE_PFN_MIGRATE)) continue; @@ -750,13 +848,25 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, if (!dpage) continue; - entry = BACKING_PAGE(dpage); - if (*dst & MIGRATE_PFN_WRITE) - entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); - entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); - if (xa_is_err(entry)) { - mutex_unlock(&dmirror->mutex); - return xa_err(entry); + if (*dst & MIGRATE_PFN_COMPOUND) + nr = folio_nr_pages(page_folio(dpage)); + else + nr = 1; + + WARN_ON_ONCE(end_pfn < start_pfn + nr); + + rpage = BACKING_PAGE(dpage); + VM_WARN_ON(folio_nr_pages(page_folio(rpage)) != nr); + + for (i = 0; i < nr; i++) { + entry = folio_page(page_folio(rpage), i); + if (*dst & MIGRATE_PFN_WRITE) + entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); + entry = xa_store(&dmirror->pt, pfn + i, entry, GFP_ATOMIC); + if (xa_is_err(entry)) { + mutex_unlock(&dmirror->mutex); + return xa_err(entry); + } } } @@ -829,31 +939,66 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, unsigned long start = args->start; unsigned long end = args->end; unsigned long addr; + unsigned int order = 0; + int i; - for (addr = start; addr < end; addr += PAGE_SIZE, - src++, dst++) { + for (addr = start; addr < end; ) { struct page *dpage, *spage; spage = migrate_pfn_to_page(*src); - if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) - continue; + if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) { + addr += PAGE_SIZE; + goto next; + } if (WARN_ON(!is_device_private_page(spage) && - !is_device_coherent_page(spage))) - continue; - spage = BACKING_PAGE(spage); - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); - if (!dpage) - continue; - pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n", - page_to_pfn(spage), page_to_pfn(dpage)); + !is_device_coherent_page(spage))) { + addr += PAGE_SIZE; + goto next; + } + spage = BACKING_PAGE(spage); + order = folio_order(page_folio(spage)); + + if (order) + dpage = folio_page(vma_alloc_folio(GFP_HIGHUSER_MOVABLE, + order, args->vma, addr), 0); + else + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); + + /* Try with smaller pages if large allocation fails */ + if (!dpage && order) { + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); + if (!dpage) + return VM_FAULT_OOM; + order = 0; + } + + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); lock_page(dpage); xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); copy_highpage(dpage, spage); *dst = migrate_pfn(page_to_pfn(dpage)); if (*src & MIGRATE_PFN_WRITE) *dst |= MIGRATE_PFN_WRITE; + if (order) + *dst |= MIGRATE_PFN_COMPOUND; + + for (i = 0; i < (1 << order); i++) { + struct page *src_page; + struct page *dst_page; + + src_page = pfn_to_page(page_to_pfn(spage) + i); + dst_page = pfn_to_page(page_to_pfn(dpage) + i); + + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); + copy_highpage(dst_page, src_page); + } +next: + addr += PAGE_SIZE << order; + src += 1 << order; + dst += 1 << order; } return 0; } @@ -879,11 +1024,14 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, unsigned long size = cmd->npages << PAGE_SHIFT; struct mm_struct *mm = dmirror->notifier.mm; struct vm_area_struct *vma; - unsigned long 
src_pfns[32] = { 0 }; - unsigned long dst_pfns[32] = { 0 }; struct migrate_vma args = { 0 }; unsigned long next; int ret; + unsigned long *src_pfns; + unsigned long *dst_pfns; + + src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); + dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); start = cmd->addr; end = start + size; @@ -902,7 +1050,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, ret = -EINVAL; goto out; } - next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); + next = min(end, addr + (PTRS_PER_PTE << PAGE_SHIFT)); if (next > vma->vm_end) next = vma->vm_end; @@ -912,7 +1060,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, args.start = addr; args.end = next; args.pgmap_owner = dmirror->mdevice; - args.flags = dmirror_select_device(dmirror); + args.flags = dmirror_select_device(dmirror) | MIGRATE_VMA_SELECT_COMPOUND; ret = migrate_vma_setup(&args); if (ret) @@ -928,6 +1076,8 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, out: mmap_read_unlock(mm); mmput(mm); + kvfree(src_pfns); + kvfree(dst_pfns); return ret; } @@ -939,12 +1089,12 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, unsigned long size = cmd->npages << PAGE_SHIFT; struct mm_struct *mm = dmirror->notifier.mm; struct vm_area_struct *vma; - unsigned long src_pfns[32] = { 0 }; - unsigned long dst_pfns[32] = { 0 }; struct dmirror_bounce bounce; struct migrate_vma args = { 0 }; unsigned long next; int ret; + unsigned long *src_pfns = NULL; + unsigned long *dst_pfns = NULL; start = cmd->addr; end = start + size; @@ -955,6 +1105,18 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, if (!mmget_not_zero(mm)) return -EINVAL; + ret = -ENOMEM; + src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns), + GFP_KERNEL | __GFP_NOFAIL); + if (!src_pfns) + goto free_mem; + + dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns), + GFP_KERNEL | __GFP_NOFAIL); + if (!dst_pfns) + goto free_mem; + + ret = 0; mmap_read_lock(mm); for (addr = start; addr < end; addr = next) { vma = vma_lookup(mm, addr); @@ -962,7 +1124,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, ret = -EINVAL; goto out; } - next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); + next = min(end, addr + (PTRS_PER_PTE << PAGE_SHIFT)); if (next > vma->vm_end) next = vma->vm_end; @@ -972,7 +1134,8 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, args.start = addr; args.end = next; args.pgmap_owner = dmirror->mdevice; - args.flags = MIGRATE_VMA_SELECT_SYSTEM; + args.flags = MIGRATE_VMA_SELECT_SYSTEM | + MIGRATE_VMA_SELECT_COMPOUND; ret = migrate_vma_setup(&args); if (ret) goto out; @@ -992,7 +1155,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, */ ret = dmirror_bounce_init(&bounce, start, size); if (ret) - return ret; + goto free_mem; mutex_lock(&dmirror->mutex); ret = dmirror_do_read(dmirror, start, end, &bounce); mutex_unlock(&dmirror->mutex); @@ -1003,11 +1166,14 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, } cmd->cpages = bounce.cpages; dmirror_bounce_fini(&bounce); - return ret; + goto free_mem; out: mmap_read_unlock(mm); mmput(mm); +free_mem: + kfree(src_pfns); + kfree(dst_pfns); return ret; } @@ -1200,6 +1366,7 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk) unsigned long i; unsigned long *src_pfns; unsigned long *dst_pfns; + unsigned int order = 0; src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); dst_pfns 
= kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); @@ -1215,13 +1382,25 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk) if (WARN_ON(!is_device_private_page(spage) && !is_device_coherent_page(spage))) continue; + + order = folio_order(page_folio(spage)); spage = BACKING_PAGE(spage); - dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL); + if (src_pfns[i] & MIGRATE_PFN_COMPOUND) { + dpage = folio_page(folio_alloc(GFP_HIGHUSER_MOVABLE, + order), 0); + } else { + dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL); + order = 0; + } + + /* TODO Support splitting here */ lock_page(dpage); - copy_highpage(dpage, spage); dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)); if (src_pfns[i] & MIGRATE_PFN_WRITE) dst_pfns[i] |= MIGRATE_PFN_WRITE; + if (order) + dst_pfns[i] |= MIGRATE_PFN_COMPOUND; + folio_copy(page_folio(dpage), page_folio(spage)); } migrate_device_pages(src_pfns, dst_pfns, npages); migrate_device_finalize(src_pfns, dst_pfns, npages); @@ -1234,7 +1413,12 @@ static void dmirror_remove_free_pages(struct dmirror_chunk *devmem) { struct dmirror_device *mdevice = devmem->mdevice; struct page *page; + struct folio *folio; + + for (folio = mdevice->free_folios; folio; folio = folio_zone_device_data(folio)) + if (dmirror_page_to_chunk(folio_page(folio, 0)) == devmem) + mdevice->free_folios = folio_zone_device_data(folio); for (page = mdevice->free_pages; page; page = page->zone_device_data) if (dmirror_page_to_chunk(page) == devmem) mdevice->free_pages = page->zone_device_data; @@ -1265,6 +1449,7 @@ static void dmirror_device_remove_chunks(struct dmirror_device *mdevice) mdevice->devmem_count = 0; mdevice->devmem_capacity = 0; mdevice->free_pages = NULL; + mdevice->free_folios = NULL; kfree(mdevice->devmem_chunks); mdevice->devmem_chunks = NULL; } @@ -1379,18 +1564,30 @@ static void dmirror_devmem_free(struct folio *folio) struct page *page = &folio->page; struct page *rpage = BACKING_PAGE(page); struct dmirror_device *mdevice; + struct folio *rfolio = page_folio(rpage); + unsigned int order = folio_order(rfolio); - if (rpage != page) - __free_page(rpage); + if (rpage != page) { + if (order) + __free_pages(rpage, order); + else + __free_page(rpage); + rpage = NULL; + } mdevice = dmirror_page_to_device(page); spin_lock(&mdevice->lock); /* Return page to our allocator if not freeing the chunk */ if (!dmirror_page_to_chunk(page)->remove) { - mdevice->cfree++; - page->zone_device_data = mdevice->free_pages; - mdevice->free_pages = page; + mdevice->cfree += 1 << order; + if (order) { + page->zone_device_data = mdevice->free_folios; + mdevice->free_folios = page_folio(page); + } else { + page->zone_device_data = mdevice->free_pages; + mdevice->free_pages = page; + } } spin_unlock(&mdevice->lock); } @@ -1398,36 +1595,52 @@ static void dmirror_devmem_free(struct folio *folio) static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { struct migrate_vma args = { 0 }; - unsigned long src_pfns = 0; - unsigned long dst_pfns = 0; struct page *rpage; struct dmirror *dmirror; - vm_fault_t ret; + vm_fault_t ret = 0; + unsigned int order, nr; /* * Normally, a device would use the page->zone_device_data to point to * the mirror but here we use it to hold the page for the simulated * device memory and that page holds the pointer to the mirror. 
*/ - rpage = vmf->page->zone_device_data; + rpage = folio_zone_device_data(page_folio(vmf->page)); dmirror = rpage->zone_device_data; /* FIXME demonstrate how we can adjust migrate range */ + order = folio_order(page_folio(vmf->page)); + nr = 1 << order; + + /* + * Consider a per-cpu cache of src and dst pfns, but with + * large number of cpus that might not scale well. + */ + args.start = ALIGN_DOWN(vmf->address, (PAGE_SIZE << order)); args.vma = vmf->vma; - args.start = vmf->address; - args.end = args.start + PAGE_SIZE; - args.src = &src_pfns; - args.dst = &dst_pfns; + args.end = args.start + (PAGE_SIZE << order); + + nr = (args.end - args.start) >> PAGE_SHIFT; + args.src = kcalloc(nr, sizeof(unsigned long), GFP_KERNEL); + args.dst = kcalloc(nr, sizeof(unsigned long), GFP_KERNEL); args.pgmap_owner = dmirror->mdevice; args.flags = dmirror_select_device(dmirror); args.fault_page = vmf->page; + if (!args.src || !args.dst) { + ret = VM_FAULT_OOM; + goto err; + } + + if (order) + args.flags |= MIGRATE_VMA_SELECT_COMPOUND; + if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror); if (ret) - return ret; + goto err; migrate_vma_pages(&args); /* * No device finalize step is needed since @@ -1435,7 +1648,10 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) * invalidated the device page table. */ migrate_vma_finalize(&args); - return 0; +err: + kfree(args.src); + kfree(args.dst); + return ret; } static const struct dev_pagemap_ops dmirror_devmem_ops = { @@ -1466,7 +1682,7 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) return ret; /* Build a list of free ZONE_DEVICE struct pages */ - return dmirror_allocate_chunk(mdevice, NULL); + return dmirror_allocate_chunk(mdevice, NULL, false); } static void dmirror_device_remove(struct dmirror_device *mdevice) From 56ef398996435a0021569b86293d376649f12540 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:01 +1000 Subject: [PATCH 231/321] mm/memremap: add driver callback support for folio splitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a zone device page is split (via huge pmd folio split). The driver callback for folio_split is invoked to let the device driver know that the folio size has been split into a smaller order. Provide a default implementation for drivers that do not provide this callback that copies the pgmap and mapping fields for the split folios. Update the HMM test driver to handle the split. Link: https://lkml.kernel.org/r/20251001065707.920170-11-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. 
Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/memremap.h | 29 +++++++++++++++++++++++++++++ lib/test_hmm.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 7df4dd037b69..aca2b16d6889 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -100,6 +100,13 @@ struct dev_pagemap_ops { */ int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, unsigned long nr_pages, int mf_flags); + + /* + * Used for private (un-addressable) device memory only. + * This callback is used when a folio is split into + * a smaller folio + */ + void (*folio_split)(struct folio *head, struct folio *tail); }; #define PGMAP_ALTMAP_VALID (1 << 0) @@ -235,6 +242,23 @@ static inline void zone_device_folio_init(struct folio *folio, unsigned int orde folio_set_large_rmappable(folio); } +static inline void zone_device_private_split_cb(struct folio *original_folio, + struct folio *new_folio) +{ + if (folio_is_device_private(original_folio)) { + if (!original_folio->pgmap->ops->folio_split) { + if (new_folio) { + new_folio->pgmap = original_folio->pgmap; + new_folio->page.mapping = + original_folio->page.mapping; + } + } else { + original_folio->pgmap->ops->folio_split(original_folio, + new_folio); + } + } +} + #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) @@ -268,6 +292,11 @@ static inline unsigned long memremap_compat_align(void) { return PAGE_SIZE; } + +static inline void zone_device_private_split_cb(struct folio *original_folio, + struct folio *new_folio) +{ +} #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 32d402e80bcc..46fa9e200db8 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1654,9 +1654,44 @@ err: return ret; } +static void dmirror_devmem_folio_split(struct folio *head, struct folio *tail) +{ + struct page *rpage = BACKING_PAGE(folio_page(head, 0)); + struct page *rpage_tail; + struct folio *rfolio; + unsigned long offset = 0; + + if (!rpage) { + tail->page.zone_device_data = NULL; + return; + } + + rfolio = page_folio(rpage); + + if (tail == NULL) { + folio_reset_order(rfolio); + rfolio->mapping = NULL; + folio_set_count(rfolio, 1); + return; + } + + offset = folio_pfn(tail) - folio_pfn(head); + + rpage_tail = folio_page(rfolio, offset); + tail->page.zone_device_data = rpage_tail; + rpage_tail->zone_device_data = rpage->zone_device_data; + clear_compound_head(rpage_tail); + rpage_tail->mapping = NULL; + + folio_page(tail, 0)->mapping = folio_page(head, 0)->mapping; + tail->pgmap = head->pgmap; + folio_set_count(page_folio(rpage_tail), 1); +} + static const struct dev_pagemap_ops dmirror_devmem_ops = { .folio_free = dmirror_devmem_free, .migrate_to_ram = dmirror_devmem_fault, + .folio_split = dmirror_devmem_folio_split, }; static int dmirror_device_init(struct dmirror_device *mdevice, int id) From 4265d67e405a41562634279ca1ededf79fdadcd7 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:02 +1000 Subject: [PATCH 232/321] mm/migrate_device: add THP splitting during migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement migrate_vma_split_pages() to handle THP 
splitting during the migration process when destination cannot allocate compound pages. This addresses the common scenario where migrate_vma_setup() succeeds with MIGRATE_PFN_COMPOUND pages, but the destination device cannot allocate large pages during the migration phase. Key changes: - migrate_vma_split_pages(): Split already-isolated pages during migration - Enhanced folio_split() and __split_unmapped_folio() with isolated parameter to avoid redundant unmap/remap operations This provides a fallback mechansim to ensure migration succeeds even when large page allocation fails at the destination. [matthew.brost@intel.com: add THP splitting during migration] Link: https://lkml.kernel.org/r/20251120230825.181072-2-matthew.brost@intel.com Link: https://lkml.kernel.org/r/20251001065707.920170-12-balbirs@nvidia.com Signed-off-by: Balbir Singh Signed-off-by: Matthew Brost Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 11 +++++- lib/test_hmm.c | 9 +++++ mm/huge_memory.c | 46 ++++++++++++---------- mm/migrate_device.c | 87 +++++++++++++++++++++++++++++++++++------ 4 files changed, 119 insertions(+), 34 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 82408c90b396..ed99e6bd31ac 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -365,8 +365,8 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add vm_flags_t vm_flags); bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); -int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - unsigned int new_order); +int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order, bool unmapped); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); bool uniform_split_supported(struct folio *folio, unsigned int new_order, @@ -375,6 +375,13 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, bool warns); int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); + +static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) +{ + return __split_huge_page_to_list_to_order(page, list, new_order, false); +} + /* * try_folio_split_to_order - try to split a @folio at @page to @new_order using * non uniform split. diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 46fa9e200db8..df429670633e 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1612,6 +1612,15 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) order = folio_order(page_folio(vmf->page)); nr = 1 << order; + /* + * When folios are partially mapped, we can't rely on the folio + * order of vmf->page as the folio might not be fully split yet + */ + if (vmf->pte) { + order = 0; + nr = 1; + } + /* * Consider a per-cpu cache of src and dst pfns, but with * large number of cpus that might not scale well. 
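A driver-side view of the fallback described above, as a minimal illustrative sketch (not part of this patch): when a compound destination folio cannot be allocated, the driver fills base-page destination entries and leaves MIGRATE_PFN_COMPOUND clear, so that the core splits the already-unmapped source THP (via migrate_vma_split_unmapped_folio()) during migrate_vma_pages() and migrates it page by page. my_dev_alloc_folio() below is a stand-in for whatever allocator a real driver uses.

#include <linux/mm.h>
#include <linux/migrate.h>
#include <linux/huge_mm.h>

/* Hypothetical device allocator; returns a folio of @order or NULL. */
static struct folio *my_dev_alloc_folio(unsigned int order);

/* Fill dst[] for one src entry prepared by migrate_vma_setup(). */
static void my_dev_fill_dst(struct migrate_vma *args, unsigned long i)
{
	unsigned long write = args->src[i] & MIGRATE_PFN_WRITE;
	unsigned long nr = 1, j;

	if (args->src[i] & MIGRATE_PFN_COMPOUND) {
		struct folio *folio = my_dev_alloc_folio(HPAGE_PMD_ORDER);

		if (folio) {
			args->dst[i] = migrate_pfn(folio_pfn(folio)) |
				       MIGRATE_PFN_COMPOUND | write;
			return;
		}
		/*
		 * No large folio available: fall back to base pages and
		 * leave MIGRATE_PFN_COMPOUND clear so the core splits the
		 * source THP during migrate_vma_pages().
		 */
		nr = HPAGE_PMD_NR;
	}

	for (j = 0; j < nr; j++) {
		struct folio *folio = my_dev_alloc_folio(0);

		/* A zero entry simply leaves that page unmigrated. */
		args->dst[i + j] = folio ?
			migrate_pfn(folio_pfn(folio)) | write : 0;
	}
}

In this series the test driver (lib/test_hmm) exercises exactly this path via the HMM_DMIRROR_FLAG_FAIL_ALLOC flag added in the next patch.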
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ded707a50af8..81e511f1ed26 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3452,15 +3452,6 @@ static void __split_folio_to_order(struct folio *folio, int old_order, new_folio->mapping = folio->mapping; new_folio->index = folio->index + i; - /* - * page->private should not be set in tail pages. Fix up and warn once - * if private is unexpectedly set. - */ - if (unlikely(new_folio->private)) { - VM_WARN_ON_ONCE_PAGE(true, new_head); - new_folio->private = NULL; - } - if (folio_test_swapcache(folio)) new_folio->swap.val = folio->swap.val + i; @@ -3661,6 +3652,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, * @lock_at: a page within @folio to be left locked to caller * @list: after-split folios will be put on it if non NULL * @uniform_split: perform uniform split or not (non-uniform split) + * @unmapped: The pages are already unmapped, they are migration entries. * * It calls __split_unmapped_folio() to perform uniform and non-uniform split. * It is in charge of checking whether the split is supported or not and @@ -3676,7 +3668,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, */ static int __folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct page *lock_at, - struct list_head *list, bool uniform_split) + struct list_head *list, bool uniform_split, bool unmapped) { struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); @@ -3736,13 +3728,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order, * is taken to serialise against parallel split or collapse * operations. */ - anon_vma = folio_get_anon_vma(folio); - if (!anon_vma) { - ret = -EBUSY; - goto out; + if (!unmapped) { + anon_vma = folio_get_anon_vma(folio); + if (!anon_vma) { + ret = -EBUSY; + goto out; + } + anon_vma_lock_write(anon_vma); } mapping = NULL; - anon_vma_lock_write(anon_vma); } else { unsigned int min_order; gfp_t gfp; @@ -3795,7 +3789,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto out_unlock; } - unmap_folio(folio); + if (!unmapped) + unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); @@ -3882,10 +3877,13 @@ static int __folio_split(struct folio *folio, unsigned int new_order, next = folio_next(new_folio); + zone_device_private_split_cb(folio, new_folio); + expected_refs = folio_expected_ref_count(new_folio) + 1; folio_ref_unfreeze(new_folio, expected_refs); - lru_add_split_folio(folio, new_folio, lruvec, list); + if (!unmapped) + lru_add_split_folio(folio, new_folio, lruvec, list); /* * Anonymous folio with swap cache. @@ -3916,6 +3914,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, __filemap_remove_folio(new_folio, NULL); folio_put_refs(new_folio, nr_pages); } + + zone_device_private_split_cb(folio, NULL); /* * Unfreeze @folio only after all page cache entries, which * used to point to it, have been updated with new folios. @@ -3939,6 +3939,9 @@ fail: local_irq_enable(); + if (unmapped) + return ret; + if (nr_shmem_dropped) shmem_uncharge(mapping->host, nr_shmem_dropped); @@ -4029,12 +4032,13 @@ out: * Returns -EINVAL when trying to split to an order that is incompatible * with the folio. Splitting to order 0 is compatible with all folios. 
*/ -int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - unsigned int new_order) +int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order, bool unmapped) { struct folio *folio = page_folio(page); - return __folio_split(folio, new_order, &folio->page, page, list, true); + return __folio_split(folio, new_order, &folio->page, page, list, true, + unmapped); } /* @@ -4063,7 +4067,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct list_head *list) { return __folio_split(folio, new_order, split_at, &folio->page, list, - false); + false, false); } int min_order_for_split(struct folio *folio) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index a0a315f3572a..ab373fd38961 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -309,6 +309,25 @@ again: pgmap->owner != migrate->pgmap_owner) goto next; + folio = page_folio(page); + if (folio_test_large(folio)) { + int ret; + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep, ptl); + ret = migrate_vma_split_folio(folio, + migrate->fault_page); + + if (ret) { + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return migrate_vma_collect_skip(addr, end, walk); + } + + goto again; + } + mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; if (is_writable_device_private_entry(entry)) @@ -885,6 +904,29 @@ abort: src[i] &= ~MIGRATE_PFN_MIGRATE; return 0; } + +static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, + unsigned long idx, unsigned long addr, + struct folio *folio) +{ + unsigned long i; + unsigned long pfn; + unsigned long flags; + int ret = 0; + + folio_get(folio); + split_huge_pmd_address(migrate->vma, addr, true); + ret = __split_huge_page_to_list_to_order(folio_page(folio, 0), NULL, + 0, true); + if (ret) + return ret; + migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND; + flags = migrate->src[idx] & ((1UL << MIGRATE_PFN_SHIFT) - 1); + pfn = migrate->src[idx] >> MIGRATE_PFN_SHIFT; + for (i = 1; i < HPAGE_PMD_NR; i++) + migrate->src[i+idx] = migrate_pfn(pfn + i) | flags; + return ret; +} #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, unsigned long addr, @@ -894,6 +936,13 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, { return 0; } + +static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, + unsigned long idx, unsigned long addr, + struct folio *folio) +{ + return 0; +} #endif static unsigned long migrate_vma_nr_pages(unsigned long *src) @@ -1055,8 +1104,9 @@ static void __migrate_device_pages(unsigned long *src_pfns, struct migrate_vma *migrate) { struct mmu_notifier_range range; - unsigned long i; + unsigned long i, j; bool notified = false; + unsigned long addr; for (i = 0; i < npages; ) { struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); @@ -1098,12 +1148,16 @@ static void __migrate_device_pages(unsigned long *src_pfns, (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) { nr = migrate_vma_nr_pages(&src_pfns[i]); src_pfns[i] &= ~MIGRATE_PFN_COMPOUND; - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - goto next; + } else { + nr = 1; } - migrate_vma_insert_page(migrate, addr, &dst_pfns[i], - &src_pfns[i]); + for (j = 0; j < nr && i + j < npages; j++) { + src_pfns[i+j] |= MIGRATE_PFN_MIGRATE; + migrate_vma_insert_page(migrate, + addr + j * PAGE_SIZE, + &dst_pfns[i+j], &src_pfns[i+j]); + } goto next; } @@ -1125,7 +1179,13 @@ static void __migrate_device_pages(unsigned long 
*src_pfns, MIGRATE_PFN_COMPOUND); goto next; } - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + nr = 1 << folio_order(folio); + addr = migrate->start + i * PAGE_SIZE; + if (migrate_vma_split_unmapped_folio(migrate, i, addr, folio)) { + src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE | + MIGRATE_PFN_COMPOUND); + goto next; + } } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && (dst_pfns[i] & MIGRATE_PFN_COMPOUND) && !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) { @@ -1161,11 +1221,16 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (migrate && migrate->fault_page == page) extra_cnt = 1; - r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); - if (r) - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - else - folio_migrate_flags(newfolio, folio); + for (j = 0; j < nr && i + j < npages; j++) { + folio = page_folio(migrate_pfn_to_page(src_pfns[i+j])); + newfolio = page_folio(migrate_pfn_to_page(dst_pfns[i+j])); + + r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); + if (r) + src_pfns[i+j] &= ~MIGRATE_PFN_MIGRATE; + else + folio_migrate_flags(newfolio, folio); + } next: i += nr; } From aa3ade429543a01eb04d27e1fb877b7ac2f8add7 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:03 +1000 Subject: [PATCH 233/321] lib/test_hmm: add large page allocation failure testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add HMM_DMIRROR_FLAG_FAIL_ALLOC flag to simulate large page allocation failures, enabling testing of split migration code paths. This test flag allows validation of the fallback behavior when destination device cannot allocate compound pages. This is useful for testing the split migration functionality. Link: https://lkml.kernel.org/r/20251001065707.920170-13-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. 
Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- lib/test_hmm.c | 61 ++++++++++++++++++++++++++++++--------------- lib/test_hmm_uapi.h | 3 +++ 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index df429670633e..72a8b2f38d8a 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -92,6 +92,7 @@ struct dmirror { struct xarray pt; struct mmu_interval_notifier notifier; struct mutex mutex; + __u64 flags; }; /* @@ -699,7 +700,12 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, page_to_pfn(spage))) goto next; - dpage = dmirror_devmem_alloc_page(dmirror, is_large); + if (dmirror->flags & HMM_DMIRROR_FLAG_FAIL_ALLOC) { + dmirror->flags &= ~HMM_DMIRROR_FLAG_FAIL_ALLOC; + dpage = NULL; + } else + dpage = dmirror_devmem_alloc_page(dmirror, is_large); + if (!dpage) { struct folio *folio; unsigned long i; @@ -959,44 +965,55 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, spage = BACKING_PAGE(spage); order = folio_order(page_folio(spage)); - if (order) + *dst = MIGRATE_PFN_COMPOUND; + if (*src & MIGRATE_PFN_WRITE) + *dst |= MIGRATE_PFN_WRITE; + + if (dmirror->flags & HMM_DMIRROR_FLAG_FAIL_ALLOC) { + dmirror->flags &= ~HMM_DMIRROR_FLAG_FAIL_ALLOC; + *dst &= ~MIGRATE_PFN_COMPOUND; + dpage = NULL; + } else if (order) { dpage = folio_page(vma_alloc_folio(GFP_HIGHUSER_MOVABLE, order, args->vma, addr), 0); - else + } else { dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); - - /* Try with smaller pages if large allocation fails */ - if (!dpage && order) { - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); - if (!dpage) - return VM_FAULT_OOM; - order = 0; } + if (!dpage && !order) + return VM_FAULT_OOM; + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", page_to_pfn(spage), page_to_pfn(dpage)); - lock_page(dpage); - xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); - copy_highpage(dpage, spage); - *dst = migrate_pfn(page_to_pfn(dpage)); - if (*src & MIGRATE_PFN_WRITE) - *dst |= MIGRATE_PFN_WRITE; - if (order) - *dst |= MIGRATE_PFN_COMPOUND; + + if (dpage) { + lock_page(dpage); + *dst |= migrate_pfn(page_to_pfn(dpage)); + } for (i = 0; i < (1 << order); i++) { struct page *src_page; struct page *dst_page; + /* Try with smaller pages if large allocation fails */ + if (!dpage && order) { + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); + lock_page(dpage); + dst[i] = migrate_pfn(page_to_pfn(dpage)); + dst_page = pfn_to_page(page_to_pfn(dpage)); + dpage = NULL; /* For the next iteration */ + } else { + dst_page = pfn_to_page(page_to_pfn(dpage) + i); + } + src_page = pfn_to_page(page_to_pfn(spage) + i); - dst_page = pfn_to_page(page_to_pfn(dpage) + i); xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); + addr += PAGE_SIZE; copy_highpage(dst_page, src_page); } next: - addr += PAGE_SIZE << order; src += 1 << order; dst += 1 << order; } @@ -1514,6 +1531,10 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, dmirror_device_remove_chunks(dmirror->mdevice); ret = 0; break; + case HMM_DMIRROR_FLAGS: + dmirror->flags = cmd.npages; + ret = 0; + break; default: return -EINVAL; diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index 8c818a2cf4f6..f94c6d457338 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h @@ -37,6 +37,9 @@ struct hmm_dmirror_cmd { #define 
HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) #define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd) #define HMM_DMIRROR_RELEASE _IOWR('H', 0x07, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_FLAGS _IOWR('H', 0x08, struct hmm_dmirror_cmd) + +#define HMM_DMIRROR_FLAG_FAIL_ALLOC (1ULL << 0) /* * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT. From 519071529d2ad8a041e2e9d75ead5f9e6fe60026 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:04 +1000 Subject: [PATCH 234/321] selftests/mm/hmm-tests: new tests for zone device THP migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new tests for migrating anon THP pages, including anon_huge, anon_huge_zero and error cases involving forced splitting of pages during migration. Link: https://lkml.kernel.org/r/20251001065707.920170-14-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hmm-tests.c | 410 +++++++++++++++++++++++++ 1 file changed, 410 insertions(+) diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 15aadaf24a66..339a90183930 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -2055,4 +2055,414 @@ TEST_F(hmm, hmm_cow_in_device) hmm_buffer_free(buffer); } + +/* + * Migrate private anonymous huge empty page. + */ +TEST_F(hmm, migrate_anon_huge_empty) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge zero page. 
+ */ +TEST_F(hmm, migrate_anon_huge_zero) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + int val; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize a read-only zero huge page. */ + val = *(int *)buffer->ptr; + ASSERT_EQ(val, 0); + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) { + ASSERT_EQ(ptr[i], 0); + /* If it asserts once, it probably will 500,000 times */ + if (ptr[i] != 0) + break; + } + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge page and free. + */ +TEST_F(hmm, migrate_anon_huge_free) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Try freeing it. */ + ret = madvise(map, size, MADV_FREE); + ASSERT_EQ(ret, 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge page and fault back to sysmem. 
+ */ +TEST_F(hmm, migrate_anon_huge_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge page with allocation errors. + */ +TEST_F(hmm, migrate_anon_huge_err) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(2 * size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, 2 * size); + + old_ptr = mmap(NULL, 2 * size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device but force a THP allocation error. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) { + ASSERT_EQ(ptr[i], i); + if (ptr[i] != i) + break; + } + + /* Try faulting back a single (PAGE_SIZE) page. */ + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 2048); + + /* unmap and remap the region to reset things. */ + ret = munmap(old_ptr, 2 * size); + ASSERT_EQ(ret, 0); + old_ptr = mmap(NULL, 2 * size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate THP to device. 
*/ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* + * Force an allocation error when faulting back a THP resident in the + * device. + */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 2048); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge zero page with allocation errors. + */ +TEST_F(hmm, migrate_anon_huge_zero_err) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(2 * size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, 2 * size); + + old_ptr = mmap(NULL, 2 * size, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Migrate memory to device but force a THP allocation error. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Try faulting back a single (PAGE_SIZE) page. */ + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 0); + + /* unmap and remap the region to reset things. */ + ret = munmap(old_ptr, 2 * size); + ASSERT_EQ(ret, 0); + old_ptr = mmap(NULL, 2 * size, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory (zero THP page). */ + ret = ptr[0]; + ASSERT_EQ(ret, 0); + + /* Migrate memory to device but force a THP allocation error. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Fault the device memory back and check it. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} TEST_HARNESS_MAIN From 24c2c5b8ffbd50d5dcd340c553c717948ca99aac Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Wed, 1 Oct 2025 16:57:05 +1000 Subject: [PATCH 235/321] selftests/mm/hmm-tests: partial unmap, mremap and anon_write tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add partial unmap test case which munmaps memory while in the device. Add tests exercising mremap on faulted-in memory (CPU and GPU) at various offsets and verify correctness. Update anon_write_child to read device memory after fork verifying this flow works in the kernel. Both THP and non-THP cases are updated. 
Link: https://lkml.kernel.org/r/20251001065707.920170-15-balbirs@nvidia.com Signed-off-by: Balbir Singh Signed-off-by: Matthew Brost Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hmm-tests.c | 300 ++++++++++++++++++++----- 1 file changed, 246 insertions(+), 54 deletions(-) diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 339a90183930..dedc1049bd4d 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -50,6 +50,8 @@ enum { HMM_COHERENCE_DEVICE_TWO, }; +#define ONEKB (1 << 10) +#define ONEMEG (1 << 20) #define TWOMEG (1 << 21) #define HMM_BUFFER_SIZE (1024 << 12) #define HMM_PATH_MAX 64 @@ -525,6 +527,8 @@ TEST_F(hmm, anon_write_prot) /* * Check that a device writing an anonymous private mapping * will copy-on-write if a child process inherits the mapping. + * + * Also verifies after fork() memory the device can be read by child. */ TEST_F(hmm, anon_write_child) { @@ -532,72 +536,101 @@ TEST_F(hmm, anon_write_child) unsigned long npages; unsigned long size; unsigned long i; + void *old_ptr; + void *map; int *ptr; pid_t pid; int child_fd; - int ret; + int ret, use_thp, migrate; - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; + for (migrate = 0; migrate < 2; ++migrate) { + for (use_thp = 0; use_thp < 2; ++use_thp) { + npages = ALIGN(use_thp ? TWOMEG : HMM_BUFFER_SIZE, + self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); + buffer->fd = -1; + buffer->size = size * 2; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); + buffer->ptr = mmap(NULL, size * 2, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); - /* Initialize buffer->ptr so we can tell if it is written. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; + old_ptr = buffer->ptr; + if (use_thp) { + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + } - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (pid != 0) { - waitpid(pid, &ret, 0); - ASSERT_EQ(WIFEXITED(ret), 1); + /* Initialize data that the device will write to buffer->ptr. 
*/ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; - /* Check that the parent's buffer did not change. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - return; + if (migrate) { + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + } + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did not change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + continue; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + if (!migrate) { + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + } + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + if (!migrate) { + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + } + + close(child_fd); + exit(0); + } } - - /* Check that we see the parent's values. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - /* The child process needs its own mirror to its own mm. */ - child_fd = hmm_open(0); - ASSERT_GE(child_fd, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - close(child_fd); - exit(0); } /* @@ -2289,6 +2322,165 @@ TEST_F(hmm, migrate_anon_huge_fault) hmm_buffer_free(buffer); } +/* + * Migrate memory and fault back to sysmem after partially unmapping. + */ +TEST_F(hmm, migrate_partial_unmap_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size = TWOMEG; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret, j, use_thp; + int offsets[] = { 0, 512 * ONEKB, ONEMEG }; + + for (use_thp = 0; use_thp < 2; ++use_thp) { + for (j = 0; j < ARRAY_SIZE(offsets); ++j) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + if (use_thp) + ret = madvise(map, size, MADV_HUGEPAGE); + else + ret = madvise(map, size, MADV_NOHUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. 
*/ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(buffer->ptr + offsets[j], ONEMEG); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + if (i * sizeof(int) < offsets[j] || + i * sizeof(int) >= offsets[j] + ONEMEG) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + } + } +} + +TEST_F(hmm, migrate_remap_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size = TWOMEG; + unsigned long i; + void *old_ptr, *new_ptr = NULL; + void *map; + int *ptr; + int ret, j, use_thp, dont_unmap, before; + int offsets[] = { 0, 512 * ONEKB, ONEMEG }; + + for (before = 0; before < 2; ++before) { + for (dont_unmap = 0; dont_unmap < 2; ++dont_unmap) { + for (use_thp = 0; use_thp < 2; ++use_thp) { + for (j = 0; j < ARRAY_SIZE(offsets); ++j) { + int flags = MREMAP_MAYMOVE | MREMAP_FIXED; + + if (dont_unmap) + flags |= MREMAP_DONTUNMAP; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 8 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, buffer->size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + if (use_thp) + ret = madvise(map, size, MADV_HUGEPAGE); + else + ret = madvise(map, size, MADV_NOHUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + munmap(map + size, size * 2); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; + i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + if (before) { + new_ptr = mremap((void *)map, size, size, flags, + map + size + offsets[j]); + ASSERT_NE(new_ptr, MAP_FAILED); + buffer->ptr = new_ptr; + } + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; + i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + if (!before) { + new_ptr = mremap((void *)map, size, size, flags, + map + size + offsets[j]); + ASSERT_NE(new_ptr, MAP_FAILED); + buffer->ptr = new_ptr; + } + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; + i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(new_ptr, size); + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + } + } + } + } +} + /* * Migrate private anonymous huge page with allocation errors. */ From 271a7b2e3c1370d36fde867bfee201bd74b53704 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:06 +1000 Subject: [PATCH 236/321] selftests/mm/hmm-tests: new throughput tests including THP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new benchmark style support to test transfer bandwidth for zone device memory operations. 
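For reference (not part of the patch), the sketch below reproduces the benchmark's get_time_ms() and GB/s throughput arithmetic against a plain memcpy, so the calculation can be sanity-checked without loading the test_hmm module. The buffer size and iteration count are illustrative.

/* Illustrative only: same timing/throughput arithmetic as the benchmark. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

static double time_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}

int main(void)
{
	size_t size = 256UL << 20;	/* 256MB, the largest size the sweep uses */
	int iterations = 5;
	char *src = malloc(size);
	char *dst = malloc(size);
	double total = 0;
	int i;

	if (!src || !dst)
		return 1;
	memset(src, 0xAB, size);	/* fault pages in before timing */
	memset(dst, 0, size);

	for (i = 0; i < iterations; i++) {
		double start = time_ms();

		memcpy(dst, src, size);
		total += time_ms() - start;
	}

	printf("avg %.3f ms, %.2f GB/s\n", total / iterations,
	       (size / (1024.0 * 1024.0 * 1024.0)) /
	       ((total / iterations) / 1000.0));
	free(src);
	free(dst);
	return 0;
}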
Link: https://lkml.kernel.org/r/20251001065707.920170-16-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hmm-tests.c | 197 ++++++++++++++++++++++++- 1 file changed, 196 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index dedc1049bd4d..5a1525f72daa 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -25,6 +25,7 @@ #include #include #include +#include /* @@ -209,8 +210,10 @@ static void hmm_buffer_free(struct hmm_buffer *buffer) if (buffer == NULL) return; - if (buffer->ptr) + if (buffer->ptr) { munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + } free(buffer->mirror); free(buffer); } @@ -2657,4 +2660,196 @@ TEST_F(hmm, migrate_anon_huge_zero_err) buffer->ptr = old_ptr; hmm_buffer_free(buffer); } + +struct benchmark_results { + double sys_to_dev_time; + double dev_to_sys_time; + double throughput_s2d; + double throughput_d2s; +}; + +static double get_time_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000.0) + (tv.tv_usec / 1000.0); +} + +static inline struct hmm_buffer *hmm_buffer_alloc(unsigned long size) +{ + struct hmm_buffer *buffer; + + buffer = malloc(sizeof(*buffer)); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + memset(buffer->mirror, 0xFF, size); + return buffer; +} + +static void print_benchmark_results(const char *test_name, size_t buffer_size, + struct benchmark_results *thp, + struct benchmark_results *regular) +{ + double s2d_improvement = ((regular->sys_to_dev_time - thp->sys_to_dev_time) / + regular->sys_to_dev_time) * 100.0; + double d2s_improvement = ((regular->dev_to_sys_time - thp->dev_to_sys_time) / + regular->dev_to_sys_time) * 100.0; + double throughput_s2d_improvement = ((thp->throughput_s2d - regular->throughput_s2d) / + regular->throughput_s2d) * 100.0; + double throughput_d2s_improvement = ((thp->throughput_d2s - regular->throughput_d2s) / + regular->throughput_d2s) * 100.0; + + printf("\n=== %s (%.1f MB) ===\n", test_name, buffer_size / (1024.0 * 1024.0)); + printf(" | With THP | Without THP | Improvement\n"); + printf("---------------------------------------------------------------------\n"); + printf("Sys->Dev Migration | %.3f ms | %.3f ms | %.1f%%\n", + thp->sys_to_dev_time, regular->sys_to_dev_time, s2d_improvement); + printf("Dev->Sys Migration | %.3f ms | %.3f ms | %.1f%%\n", + thp->dev_to_sys_time, regular->dev_to_sys_time, d2s_improvement); + printf("S->D Throughput | %.2f GB/s | %.2f GB/s | %.1f%%\n", + thp->throughput_s2d, regular->throughput_s2d, throughput_s2d_improvement); + printf("D->S Throughput | %.2f GB/s | %.2f GB/s | %.1f%%\n", + thp->throughput_d2s, regular->throughput_d2s, throughput_d2s_improvement); +} + +/* + * Run a single migration benchmark + * fd: file descriptor for hmm device + * use_thp: whether to use THP + * buffer_size: size of buffer to allocate + * iterations: number of iterations + * results: where to store results + */ +static inline int 
run_migration_benchmark(int fd, int use_thp, size_t buffer_size, + int iterations, struct benchmark_results *results) +{ + struct hmm_buffer *buffer; + unsigned long npages = buffer_size / sysconf(_SC_PAGESIZE); + double start, end; + double s2d_total = 0, d2s_total = 0; + int ret, i; + int *ptr; + + buffer = hmm_buffer_alloc(buffer_size); + + /* Map memory */ + buffer->ptr = mmap(NULL, buffer_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (!buffer->ptr) + return -1; + + /* Apply THP hint if requested */ + if (use_thp) + ret = madvise(buffer->ptr, buffer_size, MADV_HUGEPAGE); + else + ret = madvise(buffer->ptr, buffer_size, MADV_NOHUGEPAGE); + + if (ret) + return ret; + + /* Initialize memory to make sure pages are allocated */ + ptr = (int *)buffer->ptr; + for (i = 0; i < buffer_size / sizeof(int); i++) + ptr[i] = i & 0xFF; + + /* Warmup iteration */ + ret = hmm_migrate_sys_to_dev(fd, buffer, npages); + if (ret) + return ret; + + ret = hmm_migrate_dev_to_sys(fd, buffer, npages); + if (ret) + return ret; + + /* Benchmark iterations */ + for (i = 0; i < iterations; i++) { + /* System to device migration */ + start = get_time_ms(); + + ret = hmm_migrate_sys_to_dev(fd, buffer, npages); + if (ret) + return ret; + + end = get_time_ms(); + s2d_total += (end - start); + + /* Device to system migration */ + start = get_time_ms(); + + ret = hmm_migrate_dev_to_sys(fd, buffer, npages); + if (ret) + return ret; + + end = get_time_ms(); + d2s_total += (end - start); + } + + /* Calculate average times and throughput */ + results->sys_to_dev_time = s2d_total / iterations; + results->dev_to_sys_time = d2s_total / iterations; + results->throughput_s2d = (buffer_size / (1024.0 * 1024.0 * 1024.0)) / + (results->sys_to_dev_time / 1000.0); + results->throughput_d2s = (buffer_size / (1024.0 * 1024.0 * 1024.0)) / + (results->dev_to_sys_time / 1000.0); + + /* Cleanup */ + hmm_buffer_free(buffer); + return 0; +} + +/* + * Benchmark THP migration with different buffer sizes + */ +TEST_F_TIMEOUT(hmm, benchmark_thp_migration, 120) +{ + struct benchmark_results thp_results, regular_results; + size_t thp_size = 2 * 1024 * 1024; /* 2MB - typical THP size */ + int iterations = 5; + + printf("\nHMM THP Migration Benchmark\n"); + printf("---------------------------\n"); + printf("System page size: %ld bytes\n", sysconf(_SC_PAGESIZE)); + + /* Test different buffer sizes */ + size_t test_sizes[] = { + thp_size / 4, /* 512KB - smaller than THP */ + thp_size / 2, /* 1MB - half THP */ + thp_size, /* 2MB - single THP */ + thp_size * 2, /* 4MB - two THPs */ + thp_size * 4, /* 8MB - four THPs */ + thp_size * 8, /* 16MB - eight THPs */ + thp_size * 128, /* 256MB - one twenty eight THPs */ + }; + + static const char *const test_names[] = { + "Small Buffer (512KB)", + "Half THP Size (1MB)", + "Single THP Size (2MB)", + "Two THP Size (4MB)", + "Four THP Size (8MB)", + "Eight THP Size (16MB)", + "One twenty eight THP Size (256MB)" + }; + + int num_tests = ARRAY_SIZE(test_sizes); + + /* Run all tests */ + for (int i = 0; i < num_tests; i++) { + /* Test with THP */ + ASSERT_EQ(run_migration_benchmark(self->fd, 1, test_sizes[i], + iterations, &thp_results), 0); + + /* Test without THP */ + ASSERT_EQ(run_migration_benchmark(self->fd, 0, test_sizes[i], + iterations, ®ular_results), 0); + + /* Print results */ + print_benchmark_results(test_names[i], test_sizes[i], + &thp_results, ®ular_results); + } +} TEST_HARNESS_MAIN From c3228747107705d47c7e9a03598a434a0380cb73 Mon Sep 17 00:00:00 2001 From: Balbir Singh 
Date: Wed, 1 Oct 2025 16:57:07 +1000 Subject: [PATCH 237/321] gpu/drm/nouveau: enable THP support for GPU memory migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable MIGRATE_VMA_SELECT_COMPOUND support in nouveau driver to take advantage of THP zone device migration capabilities. Update migration and eviction code paths to handle compound page sizes appropriately, improving memory bandwidth utilization and reducing migration overhead for large GPU memory allocations. [balbirs@nvidia.com: fix sparse error] Link: https://lkml.kernel.org/r/20251115003333.3516870-1-balbirs@nvidia.com Link: https://lkml.kernel.org/r/20251001065707.920170-17-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 306 ++++++++++++++++++------- drivers/gpu/drm/nouveau/nouveau_svm.c | 6 +- drivers/gpu/drm/nouveau/nouveau_svm.h | 3 +- 3 files changed, 231 insertions(+), 84 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index d34288ebe7d2..58071652679d 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -50,6 +50,7 @@ */ #define DMEM_CHUNK_SIZE (2UL << 20) #define DMEM_CHUNK_NPAGES (DMEM_CHUNK_SIZE >> PAGE_SHIFT) +#define NR_CHUNKS (128) enum nouveau_aper { NOUVEAU_APER_VIRT, @@ -83,9 +84,15 @@ struct nouveau_dmem { struct list_head chunks; struct mutex mutex; struct page *free_pages; + struct folio *free_folios; spinlock_t lock; }; +struct nouveau_dmem_dma_info { + dma_addr_t dma_addr; + size_t size; +}; + static struct nouveau_dmem_chunk *nouveau_page_to_chunk(struct page *page) { return container_of(page_pgmap(page), struct nouveau_dmem_chunk, @@ -115,8 +122,13 @@ static void nouveau_dmem_folio_free(struct folio *folio) struct nouveau_dmem *dmem = chunk->drm->dmem; spin_lock(&dmem->lock); - page->zone_device_data = dmem->free_pages; - dmem->free_pages = page; + if (folio_order(folio)) { + page->zone_device_data = dmem->free_folios; + dmem->free_folios = folio; + } else { + page->zone_device_data = dmem->free_pages; + dmem->free_pages = page; + } WARN_ON(!chunk->callocated); chunk->callocated--; @@ -140,20 +152,28 @@ static void nouveau_dmem_fence_done(struct nouveau_fence **fence) } } -static int nouveau_dmem_copy_one(struct nouveau_drm *drm, struct page *spage, - struct page *dpage, dma_addr_t *dma_addr) +static int nouveau_dmem_copy_folio(struct nouveau_drm *drm, + struct folio *sfolio, struct folio *dfolio, + struct nouveau_dmem_dma_info *dma_info) { struct device *dev = drm->dev->dev; + struct page *dpage = folio_page(dfolio, 0); + struct page *spage = folio_page(sfolio, 0); - lock_page(dpage); + folio_lock(dfolio); - *dma_addr = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); - if (dma_mapping_error(dev, *dma_addr)) + dma_info->dma_addr = dma_map_page(dev, dpage, 0, page_size(dpage), + DMA_BIDIRECTIONAL); + dma_info->size = page_size(dpage); + if (dma_mapping_error(dev, dma_info->dma_addr)) return -EIO; - if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, 
*dma_addr, - NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) { - dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + if (drm->dmem->migrate.copy_func(drm, folio_nr_pages(sfolio), + NOUVEAU_APER_HOST, dma_info->dma_addr, + NOUVEAU_APER_VRAM, + nouveau_dmem_page_addr(spage))) { + dma_unmap_page(dev, dma_info->dma_addr, page_size(dpage), + DMA_BIDIRECTIONAL); return -EIO; } @@ -166,21 +186,48 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) struct nouveau_dmem *dmem = drm->dmem; struct nouveau_fence *fence; struct nouveau_svmm *svmm; - struct page *spage, *dpage; - unsigned long src = 0, dst = 0; - dma_addr_t dma_addr = 0; + struct page *dpage; vm_fault_t ret = 0; + int err; struct migrate_vma args = { .vma = vmf->vma, - .start = vmf->address, - .end = vmf->address + PAGE_SIZE, - .src = &src, - .dst = &dst, .pgmap_owner = drm->dev, .fault_page = vmf->page, - .flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE, + .flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | + MIGRATE_VMA_SELECT_COMPOUND, + .src = NULL, + .dst = NULL, }; + unsigned int order, nr; + struct folio *sfolio, *dfolio; + struct nouveau_dmem_dma_info dma_info; + sfolio = page_folio(vmf->page); + order = folio_order(sfolio); + nr = 1 << order; + + /* + * Handle partial unmap faults, where the folio is large, but + * the pmd is split. + */ + if (vmf->pte) { + order = 0; + nr = 1; + } + + if (order) + args.flags |= MIGRATE_VMA_SELECT_COMPOUND; + + args.start = ALIGN_DOWN(vmf->address, (PAGE_SIZE << order)); + args.vma = vmf->vma; + args.end = args.start + (PAGE_SIZE << order); + args.src = kcalloc(nr, sizeof(*args.src), GFP_KERNEL); + args.dst = kcalloc(nr, sizeof(*args.dst), GFP_KERNEL); + + if (!args.src || !args.dst) { + ret = VM_FAULT_OOM; + goto err; + } /* * FIXME what we really want is to find some heuristic to migrate more * than just one page on CPU fault. 
When such fault happens it is very @@ -191,22 +238,28 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) if (!args.cpages) return 0; - spage = migrate_pfn_to_page(src); - if (!spage || !(src & MIGRATE_PFN_MIGRATE)) + if (order) + dpage = folio_page(vma_alloc_folio(GFP_HIGHUSER | __GFP_ZERO, + order, vmf->vma, vmf->address), 0); + else + dpage = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vmf->vma, + vmf->address); + if (!dpage) { + ret = VM_FAULT_OOM; goto done; + } - dpage = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vmf->vma, vmf->address); - if (!dpage) - goto done; + args.dst[0] = migrate_pfn(page_to_pfn(dpage)); + if (order) + args.dst[0] |= MIGRATE_PFN_COMPOUND; + dfolio = page_folio(dpage); - dst = migrate_pfn(page_to_pfn(dpage)); - - svmm = spage->zone_device_data; + svmm = folio_zone_device_data(sfolio); mutex_lock(&svmm->mutex); nouveau_svmm_invalidate(svmm, args.start, args.end); - ret = nouveau_dmem_copy_one(drm, spage, dpage, &dma_addr); + err = nouveau_dmem_copy_folio(drm, sfolio, dfolio, &dma_info); mutex_unlock(&svmm->mutex); - if (ret) { + if (err) { ret = VM_FAULT_SIGBUS; goto done; } @@ -214,25 +267,40 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) nouveau_fence_new(&fence, dmem->migrate.chan); migrate_vma_pages(&args); nouveau_dmem_fence_done(&fence); - dma_unmap_page(drm->dev->dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + dma_unmap_page(drm->dev->dev, dma_info.dma_addr, PAGE_SIZE, + DMA_BIDIRECTIONAL); done: migrate_vma_finalize(&args); +err: + kfree(args.src); + kfree(args.dst); return ret; } +static void nouveau_dmem_folio_split(struct folio *head, struct folio *tail) +{ + if (tail == NULL) + return; + tail->pgmap = head->pgmap; + tail->mapping = head->mapping; + folio_set_zone_device_data(tail, folio_zone_device_data(head)); +} + static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = { .folio_free = nouveau_dmem_folio_free, .migrate_to_ram = nouveau_dmem_migrate_to_ram, + .folio_split = nouveau_dmem_folio_split, }; static int -nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage) +nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage, + bool is_large) { struct nouveau_dmem_chunk *chunk; struct resource *res; struct page *page; void *ptr; - unsigned long i, pfn_first; + unsigned long i, pfn_first, pfn; int ret; chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); @@ -242,7 +310,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage) } /* Allocate unused physical address space for device private pages. 
*/ - res = request_free_mem_region(&iomem_resource, DMEM_CHUNK_SIZE, + res = request_free_mem_region(&iomem_resource, DMEM_CHUNK_SIZE * NR_CHUNKS, "nouveau_dmem"); if (IS_ERR(res)) { ret = PTR_ERR(res); @@ -275,16 +343,40 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage) pfn_first = chunk->pagemap.range.start >> PAGE_SHIFT; page = pfn_to_page(pfn_first); spin_lock(&drm->dmem->lock); - for (i = 0; i < DMEM_CHUNK_NPAGES - 1; ++i, ++page) { - page->zone_device_data = drm->dmem->free_pages; - drm->dmem->free_pages = page; + + pfn = pfn_first; + for (i = 0; i < NR_CHUNKS; i++) { + int j; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) || !is_large) { + for (j = 0; j < DMEM_CHUNK_NPAGES - 1; j++, pfn++) { + page = pfn_to_page(pfn); + page->zone_device_data = drm->dmem->free_pages; + drm->dmem->free_pages = page; + } + } else { + page = pfn_to_page(pfn); + page->zone_device_data = drm->dmem->free_folios; + drm->dmem->free_folios = page_folio(page); + pfn += DMEM_CHUNK_NPAGES; + } } - *ppage = page; + + /* Move to next page */ + if (is_large) { + *ppage = &drm->dmem->free_folios->page; + drm->dmem->free_folios = (*ppage)->zone_device_data; + } else { + *ppage = drm->dmem->free_pages; + drm->dmem->free_pages = (*ppage)->zone_device_data; + } + chunk->callocated++; spin_unlock(&drm->dmem->lock); - NV_INFO(drm, "DMEM: registered %ldMB of device memory\n", - DMEM_CHUNK_SIZE >> 20); + NV_INFO(drm, "DMEM: registered %ldMB of %sdevice memory %lx %lx\n", + NR_CHUNKS * DMEM_CHUNK_SIZE >> 20, is_large ? "THP " : "", pfn_first, + nouveau_dmem_page_addr(page)); return 0; @@ -299,27 +391,41 @@ out: } static struct page * -nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) +nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm, bool is_large) { struct nouveau_dmem_chunk *chunk; struct page *page = NULL; + struct folio *folio = NULL; int ret; + unsigned int order = 0; spin_lock(&drm->dmem->lock); - if (drm->dmem->free_pages) { + if (is_large && drm->dmem->free_folios) { + folio = drm->dmem->free_folios; + page = &folio->page; + drm->dmem->free_folios = page->zone_device_data; + chunk = nouveau_page_to_chunk(&folio->page); + chunk->callocated++; + spin_unlock(&drm->dmem->lock); + order = ilog2(DMEM_CHUNK_NPAGES); + } else if (!is_large && drm->dmem->free_pages) { page = drm->dmem->free_pages; drm->dmem->free_pages = page->zone_device_data; chunk = nouveau_page_to_chunk(page); chunk->callocated++; spin_unlock(&drm->dmem->lock); + folio = page_folio(page); } else { spin_unlock(&drm->dmem->lock); - ret = nouveau_dmem_chunk_alloc(drm, &page); + ret = nouveau_dmem_chunk_alloc(drm, &page, is_large); if (ret) return NULL; + folio = page_folio(page); + if (is_large) + order = ilog2(DMEM_CHUNK_NPAGES); } - zone_device_page_init(page, 0); + zone_device_folio_init(folio, order); return page; } @@ -370,12 +476,12 @@ nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk) { unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT; unsigned long *src_pfns, *dst_pfns; - dma_addr_t *dma_addrs; + struct nouveau_dmem_dma_info *dma_info; struct nouveau_fence *fence; src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); - dma_addrs = kvcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL | __GFP_NOFAIL); + dma_info = kvcalloc(npages, sizeof(*dma_info), GFP_KERNEL | __GFP_NOFAIL); migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT, npages); @@ -383,17 +489,28 @@ 
nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk) for (i = 0; i < npages; i++) { if (src_pfns[i] & MIGRATE_PFN_MIGRATE) { struct page *dpage; + struct folio *folio = page_folio( + migrate_pfn_to_page(src_pfns[i])); + unsigned int order = folio_order(folio); + + if (src_pfns[i] & MIGRATE_PFN_COMPOUND) { + dpage = folio_page( + folio_alloc( + GFP_HIGHUSER_MOVABLE, order), 0); + } else { + /* + * _GFP_NOFAIL because the GPU is going away and there + * is nothing sensible we can do if we can't copy the + * data back. + */ + dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL); + } - /* - * _GFP_NOFAIL because the GPU is going away and there - * is nothing sensible we can do if we can't copy the - * data back. - */ - dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL); dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)); - nouveau_dmem_copy_one(chunk->drm, - migrate_pfn_to_page(src_pfns[i]), dpage, - &dma_addrs[i]); + nouveau_dmem_copy_folio(chunk->drm, + page_folio(migrate_pfn_to_page(src_pfns[i])), + page_folio(dpage), + &dma_info[i]); } } @@ -404,8 +521,9 @@ nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk) kvfree(src_pfns); kvfree(dst_pfns); for (i = 0; i < npages; i++) - dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL); - kvfree(dma_addrs); + dma_unmap_page(chunk->drm->dev->dev, dma_info[i].dma_addr, + dma_info[i].size, DMA_BIDIRECTIONAL); + kvfree(dma_info); } void @@ -608,31 +726,36 @@ nouveau_dmem_init(struct nouveau_drm *drm) static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, struct nouveau_svmm *svmm, unsigned long src, - dma_addr_t *dma_addr, u64 *pfn) + struct nouveau_dmem_dma_info *dma_info, u64 *pfn) { struct device *dev = drm->dev->dev; struct page *dpage, *spage; unsigned long paddr; + bool is_large = false; + unsigned long mpfn; spage = migrate_pfn_to_page(src); if (!(src & MIGRATE_PFN_MIGRATE)) goto out; - dpage = nouveau_dmem_page_alloc_locked(drm); + is_large = src & MIGRATE_PFN_COMPOUND; + dpage = nouveau_dmem_page_alloc_locked(drm, is_large); if (!dpage) goto out; paddr = nouveau_dmem_page_addr(dpage); if (spage) { - *dma_addr = dma_map_page(dev, spage, 0, page_size(spage), + dma_info->dma_addr = dma_map_page(dev, spage, 0, page_size(spage), DMA_BIDIRECTIONAL); - if (dma_mapping_error(dev, *dma_addr)) + dma_info->size = page_size(spage); + if (dma_mapping_error(dev, dma_info->dma_addr)) goto out_free_page; - if (drm->dmem->migrate.copy_func(drm, 1, - NOUVEAU_APER_VRAM, paddr, NOUVEAU_APER_HOST, *dma_addr)) + if (drm->dmem->migrate.copy_func(drm, folio_nr_pages(page_folio(spage)), + NOUVEAU_APER_VRAM, paddr, NOUVEAU_APER_HOST, + dma_info->dma_addr)) goto out_dma_unmap; } else { - *dma_addr = DMA_MAPPING_ERROR; + dma_info->dma_addr = DMA_MAPPING_ERROR; if (drm->dmem->migrate.clear_func(drm, page_size(dpage), NOUVEAU_APER_VRAM, paddr)) goto out_free_page; @@ -643,10 +766,13 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, ((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT); if (src & MIGRATE_PFN_WRITE) *pfn |= NVIF_VMM_PFNMAP_V0_W; - return migrate_pfn(page_to_pfn(dpage)); + mpfn = migrate_pfn(page_to_pfn(dpage)); + if (folio_order(page_folio(dpage))) + mpfn |= MIGRATE_PFN_COMPOUND; + return mpfn; out_dma_unmap: - dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + dma_unmap_page(dev, dma_info->dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); out_free_page: nouveau_dmem_page_free_locked(drm, dpage); out: @@ -656,27 +782,38 @@ out: static void nouveau_dmem_migrate_chunk(struct 
nouveau_drm *drm, struct nouveau_svmm *svmm, struct migrate_vma *args, - dma_addr_t *dma_addrs, u64 *pfns) + struct nouveau_dmem_dma_info *dma_info, u64 *pfns) { struct nouveau_fence *fence; unsigned long addr = args->start, nr_dma = 0, i; + unsigned long order = 0; + + for (i = 0; addr < args->end; ) { + struct folio *folio; - for (i = 0; addr < args->end; i++) { args->dst[i] = nouveau_dmem_migrate_copy_one(drm, svmm, - args->src[i], dma_addrs + nr_dma, pfns + i); - if (!dma_mapping_error(drm->dev->dev, dma_addrs[nr_dma])) + args->src[i], dma_info + nr_dma, pfns + i); + if (!args->dst[i]) { + i++; + addr += PAGE_SIZE; + continue; + } + if (!dma_mapping_error(drm->dev->dev, dma_info[nr_dma].dma_addr)) nr_dma++; - addr += PAGE_SIZE; + folio = page_folio(migrate_pfn_to_page(args->dst[i])); + order = folio_order(folio); + i += 1 << order; + addr += (1 << order) * PAGE_SIZE; } nouveau_fence_new(&fence, drm->dmem->migrate.chan); migrate_vma_pages(args); nouveau_dmem_fence_done(&fence); - nouveau_pfns_map(svmm, args->vma->vm_mm, args->start, pfns, i); + nouveau_pfns_map(svmm, args->vma->vm_mm, args->start, pfns, i, order); while (nr_dma--) { - dma_unmap_page(drm->dev->dev, dma_addrs[nr_dma], PAGE_SIZE, - DMA_BIDIRECTIONAL); + dma_unmap_page(drm->dev->dev, dma_info[nr_dma].dma_addr, + dma_info[nr_dma].size, DMA_BIDIRECTIONAL); } migrate_vma_finalize(args); } @@ -689,20 +826,27 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, unsigned long end) { unsigned long npages = (end - start) >> PAGE_SHIFT; - unsigned long max = min(SG_MAX_SINGLE_ALLOC, npages); - dma_addr_t *dma_addrs; + unsigned long max = npages; struct migrate_vma args = { .vma = vma, .start = start, .pgmap_owner = drm->dev, - .flags = MIGRATE_VMA_SELECT_SYSTEM, + .flags = MIGRATE_VMA_SELECT_SYSTEM + | MIGRATE_VMA_SELECT_COMPOUND, }; unsigned long i; u64 *pfns; int ret = -ENOMEM; + struct nouveau_dmem_dma_info *dma_info; - if (drm->dmem == NULL) - return -ENODEV; + if (drm->dmem == NULL) { + ret = -ENODEV; + goto out; + } + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + if (max > (unsigned long)HPAGE_PMD_NR) + max = (unsigned long)HPAGE_PMD_NR; args.src = kcalloc(max, sizeof(*args.src), GFP_KERNEL); if (!args.src) @@ -711,8 +855,8 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, if (!args.dst) goto out_free_src; - dma_addrs = kmalloc_array(max, sizeof(*dma_addrs), GFP_KERNEL); - if (!dma_addrs) + dma_info = kmalloc_array(max, sizeof(*dma_info), GFP_KERNEL); + if (!dma_info) goto out_free_dst; pfns = nouveau_pfns_alloc(max); @@ -730,7 +874,7 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, goto out_free_pfns; if (args.cpages) - nouveau_dmem_migrate_chunk(drm, svmm, &args, dma_addrs, + nouveau_dmem_migrate_chunk(drm, svmm, &args, dma_info, pfns); args.start = args.end; } @@ -739,7 +883,7 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, out_free_pfns: nouveau_pfns_free(pfns); out_free_dma: - kfree(dma_addrs); + kfree(dma_info); out_free_dst: kfree(args.dst); out_free_src: diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 6fa387da0637..b8a3378154d5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -921,12 +921,14 @@ nouveau_pfns_free(u64 *pfns) void nouveau_pfns_map(struct nouveau_svmm *svmm, struct mm_struct *mm, - unsigned long addr, u64 *pfns, unsigned long npages) + unsigned long addr, u64 *pfns, unsigned long npages, + unsigned int page_shift) { struct nouveau_pfnmap_args *args = nouveau_pfns_to_args(pfns); args->p.addr = 
addr; - args->p.size = npages << PAGE_SHIFT; + args->p.size = npages << page_shift; + args->p.page = page_shift; mutex_lock(&svmm->mutex); diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.h b/drivers/gpu/drm/nouveau/nouveau_svm.h index e7d63d7f0c2d..3fd78662f17e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.h +++ b/drivers/gpu/drm/nouveau/nouveau_svm.h @@ -33,7 +33,8 @@ void nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit); u64 *nouveau_pfns_alloc(unsigned long npages); void nouveau_pfns_free(u64 *pfns); void nouveau_pfns_map(struct nouveau_svmm *svmm, struct mm_struct *mm, - unsigned long addr, u64 *pfns, unsigned long npages); + unsigned long addr, u64 *pfns, unsigned long npages, + unsigned int page_shift); #else /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */ static inline void nouveau_svm_init(struct nouveau_drm *drm) {} static inline void nouveau_svm_fini(struct nouveau_drm *drm) {} From 2a1351cd4176ee1809b0900d386919d03b7652f8 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 8 Oct 2025 09:54:52 +0000 Subject: [PATCH 238/321] mm/huge_memory: add pmd folio to ds_queue in do_huge_zero_wp_pmd() We add pmd folio into ds_queue on the first page fault in __do_huge_pmd_anonymous_page(), so that we can split it in case of memory pressure. This should be the same for a pmd folio during wp page fault. Commit 1ced09e0331f ("mm: allocate THP on hugezeropage wp-fault") miss to add it to ds_queue, which means system may not reclaim enough memory in case of memory pressure even the pmd folio is under used. Move deferred_split_folio() into map_anon_folio_pmd() to make the pmd folio installation consistent. Link: https://lkml.kernel.org/r/20251008095453.18772-2-richard.weiyang@gmail.com Fixes: 1ced09e0331f ("mm: allocate THP on hugezeropage wp-fault") Signed-off-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Lance Yang Reviewed-by: Dev Jain Acked-by: Usama Arif Reviewed-by: Zi Yan Reviewed-by: Baolin Wang Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 81e511f1ed26..a2a2fda2bff8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1233,6 +1233,7 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, count_vm_event(THP_FAULT_ALLOC); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + deferred_split_folio(folio, false); } static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) @@ -1273,7 +1274,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); mm_inc_nr_ptes(vma->vm_mm); - deferred_split_folio(folio, false); spin_unlock(vmf->ptl); } From ac7756771a34f19c9a757eb86efe028e51f57b23 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 8 Oct 2025 09:54:53 +0000 Subject: [PATCH 239/321] mm/khugepaged: unify pmd folio installation with map_anon_folio_pmd() Currently we install pmd folio with map_anon_folio_pmd() in __do_huge_pmd_anonymous_page() and do_huge_zero_wp_pmd(). While in collapse_huge_page(), it is done with identical code except statistics adjustment. Unify the process with map_anon_folio_pmd() to install pmd folio. Split it to map_anon_folio_pmd_pf() and map_anon_folio_pmd_nopf() to be used in page fault or not respectively. No functional change is intended. 
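As a rough illustration (ordinary userspace C, not the kernel code being modified), the layering introduced here is a shared core installer plus a page-fault wrapper that stacks the fault-only accounting on top, so collapse_huge_page() and the fault paths stop duplicating the installation sequence. The printf calls merely stand in for the real pmd installation and counters.

/* Toy model only, not kernel code. */
#include <stdio.h>

static void map_anon_pmd_core(const char *caller)
{
	/* stands in for: mk_pmd, rmap, LRU add, set_pmd_at, deferred-split queueing */
	printf("%s: install pmd mapping, queue folio for deferred split\n", caller);
}

static void map_anon_pmd_fault(const char *caller)
{
	map_anon_pmd_core(caller);
	/* stands in for: MM_ANONPAGES counter and THP_FAULT_ALLOC events */
	printf("%s: account the page fault\n", caller);
}

int main(void)
{
	map_anon_pmd_fault("anonymous fault");		/* __do_huge_pmd_anonymous_page() */
	map_anon_pmd_fault("zero-page wp fault");	/* do_huge_zero_wp_pmd() */
	map_anon_pmd_core("khugepaged collapse");	/* collapse_huge_page() */
	return 0;
}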
[akpm@linux-foundation.org: remove unneeded map_anon_folio_pmd_nopf() stub, per Wei & David] Link: https://lkml.kernel.org/r/20251008095453.18772-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: Lance Yang Cc: David Hildenbrand Cc: Lance Yang Cc: Dev Jain Cc: Zi Yan Cc: Usama Arif Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ mm/huge_memory.c | 14 ++++++++++---- mm/khugepaged.c | 9 +-------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ed99e6bd31ac..396d9e3d1d46 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -533,6 +533,8 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze); bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio); +void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a2a2fda2bff8..05bf419513ad 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1218,7 +1218,7 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, return folio; } -static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, +void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, struct vm_area_struct *vma, unsigned long haddr) { pmd_t entry; @@ -1229,11 +1229,17 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, folio_add_lru_vma(folio, vma); set_pmd_at(vma->vm_mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, haddr, pmd); + deferred_split_folio(folio, false); +} + +static void map_anon_folio_pmd_pf(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr) +{ + map_anon_folio_pmd_nopf(folio, pmd, vma, haddr); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); count_vm_event(THP_FAULT_ALLOC); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); - deferred_split_folio(folio, false); } static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) @@ -1272,7 +1278,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); } @@ -1944,7 +1950,7 @@ static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) if (ret) goto release; (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd); - map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); goto unlock; release: folio_put(folio); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1b5c2e942df9..af1c162c9a94 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1226,17 +1226,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); - _pmd = folio_mk_pmd(folio, vma->vm_page_prot); - _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache_pmd(vma, address, pmd); - 
deferred_split_folio(folio, false); + map_anon_folio_pmd_nopf(folio, pmd, vma, address); spin_unlock(pmd_ptl); folio = NULL; From d87f4a8f19668cdc5b8afd0d751e9d9c6a1b7595 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 10 Oct 2025 14:11:42 +0000 Subject: [PATCH 240/321] mm/huge_memory: only get folio_order() once during __folio_split() Before splitting folio, its order keeps the same. It is only necessary to get folio_order() once. Also rename order to old_order to represent the original folio order. Link: https://lkml.kernel.org/r/20251010141142.1349-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Lance Yang Acked-by: David Hildenbrand Cc: Zi Yan Cc: Dev Jain Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Barry Song Signed-off-by: Andrew Morton --- mm/huge_memory.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 05bf419513ad..30d6afc79016 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3682,7 +3682,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; struct anon_vma *anon_vma = NULL; - int order = folio_order(folio); + int old_order = folio_order(folio); struct folio *new_folio, *next; int nr_shmem_dropped = 0; int remap_flags = 0; @@ -3706,7 +3706,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (!is_anon && !folio->mapping) return -EBUSY; - if (new_order >= folio_order(folio)) + if (new_order >= old_order) return -EINVAL; if (uniform_split && !uniform_split_supported(folio, new_order, true)) @@ -3764,7 +3764,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (uniform_split) { xas_set_order(&xas, folio->index, new_order); - xas_split_alloc(&xas, folio, folio_order(folio), gfp); + xas_split_alloc(&xas, folio, old_order, gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; @@ -3820,13 +3820,13 @@ static int __folio_split(struct folio *folio, unsigned int new_order, struct lruvec *lruvec; int expected_refs; - if (folio_order(folio) > 1 && + if (old_order > 1 && !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { folio_clear_partially_mapped(folio); - mod_mthp_stat(folio_order(folio), - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + mod_mthp_stat(old_order, + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } /* * Reinitialize page_deferred_list after removing the @@ -3954,7 +3954,7 @@ fail: if (!ret && is_anon && !folio_is_device_private(folio)) remap_flags = RMP_USE_SHARED_ZEROPAGE; - remap_page(folio, 1 << order, remap_flags); + remap_page(folio, 1 << old_order, remap_flags); /* * Unlock all after-split folios except the one containing @@ -3985,9 +3985,9 @@ out_unlock: i_mmap_unlock_read(mapping); out: xas_destroy(&xas); - if (order == HPAGE_PMD_ORDER) + if (old_order == HPAGE_PMD_ORDER) count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); - count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); + count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); return ret; } From a7ef12c64fd991c0f42b2e1bf0c4f09068575864 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 31 Oct 2025 12:19:59 -0400 Subject: [PATCH 241/321] mm/huge_memory: add split_huge_page_to_order() Patch series "Optimize folio split in memory failure", v5. 
This patchset optimizes folio split operations in the memory failure code
by always splitting a folio to min_order_for_split() to minimize unusable
pages, even if min_order_for_split() is non-zero and the memory failure
code would eventually take the failure path for a successfully split
folio. This means that instead of making the entire original folio
unusable, the memory failure code only makes the after-split folio that
has order min_order_for_split() and contains the HWPoison page unusable.

For the soft offline case, no split is performed if the folio cannot be
split to order-0, since the original folio is still accessible and a
premature split might cause a performance loss.

In addition, add split_huge_page_to_order() to improve code readability
and fix the kernel-doc comment format for folio_split() and other related
functions.

Background
==========

This patchset is a follow-up of "[PATCH v3] mm/huge_memory: do not change
split_huge_page*() target order silently."[1] and "[PATCH v4]
mm/huge_memory: preserve PG_has_hwpoisoned if a folio is split to >0
order"[2], since both were separated out as hotfixes. It improves how the
memory failure code handles large block size (LBS) folios with
min_order_for_split() > 0. By splitting a large folio containing HW
poisoned pages to min_order_for_split(), the after-split folios without
HW poisoned pages can be freed for reuse. To achieve this, the folio
split code needs to set has_hwpoisoned on after-split folios containing
HW poisoned pages; that is done by the hotfix in [2].

This patchset includes:

1. A patch that adds split_huge_page_to_order(),
2. Patch 2 and Patch 3 of "[PATCH v2 0/3] Do not change split folio
   target order"[3].

This patch (of 3):

When the caller does not supply a list to
split_huge_page_to_list_to_order(), use split_huge_page_to_order()
instead.
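As a toy model (plain userspace C, not the kernel interface itself), the shape of the new wrapper is simply a name that fixes the list argument to NULL, so list-less call sites read as what they mean:

/* Toy model only, not kernel code. */
#include <stddef.h>
#include <stdio.h>

struct page;	/* opaque stand-in */

static int split_page_to_list_to_order(struct page *page, void *list,
					unsigned int new_order)
{
	printf("split page %p to order %u (list=%p)\n",
	       (void *)page, new_order, list);
	return 0;
}

static int split_page_to_order(struct page *page, unsigned int new_order)
{
	return split_page_to_list_to_order(page, NULL, new_order);
}

int main(void)
{
	return split_page_to_order(NULL, 0);
}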
Link: https://lkml.kernel.org/r/20251031162001.670503-1-ziy@nvidia.com Link: https://lkml.kernel.org/r/20251031162001.670503-2-ziy@nvidia.com Link: https://lore.kernel.org/all/20251017013630.139907-1-ziy@nvidia.com/ [1] Link: https://lore.kernel.org/all/20251023030521.473097-1-ziy@nvidia.com/ [2] Link: https://lore.kernel.org/all/20251016033452.125479-1-ziy@nvidia.com/ [3] Signed-off-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Wei Yang Reviewed-by: Miaohe Lin Reviewed-by: Barry Song Reviewed-by: Lance Yang Cc: Baolin Wang Cc: Dev Jain Cc: Jane Chu Cc: Liam Howlett Cc: Luis Chamberalin Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Cc: Nico Pache Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 396d9e3d1d46..a06924cf4065 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -381,6 +381,10 @@ static inline int split_huge_page_to_list_to_order(struct page *page, struct lis { return __split_huge_page_to_list_to_order(page, list, new_order, false); } +static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) +{ + return split_huge_page_to_list_to_order(page, NULL, new_order); +} /* * try_folio_split_to_order - try to split a @folio at @page to @new_order using @@ -400,8 +404,7 @@ static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) { if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) - return split_huge_page_to_list_to_order(&folio->page, NULL, - new_order); + return split_huge_page_to_order(&folio->page, new_order); return folio_split(folio, new_order, page, NULL); } static inline int split_huge_page(struct page *page) @@ -587,6 +590,11 @@ split_huge_page_to_list_to_order(struct page *page, struct list_head *list, VM_WARN_ON_ONCE_PAGE(1, page); return -EINVAL; } +static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) +{ + VM_WARN_ON_ONCE_PAGE(1, page); + return -EINVAL; +} static inline int split_huge_page(struct page *page) { VM_WARN_ON_ONCE_PAGE(1, page); From 689b8986776c823161fb4955cc7dc303f78a1962 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 31 Oct 2025 12:20:00 -0400 Subject: [PATCH 242/321] mm/memory-failure: improve large block size folio handling Large block size (LBS) folios cannot be split to order-0 folios but min_order_for_folio(). Current split fails directly, but that is not optimal. Split the folio to min_order_for_folio(), so that, after split, only the folio containing the poisoned page becomes unusable instead. For soft offline, do not split the large folio if its min_order_for_folio() is not 0. Since the folio is still accessible from userspace and premature split might lead to potential performance loss. 
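To make the gain concrete, the arithmetic below (plain userspace C with made-up pfn and order values) shows how much of an old-order folio remains reusable when only the min-order block containing the poisoned page is kept unusable:

/* Illustrative arithmetic only; no kernel APIs, example numbers are invented. */
#include <stdio.h>

int main(void)
{
	unsigned int old_order = 9;		/* e.g. a 2MB folio of 4KB pages */
	unsigned int min_order = 4;		/* e.g. a 64KB block-size mapping */
	unsigned long base_pfn = 0x1000;	/* hypothetical folio start */
	unsigned long bad_pfn = 0x1037;		/* hypothetical poisoned page */
	unsigned long nr_after = 1UL << min_order;
	unsigned long bad_block = (bad_pfn - base_pfn) / nr_after;

	printf("unusable after split: pfn 0x%lx-0x%lx (%lu pages)\n",
	       base_pfn + bad_block * nr_after,
	       base_pfn + (bad_block + 1) * nr_after - 1, nr_after);
	printf("freed for reuse:      %lu of %lu pages\n",
	       (1UL << old_order) - nr_after, 1UL << old_order);
	return 0;
}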
Link: https://lkml.kernel.org/r/20251031162001.670503-3-ziy@nvidia.com Signed-off-by: Zi Yan Suggested-by: Jane Chu Reviewed-by: Luis Chamberlain Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Wei Yang Reviewed-by: Miaohe Lin Reviewed-by: Barry Song Reviewed-by: Lance Yang Cc: Baolin Wang Cc: Dev Jain Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Cc: Nico Pache Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 77391b6f9f76..1f7fb9bf287a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1659,12 +1659,13 @@ static int identify_page_state(unsigned long pfn, struct page *p, * there is still more to do, hence the page refcount we took earlier * is still needed. */ -static int try_to_split_thp_page(struct page *page, bool release) +static int try_to_split_thp_page(struct page *page, unsigned int new_order, + bool release) { int ret; lock_page(page); - ret = split_huge_page(page); + ret = split_huge_page_to_order(page, new_order); unlock_page(page); if (ret && release) @@ -2420,6 +2421,9 @@ try_again: folio_unlock(folio); if (folio_test_large(folio)) { + const int new_order = min_order_for_split(folio); + int err; + /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. @@ -2434,7 +2438,16 @@ try_again: * page is a valid handlable page. */ folio_set_has_hwpoisoned(folio); - if (try_to_split_thp_page(p, false) < 0) { + err = try_to_split_thp_page(p, new_order, /* release= */ false); + /* + * If splitting a folio to order-0 fails, kill the process. + * Split the folio regardless to minimize unusable pages. + * Because the memory failure code cannot handle large + * folios, this split is always treated as if it failed. + */ + if (err || new_order) { + /* get folio again in case the original one is split */ + folio = page_folio(p); res = -EHWPOISON; kill_procs_now(p, pfn, flags, folio); put_page(p); @@ -2761,7 +2774,17 @@ static int soft_offline_in_use_page(struct page *page) }; if (!huge && folio_test_large(folio)) { - if (try_to_split_thp_page(page, true)) { + const int new_order = min_order_for_split(folio); + + /* + * If new_order (target split order) is not 0, do not split the + * folio at all to retain the still accessible large folio. + * NOTE: if minimizing the number of soft offline pages is + * preferred, split it to non-zero new_order like it is done in + * memory_failure(). + */ + if (new_order || try_to_split_thp_page(page, /* new_order= */ 0, + /* release= */ true)) { pr_info("%#lx: thp split failed\n", pfn); return -EBUSY; } From 50d0598cf2c9d33e1f08c3b1a357752ea8a9b94a Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 31 Oct 2025 12:20:01 -0400 Subject: [PATCH 243/321] mm/huge_memory: fix kernel-doc comments for folio_split() and related try_folio_split_to_order(), folio_split, __folio_split(), and __split_unmapped_folio() do not have correct kernel-doc comment format. Fix them. 
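For reference, a minimal sketch of the kernel-doc layout these comments are converted to: a "/**" opener, a "name() - summary" line, "@param:" descriptions, and a "Return:" section. The helper itself is invented purely for illustration and compiles as plain C.

/* Invented example helper, only to show the kernel-doc layout. */
#include <errno.h>

/**
 * clamp_split_order() - clamp a requested split order to a supported minimum
 * @requested: order asked for by the caller
 * @minimum: lowest order the backing mapping supports
 *
 * Orders below @minimum cannot be produced, so they are raised to @minimum.
 *
 * Return: the clamped order, or -EINVAL if @requested is negative.
 */
static int clamp_split_order(int requested, int minimum)
{
	if (requested < 0)
		return -EINVAL;
	return requested < minimum ? minimum : requested;
}

int main(void)
{
	return clamp_split_order(0, 0);
}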
[ziy@nvidia.com: kernel-doc fixup] Link: https://lkml.kernel.org/r/BE7AC5F3-9E64-4923-861D-C2C4E0CB91EB@nvidia.com [ziy@nvidia.com: add newline to fix an error and a warning from docutils] Link: https://lkml.kernel.org/r/040B38C0-23C6-4AEA-B069-69AE6DAA828B@nvidia.com Link: https://lkml.kernel.org/r/20251031162001.670503-4-ziy@nvidia.com Signed-off-by: Zi Yan Reviewed-by: Lorenzo Stoakes Reviewed-by: Lance Yang Reviewed-by: Barry Song Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Dev Jain Cc: Jane Chu Cc: Liam Howlett Cc: Luis Chamberalin Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Cc: Nico Pache Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 ++++---- mm/huge_memory.c | 52 ++++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a06924cf4065..9f7f7d772fe5 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -386,9 +386,9 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o return split_huge_page_to_list_to_order(page, NULL, new_order); } -/* - * try_folio_split_to_order - try to split a @folio at @page to @new_order using - * non uniform split. +/** + * try_folio_split_to_order() - try to split a @folio at @page to @new_order + * using non uniform split. * @folio: folio to be split * @page: split to @new_order at the given page * @new_order: the target split order @@ -398,7 +398,7 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o * folios are put back to LRU list. Use min_order_for_split() to get the lower * bound of @new_order. * - * Return: 0: split is successful, otherwise split failed. + * Return: 0 - split is successful, otherwise split failed. */ static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) @@ -483,6 +483,8 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, /** * folio_test_pmd_mappable - Can we map this folio with a PMD? * @folio: The folio to test + * + * Return: true - @folio can be mapped, false - @folio cannot be mapped. */ static inline bool folio_test_pmd_mappable(struct folio *folio) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 30d6afc79016..3d87127c02cf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3493,8 +3493,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order, ClearPageCompound(&folio->page); } -/* - * It splits an unmapped @folio to lower order smaller folios in two ways. +/** + * __split_unmapped_folio() - splits an unmapped @folio to lower order folios in + * two ways: uniform split or non-uniform split. * @folio: the to-be-split folio * @new_order: the smallest order of the after split folios (since buddy * allocator like split generates folios with orders from @folio's @@ -3511,26 +3512,27 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * uniform_split is true. * 2. buddy allocator like (non-uniform) split: the given @folio is split into * half and one of the half (containing the given page) is split into half - * until the given @page's order becomes @new_order. This is done when + * until the given @folio's order becomes @new_order. This is done when * uniform_split is false. * * The high level flow for these two methods are: - * 1. 
uniform split: a single __split_folio_to_order() is called to split the - * @folio into @new_order, then we traverse all the resulting folios one by - * one in PFN ascending order and perform stats, unfreeze, adding to list, - * and file mapping index operations. - * 2. non-uniform split: in general, folio_order - @new_order calls to - * __split_folio_to_order() are made in a for loop to split the @folio - * to one lower order at a time. The resulting small folios are processed - * like what is done during the traversal in 1, except the one containing - * @page, which is split in next for loop. + * + * 1. uniform split: @xas is split with no expectation of failure and a single + * __split_folio_to_order() is called to split the @folio into @new_order + * along with stats update. + * 2. non-uniform split: folio_order - @new_order calls to + * __split_folio_to_order() are expected to be made in a for loop to split + * the @folio to one lower order at a time. The folio containing @split_at + * is split in each iteration. @xas is split into half in each iteration and + * can fail. A failed @xas split leaves split folios as is without merging + * them back. * * After splitting, the caller's folio reference will be transferred to the - * folio containing @page. The caller needs to unlock and/or free after-split - * folios if necessary. + * folio containing @split_at. The caller needs to unlock and/or free + * after-split folios if necessary. * - * For !uniform_split, when -ENOMEM is returned, the original folio might be - * split. The caller needs to check the input folio. + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be + * split but not to @new_order, the caller needs to check) */ static int __split_unmapped_folio(struct folio *folio, int new_order, struct page *split_at, struct xa_state *xas, @@ -3650,8 +3652,8 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, return true; } -/* - * __folio_split: split a folio at @split_at to a @new_order folio +/** + * __folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split * @new_order: the order of the new folio * @split_at: a page within the new folio @@ -3669,7 +3671,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, * 1. for uniform split, @lock_at points to one of @folio's subpages; * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio. * - * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be * split but not to @new_order, the caller needs to check) */ static int __folio_split(struct folio *folio, unsigned int new_order, @@ -4047,14 +4049,13 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list unmapped); } -/* - * folio_split: split a folio at @split_at to a @new_order folio +/** + * folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split * @new_order: the order of the new folio * @split_at: a page within the new folio - * - * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be - * split but not to @new_order, the caller needs to check) + * @list: after-split folios are added to @list if not null, otherwise to LRU + * list * * It has the same prerequisites and returns as * split_huge_page_to_list_to_order(). 
@@ -4068,6 +4069,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8]. * * After split, folio is left locked for caller. + * + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be + * split but not to @new_order, the caller needs to check) */ int folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct list_head *list) From fe9d31fd1ab6cb53e45d9d6b0bb7a62d8365fe2b Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Sat, 8 Nov 2025 19:53:46 +0800 Subject: [PATCH 244/321] mm/hmm/test: fix error handling in dmirror_device_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dmirror_device_init() calls device_initialize() which sets the device reference count to 1, but fails to call put_device() when error occurs after dev_set_name() or cdev_device_add() failures. This results in memory leaks of struct device objects. Additionally, dmirror_device_remove() lacks the final put_device() call to properly release the device reference. Found by code review. Link: https://lkml.kernel.org/r/20251108115346.6368-1-make24@iscas.ac.cn Fixes: 6a760f58c792 ("mm/hmm/test: use char dev with struct device to get device node") Signed-off-by: Ma Ke Cc: Haoxiang Li Cc: Jason Gunthorpe Cc: John Hubbard Cc: Leon Romanovsky Cc: Mika Penttilä Signed-off-by: Andrew Morton --- lib/test_hmm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 72a8b2f38d8a..8af169d3873a 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1740,20 +1740,25 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) ret = dev_set_name(&mdevice->device, "hmm_dmirror%u", id); if (ret) - return ret; + goto put_device; ret = cdev_device_add(&mdevice->cdevice, &mdevice->device); if (ret) - return ret; + goto put_device; /* Build a list of free ZONE_DEVICE struct pages */ return dmirror_allocate_chunk(mdevice, NULL, false); + +put_device: + put_device(&mdevice->device); + return ret; } static void dmirror_device_remove(struct dmirror_device *mdevice) { dmirror_device_remove_chunks(mdevice); cdev_device_del(&mdevice->cdevice, &mdevice->device); + put_device(&mdevice->device); } static int __init hmm_dmirror_init(void) From c467061fbb6eb483d59f546c145b2ff2249455e4 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Nov 2025 03:41:54 +0000 Subject: [PATCH 245/321] mm/huge_memory: introduce enum split_type for clarity Patch series "mm/huge_memory: Define split_type and consolidate split support checks", v3. This two-patch series focuses on improving code clarity and removing redundancy in the huge memory handling logic related to folio splitting. The series is based on an original proposal to merge two significantly identical functions that check folio split support[1]. During this process, we found an opportunity to improve readability by explicitly defining the split types. Patch 1: define split_type and use it Patch 2: merge uniform_split_supported() and non_uniform_split_supported() This patch (of 2): We currently handle two distinct types of large folio splitting: * uniform split * non-uniform split Differentiating between these types using a simple boolean variable is not obvious and can harm code readability. This commit introduces enum split_type to explicitly define these two types. 
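For illustration, this is roughly how a caller reads before and after the
change (condensed from the diff below; not a verbatim copy of the final code):

	/* Before: a bare boolean obscures which kind of split is requested. */
	return __folio_split(folio, new_order, &folio->page, page, list,
			     true, unmapped);

	/* After: the requested split type is explicit at the call site. */
	return __folio_split(folio, new_order, &folio->page, page, list,
			     SPLIT_TYPE_UNIFORM, unmapped);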
Replacing the existing boolean variable with this enumeration significantly improves code clarity and expressiveness when dealing with folio splitting logic. No functional change is expected. [akpm@linux-foundation.org: tweak layout, per David] Link: https://lkml.kernel.org/r/20251106034155.21398-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20251106034155.21398-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Zi Yan Cc: "David Hildenbrand (Red Hat)" Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Nico Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++++ mm/huge_memory.c | 30 +++++++++++++++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9f7f7d772fe5..b74708dc5b5f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -364,6 +364,11 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); +enum split_type { + SPLIT_TYPE_UNIFORM, + SPLIT_TYPE_NON_UNIFORM, +}; + bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order, bool unmapped); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3d87127c02cf..4118f330c55e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3504,16 +3504,16 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * will be split until its order becomes @new_order. * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller * @mapping: @folio->mapping - * @uniform_split: if the split is uniform or not (buddy allocator like split) + * @split_type: if the split is uniform or not (buddy allocator like split) * * * 1. uniform split: the given @folio into multiple @new_order small folios, * where all small folios have the same order. This is done when - * uniform_split is true. + * split_type is SPLIT_TYPE_UNIFORM. * 2. buddy allocator like (non-uniform) split: the given @folio is split into * half and one of the half (containing the given page) is split into half * until the given @folio's order becomes @new_order. This is done when - * uniform_split is false. + * split_type is SPLIT_TYPE_NON_UNIFORM. * * The high level flow for these two methods are: * @@ -3536,11 +3536,11 @@ static void __split_folio_to_order(struct folio *folio, int old_order, */ static int __split_unmapped_folio(struct folio *folio, int new_order, struct page *split_at, struct xa_state *xas, - struct address_space *mapping, bool uniform_split) + struct address_space *mapping, enum split_type split_type) { const bool is_anon = folio_test_anon(folio); int old_order = folio_order(folio); - int start_order = uniform_split ? new_order : old_order - 1; + int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - 1; int split_order; /* @@ -3562,7 +3562,7 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, * irq is disabled to allocate enough memory, whereas * non-uniform split can handle ENOMEM. 
*/ - if (uniform_split) + if (split_type == SPLIT_TYPE_UNIFORM) xas_split(xas, folio, old_order); else { xas_set_order(xas, folio->index, split_order); @@ -3659,7 +3659,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, * @split_at: a page within the new folio * @lock_at: a page within @folio to be left locked to caller * @list: after-split folios will be put on it if non NULL - * @uniform_split: perform uniform split or not (non-uniform split) + * @split_type: perform uniform split or not (non-uniform split) * @unmapped: The pages are already unmapped, they are migration entries. * * It calls __split_unmapped_folio() to perform uniform and non-uniform split. @@ -3676,7 +3676,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, */ static int __folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct page *lock_at, - struct list_head *list, bool uniform_split, bool unmapped) + struct list_head *list, enum split_type split_type, bool unmapped) { struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); @@ -3711,10 +3711,10 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (new_order >= old_order) return -EINVAL; - if (uniform_split && !uniform_split_supported(folio, new_order, true)) + if (split_type == SPLIT_TYPE_UNIFORM && !uniform_split_supported(folio, new_order, true)) return -EINVAL; - if (!uniform_split && + if (split_type == SPLIT_TYPE_NON_UNIFORM && !non_uniform_split_supported(folio, new_order, true)) return -EINVAL; @@ -3764,7 +3764,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto out; } - if (uniform_split) { + if (split_type == SPLIT_TYPE_UNIFORM) { xas_set_order(&xas, folio->index, new_order); xas_split_alloc(&xas, folio, old_order, gfp); if (xas_error(&xas)) { @@ -3869,7 +3869,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, lruvec = folio_lruvec_lock(folio); ret = __split_unmapped_folio(folio, new_order, split_at, &xas, - mapping, uniform_split); + mapping, split_type); /* * Unfreeze after-split folios and put them back to the right @@ -4045,8 +4045,8 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list { struct folio *folio = page_folio(page); - return __folio_split(folio, new_order, &folio->page, page, list, true, - unmapped); + return __folio_split(folio, new_order, &folio->page, page, list, + SPLIT_TYPE_UNIFORM, unmapped); } /** @@ -4077,7 +4077,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct list_head *list) { return __folio_split(folio, new_order, split_at, &folio->page, list, - false, false); + SPLIT_TYPE_NON_UNIFORM, false); } int min_order_for_split(struct folio *folio) From 8a0e4bdddd1c998b894d879a1d22f1e745606215 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Nov 2025 03:41:55 +0000 Subject: [PATCH 246/321] mm/huge_memory: merge uniform_split_supported() and non_uniform_split_supported() uniform_split_supported() and non_uniform_split_supported() share significantly similar logic. The only functional difference is that uniform_split_supported() includes an additional check on the requested @new_order. 
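A condensed sketch of the two pre-merge helpers follows (warnings and the
CONFIG_READ_ONLY_THP_FOR_FS gating omitted for brevity; the function names
here are illustrative only, the real helpers are shown in the diff below):

	static bool non_uniform_ok(struct folio *folio, unsigned int new_order)
	{
		if (folio_test_anon(folio) && new_order == 1)
			return false;	/* order-1 anon folios unsupported */
		if (!folio_test_anon(folio) &&
		    !mapping_large_folio_support(folio->mapping))
			return false;	/* checked regardless of @new_order */
		if (folio_test_swapcache(folio))
			return false;	/* checked regardless of @new_order */
		return true;
	}

	static bool uniform_ok(struct folio *folio, unsigned int new_order)
	{
		if (folio_test_anon(folio) && new_order == 1)
			return false;
		if (new_order && !folio_test_anon(folio) &&
		    !mapping_large_folio_support(folio->mapping))
			return false;	/* only when not splitting to order-0 */
		if (new_order && folio_test_swapcache(folio))
			return false;	/* only when not splitting to order-0 */
		return true;
	}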
The reason for this check comes from the following two aspects: * some file system or swap cache just supports order-0 folio * the behavioral difference between uniform/non-uniform split The behavioral difference between uniform split and non-uniform: * uniform split splits folio directly to @new_order * non-uniform split creates after-split folios with orders from folio_order(folio) - 1 to new_order. This means for non-uniform split or !new_order split we should check the file system and swap cache respectively. This commit unifies the logic and merge the two functions into a single combined helper, removing redundant code and simplifying the split support checking mechanism. Link: https://lkml.kernel.org/r/20251106034155.21398-3-richard.weiyang@gmail.com Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Signed-off-by: Wei Yang Reviewed-by: Zi Yan Cc: Zi Yan Cc: "David Hildenbrand (Red Hat)" Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Nico Pache Cc: Ryan Roberts Cc: Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 ++--- mm/huge_memory.c | 71 +++++++++++++++++------------------------ 2 files changed, 33 insertions(+), 46 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b74708dc5b5f..19d4a5f52ca2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -374,10 +374,8 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list unsigned int new_order, bool unmapped); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); -bool uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns); -bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns); +bool folio_split_supported(struct folio *folio, unsigned int new_order, + enum split_type split_type, bool warns); int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); @@ -408,7 +406,7 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) { - if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) + if (!folio_split_supported(folio, new_order, SPLIT_TYPE_NON_UNIFORM, /* warns= */ false)) return split_huge_page_to_order(&folio->page, new_order); return folio_split(folio, new_order, page, NULL); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4118f330c55e..d79a4bb363de 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3593,8 +3593,8 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, return 0; } -bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns) +bool folio_split_supported(struct folio *folio, unsigned int new_order, + enum split_type split_type, bool warns) { if (folio_test_anon(folio)) { /* order-1 is not supported for anonymous THP. */ @@ -3602,48 +3602,41 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, "Cannot split to order-1 folio"); if (new_order == 1) return false; - } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && - !mapping_large_folio_support(folio->mapping)) { - /* - * No split if the file system does not support large folio. - * Note that we might still have THPs in such mappings due to - * CONFIG_READ_ONLY_THP_FOR_FS. 
But in that case, the mapping - * does not actually support large folios properly. - */ - VM_WARN_ONCE(warns, - "Cannot split file folio to non-0 order"); - return false; - } - - /* Only swapping a whole PMD-mapped folio is supported */ - if (folio_test_swapcache(folio)) { - VM_WARN_ONCE(warns, - "Cannot split swapcache folio to non-0 order"); - return false; - } - - return true; -} - -/* See comments in non_uniform_split_supported() */ -bool uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns) -{ - if (folio_test_anon(folio)) { - VM_WARN_ONCE(warns && new_order == 1, - "Cannot split to order-1 folio"); - if (new_order == 1) - return false; - } else if (new_order) { + } else if (split_type == SPLIT_TYPE_NON_UNIFORM || new_order) { if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !mapping_large_folio_support(folio->mapping)) { + /* + * We can always split a folio down to a single page + * (new_order == 0) uniformly. + * + * For any other scenario + * a) uniform split targeting a large folio + * (new_order > 0) + * b) any non-uniform split + * we must confirm that the file system supports large + * folios. + * + * Note that we might still have THPs in such + * mappings, which is created from khugepaged when + * CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that + * case, the mapping does not actually support large + * folios properly. + */ VM_WARN_ONCE(warns, "Cannot split file folio to non-0 order"); return false; } } - if (new_order && folio_test_swapcache(folio)) { + /* + * swapcache folio could only be split to order 0 + * + * non-uniform split creates after-split folios with orders from + * folio_order(folio) - 1 to new_order, making it not suitable for any + * swapcache folio split. Only uniform split to order-0 can be used + * here. + */ + if ((split_type == SPLIT_TYPE_NON_UNIFORM || new_order) && folio_test_swapcache(folio)) { VM_WARN_ONCE(warns, "Cannot split swapcache folio to non-0 order"); return false; @@ -3711,11 +3704,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (new_order >= old_order) return -EINVAL; - if (split_type == SPLIT_TYPE_UNIFORM && !uniform_split_supported(folio, new_order, true)) - return -EINVAL; - - if (split_type == SPLIT_TYPE_NON_UNIFORM && - !non_uniform_split_supported(folio, new_order, true)) + if (!folio_split_supported(folio, new_order, split_type, /* warn = */ true)) return -EINVAL; is_hzp = is_huge_zero_folio(folio); From c093cf451094a9a03c4d4929bc30122a53038b7b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:19 +0000 Subject: [PATCH 247/321] mm: correctly handle UFFD PTE markers Patch series "mm: remove is_swap_[pte, pmd]() + non-swap entries, introduce leaf entries", v3. There's an established convention in the kernel that we treat leaf page tables (so far at the PTE, PMD level) as containing 'swap entries' should they be neither empty (i.e. p**_none() evaluating true) nor present (i.e. p**_present() evaluating true). However, at the same time we also have helper predicates - is_swap_pte(), is_swap_pmd() - which are inconsistently used. This is problematic, as it is logical to assume that should somebody wish to operate upon a page table swap entry they should first check to see if it is in fact one. It also implies that perhaps, in future, we might introduce a non-present, none page table entry that is not a swap entry. 
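To make the convention concrete: a leaf page table entry falls into exactly one
of three buckets, and the "is this a swap entry?" predicate is simply the
negation of the other two (a minimal sketch of the convention; the helper name
below is illustrative, not new kernel API):

	static inline bool leaf_pte_is_swap_entry(pte_t pte)
	{
		/*
		 * Neither none nor present: by convention this is a 'swap'
		 * (i.e. software-defined) entry of some kind.
		 */
		return !pte_none(pte) && !pte_present(pte);
	}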
This series resolves this issue by systematically eliminating all use of the
is_swap_pte() and is_swap_pmd() predicates so we retain only the convention
that should a leaf page table entry be neither none nor present, it is a swap
entry.

We also have the further issue that 'swap entry' is unfortunately a really
rather overloaded term and in fact refers to both entries for swap and for
other information such as migration entries, page table markers, and device
private entries. We therefore have the rather 'unique' concept of a 'non-swap'
swap entry.

This series therefore introduces the concept of 'software leaf entries', of
type softleaf_t, to eliminate this confusion.

A software leaf entry in this sense is any page table entry which is
non-present, and represented by the softleaf_t type. That is - page table leaf
entries which are software-controlled by the kernel. This includes 'none' or
empty entries, which are simply represented by a zero leaf entry value.

In order to maintain compatibility as we transition the kernel to this new
type, we simply typedef swp_entry_t to softleaf_t.

We introduce a number of predicates and helpers to interact with software leaf
entries in include/linux/leafops.h which, as it imports swapops.h, can be
treated as a drop-in replacement for swapops.h wherever leaf entry helpers are
used.

Since softleaf_from_[pte, pmd]() treats present entries as if they were
empty/none leaf entries, this allows for a great deal of simplification of
code throughout the code base, which this series utilises a great deal.

We additionally change from swap entry to software leaf entry handling where
it makes sense to, and eliminate functions from swapops.h where software leaf
entries obviate the need for the functions.

This patch (of 16):

PTE markers were previously only concerned with UFFD-specific logic - that is,
PTE entries with the UFFD WP marker set or those marked via UFFDIO_POISON.
However, since the introduction of guard markers in commit 7c53dfbdb024 ("mm:
add PTE_MARKER_GUARD PTE marker"), this is no longer the case.

Issues have been avoided as guard regions are not permitted in conjunction
with UFFD, but it still leaves very confusing logic in place, most notably the
misleading and poorly named pte_none_mostly() and huge_pte_none_mostly().
[akpm@linux-foundation.org: fix comment typo, per Mike] Link: https://lkml.kernel.org/r/cover.1762812360.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/c38625fd9a1c1f1cf64ae8a248858e45b3dcdf11.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 111 ++++++++++++++++++++-------------- include/asm-generic/hugetlb.h | 8 --- include/linux/swapops.h | 18 ------ include/linux/userfaultfd_k.h | 21 +++++++ mm/hmm.c | 7 ++- mm/hugetlb.c | 49 +++++++-------- mm/mincore.c | 17 +++++- mm/userfaultfd.c | 27 ++++++--- 8 files changed, 148 insertions(+), 110 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 54c6cc7fe9c6..94c4d68f0818 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -233,40 +233,48 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, { struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; - bool ret = true; assert_fault_locked(vmf); ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) - goto out; + return true; - ret = false; pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. PTE markers should be handled the same as none - * ptes here. + * changes under us. + */ + + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (huge_pte_none(pte)) + return true; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (is_uffd_pte_marker(pte)) + return true; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. */ - if (huge_pte_none_mostly(pte)) - ret = true; if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) - ret = true; -out: - return ret; + return true; + + return false; } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, struct vm_fault *vmf, unsigned long reason) { - return false; /* should never get here */ + /* Should never get here. 
*/ + VM_WARN_ON_ONCE(1); + return false; } #endif /* CONFIG_HUGETLB_PAGE */ /* - * Verify the pagetables are still not ok after having reigstered into + * Verify the pagetables are still not ok after having registered into * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any * userfault that has already been resolved, if userfaultfd_read_iter and * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different @@ -284,53 +292,63 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pmd_t *pmd, _pmd; pte_t *pte; pte_t ptent; - bool ret = true; + bool ret; assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - goto out; + return true; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) - goto out; + return true; pud = pud_offset(p4d, address); if (!pud_present(*pud)) - goto out; + return true; pmd = pmd_offset(pud, address); again: _pmd = pmdp_get_lockless(pmd); if (pmd_none(_pmd)) + return true; + + /* + * A race could arise which would result in a softleaf entry such as + * migration entry unexpectedly being present in the PMD, so explicitly + * check for this and bail out if so. + */ + if (!pmd_present(_pmd)) + return false; + + if (pmd_trans_huge(_pmd)) + return !pmd_write(_pmd) && (reason & VM_UFFD_WP); + + pte = pte_offset_map(pmd, address); + if (!pte) + goto again; + + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. + */ + ptent = ptep_get(pte); + + ret = true; + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (pte_none(ptent)) + goto out; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (is_uffd_pte_marker(ptent)) + goto out; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. + */ + if (!pte_write(ptent) && (reason & VM_UFFD_WP)) goto out; ret = false; - if (!pmd_present(_pmd)) - goto out; - - if (pmd_trans_huge(_pmd)) { - if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) - ret = true; - goto out; - } - - pte = pte_offset_map(pmd, address); - if (!pte) { - ret = true; - goto again; - } - /* - * Lockless access: we're in a wait_event so it's ok if it - * changes under us. PTE markers should be handled the same as none - * ptes here. 
- */ - ptent = ptep_get(pte); - if (pte_none_mostly(ptent)) - ret = true; - if (!pte_write(ptent) && (reason & VM_UFFD_WP)) - ret = true; - pte_unmap(pte); - out: + pte_unmap(pte); return ret; } @@ -490,12 +508,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); spin_unlock_irq(&ctx->fault_pending_wqh.lock); - if (!is_vm_hugetlb_page(vma)) - must_wait = userfaultfd_must_wait(ctx, vmf, reason); - else + if (is_vm_hugetlb_page(vma)) { must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); - if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); + } else { + must_wait = userfaultfd_must_wait(ctx, vmf, reason); + } + release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index dcb8727f2b82..e1a2e1b7c8e7 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -97,14 +97,6 @@ static inline int huge_pte_none(pte_t pte) } #endif -/* Please refer to comments above pte_none_mostly() for the usage */ -#ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY -static inline int huge_pte_none_mostly(pte_t pte) -{ - return huge_pte_none(pte) || is_pte_marker(pte); -} -#endif - #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 2687928a8146..d1f665935cfc 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -469,24 +469,6 @@ static inline int is_guard_swp_entry(swp_entry_t entry) (pte_marker_get(entry) & PTE_MARKER_GUARD); } -/* - * This is a special version to check pte_none() just to cover the case when - * the pte is a pte marker. It existed because in many cases the pte marker - * should be seen as a none pte; it's just that we have stored some information - * onto the none pte so it becomes not-none any more. - * - * It should be used when the pte is file-backed, ram-based and backing - * userspace pages, like shmem. It is not needed upon pgtables that do not - * support pte markers at all. For example, it's not needed on anonymous - * memory, kernel-only memory (including when the system is during-boot), - * non-ram based generic file-system. It's fine to be used even there, but the - * extra pte marker check will be pure overhead. - */ -static inline int pte_none_mostly(pte_t pte) -{ - return pte_none(pte) || is_pte_marker(pte); -} - static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c0e716aec26a..da0b4fcc566f 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -479,4 +479,25 @@ static inline bool pte_swp_uffd_wp_any(pte_t pte) return false; } + +static inline bool is_uffd_pte_marker(pte_t pte) +{ + swp_entry_t entry; + + if (pte_present(pte)) + return false; + + entry = pte_to_swp_entry(pte); + if (!is_pte_marker_entry(entry)) + return false; + + /* UFFD WP, poisoned swap entries are UFFD handled. 
*/ + if (pte_marker_entry_uffd_wp(entry)) + return true; + if (is_poisoned_swp_entry(entry)) + return true; + + return false; +} + #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/mm/hmm.c b/mm/hmm.c index a56081d67ad6..387a38bbaf6a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -244,7 +244,12 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, uint64_t pfn_req_flags = *hmm_pfn; uint64_t new_pfn_flags = 0; - if (pte_none_mostly(pte)) { + /* + * Any other marker than a UFFD WP marker will result in a fault error + * that will be correctly handled, so we need only check for UFFD WP + * here. + */ + if (pte_none(pte) || pte_marker_uffd_wp(pte)) { required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 106e61f6e12c..96c991f54f7a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6037,29 +6037,28 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte); - if (huge_pte_none_mostly(vmf.orig_pte)) { - if (is_pte_marker(vmf.orig_pte)) { - pte_marker marker = - pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); - - if (marker & PTE_MARKER_POISONED) { - ret = VM_FAULT_HWPOISON_LARGE | - VM_FAULT_SET_HINDEX(hstate_index(h)); - goto out_mutex; - } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { - /* This isn't supported in hugetlb. */ - ret = VM_FAULT_SIGSEGV; - goto out_mutex; - } - } - + if (huge_pte_none(vmf.orig_pte)) /* - * Other PTE markers should be handled the same way as none PTE. - * * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ return hugetlb_no_page(mapping, &vmf); + + if (is_pte_marker(vmf.orig_pte)) { + const pte_marker marker = + pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); + + if (marker & PTE_MARKER_POISONED) { + ret = VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); + goto out_mutex; + } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { + /* This isn't supported in hugetlb. */ + ret = VM_FAULT_SIGSEGV; + goto out_mutex; + } + + return hugetlb_no_page(mapping, &vmf); } ret = 0; @@ -6228,6 +6227,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, int ret = -ENOMEM; struct folio *folio; bool folio_in_pagecache = false; + pte_t dst_ptep; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { ptl = huge_pte_lock(h, dst_mm, dst_pte); @@ -6367,13 +6367,14 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, if (folio_test_hwpoison(folio)) goto out_release_unlock; - /* - * We allow to overwrite a pte marker: consider when both MISSING|WP - * registered, we firstly wr-protect a none pte which has no page cache - * page backing it, then access the page. - */ ret = -EEXIST; - if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) + + dst_ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); + /* + * See comment about UFFD marker overwriting in + * mfill_atomic_install_pte(). + */ + if (!huge_pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep)) goto out_release_unlock; if (folio_in_pagecache) diff --git a/mm/mincore.c b/mm/mincore.c index 8ec4719370e1..fb80becd6119 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -32,11 +32,22 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, spinlock_t *ptl; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + /* * Hugepages under user process are always in RAM and never * swapped out, but theoretically it needs to be checked. 
*/ - present = pte && !huge_pte_none_mostly(huge_ptep_get(walk->mm, addr, pte)); + if (!pte) { + present = 0; + } else { + const pte_t ptep = huge_ptep_get(walk->mm, addr, pte); + + if (huge_pte_none(ptep) || is_pte_marker(ptep)) + present = 0; + else + present = 1; + } + for (; addr != end; vec++, addr += PAGE_SIZE) *vec = present; walk->private = vec; @@ -175,8 +186,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t pte = ptep_get(ptep); step = 1; - /* We need to do cache lookup too for pte markers */ - if (pte_none_mostly(pte)) + /* We need to do cache lookup too for markers */ + if (pte_none(pte) || is_pte_marker(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); else if (pte_present(pte)) { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 00122f42718c..cc4ce205bbec 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -178,6 +178,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, spinlock_t *ptl; struct folio *folio = page_folio(page); bool page_in_cache = folio_mapping(folio); + pte_t dst_ptep; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); _dst_pte = pte_mkdirty(_dst_pte); @@ -199,12 +200,15 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, } ret = -EEXIST; + + dst_ptep = ptep_get(dst_pte); + /* - * We allow to overwrite a pte marker: consider when both MISSING|WP - * registered, we firstly wr-protect a none pte which has no page cache - * page backing it, then access the page. + * We are allowed to overwrite a UFFD pte marker: consider when both + * MISSING|WP registered, we firstly wr-protect a none pte which has no + * page cache page backing it, then access the page. */ - if (!pte_none_mostly(ptep_get(dst_pte))) + if (!pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep)) goto out_unlock; if (page_in_cache) { @@ -583,12 +587,15 @@ retry: goto out_unlock; } - if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && - !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { - err = -EEXIST; - hugetlb_vma_unlock_read(dst_vma); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - goto out_unlock; + if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { + const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); + + if (!huge_pte_none(ptep) && !is_uffd_pte_marker(ptep)) { + err = -EEXIST; + hugetlb_vma_unlock_read(dst_vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } } err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, From 68aa2fdbf57f769e552f472ddb762aba028a207e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:20 +0000 Subject: [PATCH 248/321] mm: introduce leaf entry type and use to simplify leaf entry logic The kernel maintains leaf page table entries which contain either: The kernel maintains leaf page table entries which contain either: - Nothing ('none' entries) - Present entries* - Everything else that will cause a fault which the kernel handles * Present entries are either entries the hardware can navigate without page fault or special cases like NUMA hint protnone or PMD with cleared present bit which contain hardware-valid entries modulo the present bit. In the 'everything else' group we include swap entries, but we also include a number of other things such as migration entries, device private entries and marker entries. Unfortunately this 'everything else' group expresses everything through a swp_entry_t type, and these entries are referred to swap entries even though they may well not contain a... swap entry. 
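The pattern that results today looks roughly like this (a condensed sketch of
the existing idiom, not a verbatim copy of any one call site):

	swp_entry_t entry = pte_to_swp_entry(pte); /* may not be swap at all */

	if (!non_swap_entry(entry)) {
		/* genuinely a swap entry */
	} else if (is_migration_entry(entry)) {
		/* a 'non-swap swap entry': migration */
	} else if (is_pte_marker_entry(entry)) {
		/* a 'non-swap swap entry': marker */
	}

With the softleaf_t type introduced below, such code can instead convert once
via softleaf_from_pte() and use the softleaf_is_*() predicates.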
This is compounded by the rather mind-boggling concept of a non-swap swap entry (checked via non_swap_entry()) and the means by which we twist and turn to satisfy this. This patch lays the foundation for reducing this confusion. We refer to 'everything else' as a 'software-define leaf entry' or 'softleaf'. for short And in fact we scoop up the 'none' entries into this concept also so we are left with: - Present entries. - Softleaf entries (which may be empty). This allows for radical simplification across the board - one can simply convert any leaf page table entry to a leaf entry via softleaf_from_pte(). If the entry is present, we return an empty leaf entry, so it is assumed the caller is aware that they must differentiate between the two categories of page table entries, checking for the former via pte_present(). As a result, we can eliminate a number of places where we would otherwise need to use predicates to see if we can proceed with leaf page table entry conversion and instead just go ahead and do it unconditionally. We do so where we can, adjusting surrounding logic as necessary to integrate the new softleaf_t logic as far as seems reasonable at this stage. We typedef swp_entry_t to softleaf_t for the time being until the conversion can be complete, meaning everything remains compatible regardless of which type is used. We will eventually remove swp_entry_t when the conversion is complete. We introduce a new header file to keep things clear - leafops.h - this imports swapops.h so can direct replace swapops imports without issue, and we do so in all the files that require it. Additionally, add new leafops.h file to core mm maintainers entry. Link: https://lkml.kernel.org/r/c879383aac77d96a03e4d38f7daba893cd35fc76.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + fs/proc/task_mmu.c | 26 +-- fs/userfaultfd.c | 6 +- include/linux/leafops.h | 387 ++++++++++++++++++++++++++++++++++ include/linux/mm_inline.h | 6 +- include/linux/mm_types.h | 25 +++ include/linux/swapops.h | 28 --- include/linux/userfaultfd_k.h | 51 +---- mm/hmm.c | 2 +- mm/hugetlb.c | 37 ++-- mm/madvise.c | 16 +- mm/memory.c | 41 ++-- mm/mincore.c | 6 +- mm/mprotect.c | 6 +- mm/mremap.c | 4 +- mm/page_vma_mapped.c | 11 +- mm/shmem.c | 7 +- mm/userfaultfd.c | 6 +- 18 files changed, 502 insertions(+), 164 deletions(-) create mode 100644 include/linux/leafops.h diff --git a/MAINTAINERS b/MAINTAINERS index 6168d3aebdc1..5ca4caf73021 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16263,6 +16263,7 @@ T: 
git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: include/linux/gfp.h F: include/linux/gfp_types.h F: include/linux/highmem.h +F: include/linux/leafops.h F: include/linux/memory.h F: include/linux/mm.h F: include/linux/mm_*.h diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index db16ed91c269..5a1e897b0973 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -1231,11 +1231,11 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, if (pte_present(ptent)) { folio = page_folio(pte_page(ptent)); present = true; - } else if (is_swap_pte(ptent)) { - swp_entry_t swpent = pte_to_swp_entry(ptent); + } else { + const softleaf_t entry = softleaf_from_pte(ptent); - if (is_pfn_swap_entry(swpent)) - folio = pfn_swap_entry_folio(swpent); + if (softleaf_has_pfn(entry)) + folio = softleaf_to_folio(entry); } if (folio) { @@ -1956,9 +1956,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_SWAP; if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); - if (pte_marker_entry_uffd_wp(entry)) + if (softleaf_is_uffd_wp_marker(entry)) flags |= PM_UFFD_WP; - if (is_guard_swp_entry(entry)) + if (softleaf_is_guard_marker(entry)) flags |= PM_GUARD_REGION; } @@ -2331,18 +2331,18 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { - swp_entry_t swp; + softleaf_t entry; categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; - swp = pte_to_swp_entry(pte); - if (is_guard_swp_entry(swp)) + entry = softleaf_from_pte(pte); + if (softleaf_is_guard_marker(entry)) categories |= PAGE_IS_GUARD; else if ((p->masks_of_interest & PAGE_IS_FILE) && - is_pfn_swap_entry(swp) && - !folio_test_anon(pfn_swap_entry_folio(swp))) + softleaf_has_pfn(entry) && + !folio_test_anon(softleaf_to_folio(entry))) categories |= PAGE_IS_FILE; if (pte_swp_soft_dirty(pte)) @@ -2467,7 +2467,7 @@ static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, { unsigned long psize; - if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) + if (is_hugetlb_entry_hwpoisoned(ptent) || pte_is_marker(ptent)) return; psize = huge_page_size(hstate_vma(vma)); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 94c4d68f0818..3f539aabc3b3 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include @@ -251,7 +251,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, if (huge_pte_none(pte)) return true; /* UFFD PTE markers require userspace to resolve the fault. */ - if (is_uffd_pte_marker(pte)) + if (pte_is_uffd_marker(pte)) return true; /* * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to @@ -337,7 +337,7 @@ again: if (pte_none(ptent)) goto out; /* UFFD PTE markers require userspace to resolve the fault. */ - if (is_uffd_pte_marker(ptent)) + if (pte_is_uffd_marker(ptent)) goto out; /* * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to diff --git a/include/linux/leafops.h b/include/linux/leafops.h new file mode 100644 index 000000000000..cff9d94fd5d1 --- /dev/null +++ b/include/linux/leafops.h @@ -0,0 +1,387 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Describes operations that can be performed on software-defined page table + * leaf entries. 
These are abstracted from the hardware page table entries + * themselves by the softleaf_t type, see mm_types.h. + */ +#ifndef _LINUX_LEAFOPS_H +#define _LINUX_LEAFOPS_H + +#include +#include +#include + +#ifdef CONFIG_MMU + +/* Temporary until swp_entry_t eliminated. */ +#define LEAF_TYPE_SHIFT SWP_TYPE_SHIFT + +enum softleaf_type { + /* Fundamental types. */ + SOFTLEAF_NONE, + SOFTLEAF_SWAP, + /* Migration types. */ + SOFTLEAF_MIGRATION_READ, + SOFTLEAF_MIGRATION_READ_EXCLUSIVE, + SOFTLEAF_MIGRATION_WRITE, + /* Device types. */ + SOFTLEAF_DEVICE_PRIVATE_READ, + SOFTLEAF_DEVICE_PRIVATE_WRITE, + SOFTLEAF_DEVICE_EXCLUSIVE, + /* H/W posion types. */ + SOFTLEAF_HWPOISON, + /* Marker types. */ + SOFTLEAF_MARKER, +}; + +/** + * softleaf_mk_none() - Create an empty ('none') leaf entry. + * Returns: empty leaf entry. + */ +static inline softleaf_t softleaf_mk_none(void) +{ + return ((softleaf_t) { 0 }); +} + +/** + * softleaf_from_pte() - Obtain a leaf entry from a PTE entry. + * @pte: PTE entry. + * + * If @pte is present (therefore not a leaf entry) the function returns an empty + * leaf entry. Otherwise, it returns a leaf entry. + * + * Returns: Leaf entry. + */ +static inline softleaf_t softleaf_from_pte(pte_t pte) +{ + if (pte_present(pte) || pte_none(pte)) + return softleaf_mk_none(); + + /* Temporary until swp_entry_t eliminated. */ + return pte_to_swp_entry(pte); +} + +/** + * softleaf_is_none() - Is the leaf entry empty? + * @entry: Leaf entry. + * + * Empty entries are typically the result of a 'none' page table leaf entry + * being converted to a leaf entry. + * + * Returns: true if the entry is empty, false otherwise. + */ +static inline bool softleaf_is_none(softleaf_t entry) +{ + return entry.val == 0; +} + +/** + * softleaf_type() - Identify the type of leaf entry. + * @enntry: Leaf entry. + * + * Returns: the leaf entry type associated with @entry. + */ +static inline enum softleaf_type softleaf_type(softleaf_t entry) +{ + unsigned int type_num; + + if (softleaf_is_none(entry)) + return SOFTLEAF_NONE; + + type_num = entry.val >> LEAF_TYPE_SHIFT; + + if (type_num < MAX_SWAPFILES) + return SOFTLEAF_SWAP; + + switch (type_num) { +#ifdef CONFIG_MIGRATION + case SWP_MIGRATION_READ: + return SOFTLEAF_MIGRATION_READ; + case SWP_MIGRATION_READ_EXCLUSIVE: + return SOFTLEAF_MIGRATION_READ_EXCLUSIVE; + case SWP_MIGRATION_WRITE: + return SOFTLEAF_MIGRATION_WRITE; +#endif +#ifdef CONFIG_DEVICE_PRIVATE + case SWP_DEVICE_WRITE: + return SOFTLEAF_DEVICE_PRIVATE_WRITE; + case SWP_DEVICE_READ: + return SOFTLEAF_DEVICE_PRIVATE_READ; + case SWP_DEVICE_EXCLUSIVE: + return SOFTLEAF_DEVICE_EXCLUSIVE; +#endif +#ifdef CONFIG_MEMORY_FAILURE + case SWP_HWPOISON: + return SOFTLEAF_HWPOISON; +#endif + case SWP_PTE_MARKER: + return SOFTLEAF_MARKER; + } + + /* Unknown entry type. */ + VM_WARN_ON_ONCE(1); + return SOFTLEAF_NONE; +} + +/** + * softleaf_is_swap() - Is this leaf entry a swap entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a swap entry, otherwise false. + */ +static inline bool softleaf_is_swap(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_SWAP; +} + +/** + * softleaf_is_migration() - Is this leaf entry a migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a migration entry, otherwise false. 
+ */ +static inline bool softleaf_is_migration(softleaf_t entry) +{ + switch (softleaf_type(entry)) { + case SOFTLEAF_MIGRATION_READ: + case SOFTLEAF_MIGRATION_READ_EXCLUSIVE: + case SOFTLEAF_MIGRATION_WRITE: + return true; + default: + return false; + } +} + +/** + * softleaf_is_device_private() - Is this leaf entry a device private entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device private entry, otherwise false. + */ +static inline bool softleaf_is_device_private(softleaf_t entry) +{ + switch (softleaf_type(entry)) { + case SOFTLEAF_DEVICE_PRIVATE_WRITE: + case SOFTLEAF_DEVICE_PRIVATE_READ: + return true; + default: + return false; + } +} + +/** + * softleaf_is_device_exclusive() - Is this leaf entry a device exclusive entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device exclusive entry, otherwise false. + */ +static inline bool softleaf_is_device_exclusive(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_DEVICE_EXCLUSIVE; +} + +/** + * softleaf_is_hwpoison() - Is this leaf entry a hardware poison entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a hardware poison entry, otherwise false. + */ +static inline bool softleaf_is_hwpoison(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_HWPOISON; +} + +/** + * softleaf_is_marker() - Is this leaf entry a marker? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a marker entry, otherwise false. + */ +static inline bool softleaf_is_marker(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MARKER; +} + +/** + * softleaf_to_marker() - Obtain marker associated with leaf entry. + * @entry: Leaf entry, softleaf_is_marker(@entry) must return true. + * + * Returns: Marker associated with the leaf entry. + */ +static inline pte_marker softleaf_to_marker(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_marker(entry)); + + return swp_offset(entry) & PTE_MARKER_MASK; +} + +/** + * softleaf_has_pfn() - Does this leaf entry encode a valid PFN number? + * @entry: Leaf entry. + * + * A pfn swap entry is a special type of swap entry that always has a pfn stored + * in the swap offset. They can either be used to represent unaddressable device + * memory, to restrict access to a page undergoing migration or to represent a + * pfn which has been hwpoisoned and unmapped. + * + * Returns: true if the leaf entry encodes a PFN, otherwise false. + */ +static inline bool softleaf_has_pfn(softleaf_t entry) +{ + /* Make sure the swp offset can always store the needed fields. */ + BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); + + if (softleaf_is_migration(entry)) + return true; + if (softleaf_is_device_private(entry)) + return true; + if (softleaf_is_device_exclusive(entry)) + return true; + if (softleaf_is_hwpoison(entry)) + return true; + + return false; +} + +/** + * softleaf_to_pfn() - Obtain PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: The PFN associated with the leaf entry. + */ +static inline unsigned long softleaf_to_pfn(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return swp_offset_pfn(entry); +} + +/** + * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: Pointer to the struct page associated with the leaf entry's PFN. 
+ */ +static inline struct page *softleaf_to_page(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return pfn_swap_entry_to_page(entry); +} + +/** + * softleaf_to_folio() - Obtains struct folio for PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: Pointer to the struct folio associated with the leaf entry's PFN. + */ +static inline struct folio *softleaf_to_folio(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return pfn_swap_entry_folio(entry); +} + +/** + * softleaf_is_poison_marker() - Is this leaf entry a poison marker? + * @entry: Leaf entry. + * + * The poison marker is set via UFFDIO_POISON. Userfaultfd-specific. + * + * Returns: true if the leaf entry is a poison marker, otherwise false. + */ +static inline bool softleaf_is_poison_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_POISONED; +} + +/** + * softleaf_is_guard_marker() - Is this leaf entry a guard region marker? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a guard marker, otherwise false. + */ +static inline bool softleaf_is_guard_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_GUARD; +} + +/** + * softleaf_is_uffd_wp_marker() - Is this leaf entry a userfautlfd write protect + * marker? + * @entry: Leaf entry. + * + * Userfaultfd-specific. + * + * Returns: true if the leaf entry is a UFFD WP marker, otherwise false. + */ +static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP; +} + +/** + * pte_is_marker() - Does the PTE entry encode a marker leaf entry? + * @pte: PTE entry. + * + * Returns: true if this PTE is a marker leaf entry, otherwise false. + */ +static inline bool pte_is_marker(pte_t pte) +{ + return softleaf_is_marker(softleaf_from_pte(pte)); +} + +/** + * pte_is_uffd_wp_marker() - Does this PTE entry encode a userfaultfd write + * protect marker leaf entry? + * @pte: PTE entry. + * + * Returns: true if this PTE is a UFFD WP marker leaf entry, otherwise false. + */ +static inline bool pte_is_uffd_wp_marker(pte_t pte) +{ + const softleaf_t entry = softleaf_from_pte(pte); + + return softleaf_is_uffd_wp_marker(entry); +} + +/** + * pte_is_uffd_marker() - Does this PTE entry encode a userfault-specific marker + * leaf entry? + * @entry: Leaf entry. + * + * It's useful to be able to determine which leaf entries encode UFFD-specific + * markers so we can handle these correctly. + * + * Returns: true if this PTE entry is a UFFD-specific marker, otherwise false. + */ +static inline bool pte_is_uffd_marker(pte_t pte) +{ + const softleaf_t entry = softleaf_from_pte(pte); + + if (!softleaf_is_marker(entry)) + return false; + + /* UFFD WP, poisoned swap entries are UFFD-handled. 
*/ + if (softleaf_is_uffd_wp_marker(entry)) + return true; + if (softleaf_is_poison_marker(entry)) + return true; + + return false; +} + +#endif /* CONFIG_MMU */ +#endif /* _LINUX_LEAFOPS_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index f6a2b2d20016..ca7a18351797 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -541,9 +541,9 @@ static inline bool mm_tlb_flush_nested(const struct mm_struct *mm) * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( - swp_entry_t entry, struct vm_area_struct *dst_vma) + softleaf_t entry, struct vm_area_struct *dst_vma) { - pte_marker srcm = pte_marker_get(entry); + const pte_marker srcm = softleaf_to_marker(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5021047485a9..4f66a3206a63 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -285,6 +285,31 @@ typedef struct { unsigned long val; } swp_entry_t; +/** + * typedef softleaf_t - Describes a page table software leaf entry, abstracted + * from its architecture-specific encoding. + * + * Page table leaf entries are those which do not reference any descendent page + * tables but rather either reference a data page, are an empty (or 'none' + * entry), or contain a non-present entry. + * + * If referencing another page table or a data page then the page table entry is + * pertinent to hardware - that is it tells the hardware how to decode the page + * table entry. + * + * Otherwise it is a software-defined leaf page table entry, which this type + * describes. See leafops.h and specifically @softleaf_type for a list of all + * possible kinds of software leaf entry. + * + * A softleaf_t entry is abstracted from the hardware page table entry, so is + * not architecture-specific. + * + * NOTE: While we transition from the confusing swp_entry_t type used for this + * purpose, we simply alias this type. This will be removed once the + * transition is complete. + */ +typedef swp_entry_t softleaf_t; + #if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT) /* We have some extra room after the refcount in tail pages. 
*/ #define NR_PAGES_IN_LARGE_FOLIO diff --git a/include/linux/swapops.h b/include/linux/swapops.h index d1f665935cfc..0a4b3f51ecf5 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -426,21 +426,6 @@ static inline swp_entry_t make_pte_marker_entry(pte_marker marker) return swp_entry(SWP_PTE_MARKER, marker); } -static inline bool is_pte_marker_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_PTE_MARKER; -} - -static inline pte_marker pte_marker_get(swp_entry_t entry) -{ - return swp_offset(entry) & PTE_MARKER_MASK; -} - -static inline bool is_pte_marker(pte_t pte) -{ - return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); -} - static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); @@ -451,24 +436,11 @@ static inline swp_entry_t make_poisoned_swp_entry(void) return make_pte_marker_entry(PTE_MARKER_POISONED); } -static inline int is_poisoned_swp_entry(swp_entry_t entry) -{ - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_POISONED); - -} - static inline swp_entry_t make_guard_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_GUARD); } -static inline int is_guard_swp_entry(swp_entry_t entry) -{ - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_GUARD); -} - static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index da0b4fcc566f..983c860a00f1 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include @@ -434,32 +434,6 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) return userfaultfd_wp_unpopulated(vma); } -static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) -{ -#ifdef CONFIG_PTE_MARKER_UFFD_WP - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_UFFD_WP); -#else - return false; -#endif -} - -static inline bool pte_marker_uffd_wp(pte_t pte) -{ -#ifdef CONFIG_PTE_MARKER_UFFD_WP - swp_entry_t entry; - - if (!is_swap_pte(pte)) - return false; - - entry = pte_to_swp_entry(pte); - - return pte_marker_entry_uffd_wp(entry); -#else - return false; -#endif -} - /* * Returns true if this is a swap pte and was uffd-wp wr-protected in either * forms (pte marker or a normal swap pte), false otherwise. @@ -473,31 +447,10 @@ static inline bool pte_swp_uffd_wp_any(pte_t pte) if (pte_swp_uffd_wp(pte)) return true; - if (pte_marker_uffd_wp(pte)) + if (pte_is_uffd_wp_marker(pte)) return true; #endif return false; } - -static inline bool is_uffd_pte_marker(pte_t pte) -{ - swp_entry_t entry; - - if (pte_present(pte)) - return false; - - entry = pte_to_swp_entry(pte); - if (!is_pte_marker_entry(entry)) - return false; - - /* UFFD WP, poisoned swap entries are UFFD handled. */ - if (pte_marker_entry_uffd_wp(entry)) - return true; - if (is_poisoned_swp_entry(entry)) - return true; - - return false; -} - #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/mm/hmm.c b/mm/hmm.c index 387a38bbaf6a..e350d0cc9d41 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -249,7 +249,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * that will be correctly handled, so we need only check for UFFD WP * here. 
*/ - if (pte_none(pte) || pte_marker_uffd_wp(pte)) { + if (pte_none(pte) || pte_is_uffd_wp_marker(pte)) { required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 96c991f54f7a..12853cdefc9b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -4956,17 +4956,17 @@ again: entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(is_hugetlb_entry_migration(entry))) { - swp_entry_t swp_entry = pte_to_swp_entry(entry); + softleaf_t softleaf = softleaf_from_pte(entry); bool uffd_wp = pte_swp_uffd_wp(entry); - if (!is_readable_migration_entry(swp_entry) && cow) { + if (!is_readable_migration_entry(softleaf) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. */ - swp_entry = make_readable_migration_entry( - swp_offset(swp_entry)); - entry = swp_entry_to_pte(swp_entry); + softleaf = make_readable_migration_entry( + swp_offset(softleaf)); + entry = swp_entry_to_pte(softleaf); if (userfaultfd_wp(src_vma) && uffd_wp) entry = pte_swp_mkuffd_wp(entry); set_huge_pte_at(src, addr, src_pte, entry, sz); @@ -4974,9 +4974,9 @@ again: if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); - } else if (unlikely(is_pte_marker(entry))) { - pte_marker marker = copy_pte_marker( - pte_to_swp_entry(entry), dst_vma); + } else if (unlikely(pte_is_marker(entry))) { + const softleaf_t softleaf = softleaf_from_pte(entry); + const pte_marker marker = copy_pte_marker(softleaf, dst_vma); if (marker) set_huge_pte_at(dst, addr, dst_pte, @@ -5092,7 +5092,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); - if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) + if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) huge_pte_clear(mm, new_addr, dst_pte, sz); else { if (need_clear_uffd_wp) { @@ -5911,7 +5911,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ - if (unlikely(pte_marker_uffd_wp(vmf->orig_pte))) + if (unlikely(pte_is_uffd_wp_marker(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); @@ -6044,9 +6044,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ return hugetlb_no_page(mapping, &vmf); - if (is_pte_marker(vmf.orig_pte)) { + if (pte_is_marker(vmf.orig_pte)) { const pte_marker marker = - pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); + softleaf_to_marker(softleaf_from_pte(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { ret = VM_FAULT_HWPOISON_LARGE | @@ -6374,7 +6374,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * See comment about UFFD marker overwriting in * mfill_atomic_install_pte(). */ - if (!huge_pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep)) + if (!huge_pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) goto out_release_unlock; if (folio_in_pagecache) @@ -6495,8 +6495,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { /* Nothing to do. 
*/ } else if (unlikely(is_hugetlb_entry_migration(pte))) { - swp_entry_t entry = pte_to_swp_entry(pte); - struct folio *folio = pfn_swap_entry_folio(entry); + softleaf_t entry = softleaf_from_pte(pte); + + struct folio *folio = softleaf_to_folio(entry); pte_t newpte = pte; if (is_writable_migration_entry(entry)) { @@ -6516,14 +6517,14 @@ long hugetlb_change_protection(struct vm_area_struct *vma, newpte = pte_swp_clear_uffd_wp(newpte); if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); - } else if (unlikely(is_pte_marker(pte))) { + } else if (unlikely(pte_is_marker(pte))) { /* * Do nothing on a poison marker; page is * corrupted, permissions do not apply. Here * pte_marker_uffd_wp()==true implies !poison * because they're mutual exclusive. */ - if (pte_marker_uffd_wp(pte) && uffd_wp_resolve) + if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else if (!huge_pte_none(pte)) { diff --git a/mm/madvise.c b/mm/madvise.c index 2a165e9beb5b..c8381b954235 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include @@ -690,17 +690,16 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, * (page allocation + zeroing). */ if (!pte_present(ptent)) { - swp_entry_t entry; + softleaf_t entry = softleaf_from_pte(ptent); - entry = pte_to_swp_entry(ptent); - if (!non_swap_entry(entry)) { + if (softleaf_is_swap(entry)) { max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; free_swap_and_cache_nr(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); - } else if (is_hwpoison_entry(entry) || - is_poisoned_swp_entry(entry)) { + } else if (softleaf_is_hwpoison(entry) || + softleaf_is_poison_marker(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; @@ -1071,8 +1070,9 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) static bool is_guard_pte_marker(pte_t ptent) { - return is_swap_pte(ptent) && - is_guard_swp_entry(pte_to_swp_entry(ptent)); + const softleaf_t entry = softleaf_from_pte(ptent); + + return softleaf_is_guard_marker(entry); } static int guard_install_pud_entry(pud_t *pud, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 732414852570..0caf8c5c8c68 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -60,7 +60,7 @@ #include #include #include -#include +#include #include #include #include @@ -109,7 +109,7 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; - return pte_marker_uffd_wp(vmf->orig_pte); + return pte_is_uffd_wp_marker(vmf->orig_pte); } /* @@ -927,10 +927,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, { vm_flags_t vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); + softleaf_t entry = softleaf_from_pte(orig_pte); pte_t pte = orig_pte; struct folio *folio; struct page *page; - swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) @@ -1016,7 +1016,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte)) return -EBUSY; return -ENOENT; - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { pte_marker marker = copy_pte_marker(entry, dst_vma); if (marker) @@ -1711,14 +1711,14 @@ 
static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *any_skipped) { - swp_entry_t entry; + softleaf_t entry; int nr = 1; *any_skipped = true; - entry = pte_to_swp_entry(ptent); - if (is_device_private_entry(entry) || - is_device_exclusive_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + entry = softleaf_from_pte(ptent); + if (softleaf_is_device_private(entry) || + softleaf_is_device_exclusive(entry)) { + struct page *page = softleaf_to_page(entry); struct folio *folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) @@ -1733,7 +1733,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, rss[mm_counter(folio)]--; folio_remove_rmap_pte(folio, page, vma); folio_put(folio); - } else if (!non_swap_entry(entry)) { + } else if (softleaf_is_swap(entry)) { /* Genuine swap entries, hence a private anon pages */ if (!should_zap_cows(details)) return 1; @@ -1741,20 +1741,20 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, nr = swap_pte_batch(pte, max_nr, ptent); rss[MM_SWAPENTS] -= nr; free_swap_and_cache_nr(entry, nr); - } else if (is_migration_entry(entry)) { - struct folio *folio = pfn_swap_entry_folio(entry); + } else if (softleaf_is_migration(entry)) { + struct folio *folio = softleaf_to_folio(entry); if (!should_zap_folio(details, folio)) return 1; rss[mm_counter(folio)]--; - } else if (pte_marker_entry_uffd_wp(entry)) { + } else if (softleaf_is_uffd_wp_marker(entry)) { /* * For anon: always drop the marker; for file: only * drop the marker if explicitly requested. */ if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) return 1; - } else if (is_guard_swp_entry(entry)) { + } else if (softleaf_is_guard_marker(entry)) { /* * Ordinary zapping should not remove guard PTE * markers. Only do so if we should remove PTE markers @@ -1762,7 +1762,8 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, */ if (!zap_drop_markers(details)) return 1; - } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { + } else if (softleaf_is_hwpoison(entry) || + softleaf_is_poison_marker(entry)) { if (!should_zap_cows(details)) return 1; } else { @@ -4380,7 +4381,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * * This should also cover the case where e.g. the pte changed * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED. - * So is_pte_marker() check is not enough to safely drop the pte. + * So pte_is_marker() check is not enough to safely drop the pte. */ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); @@ -4414,8 +4415,8 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) static vm_fault_t handle_pte_marker(struct vm_fault *vmf) { - swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); - unsigned long marker = pte_marker_get(entry); + const softleaf_t entry = softleaf_from_pte(vmf->orig_pte); + const pte_marker marker = softleaf_to_marker(entry); /* * PTE markers should never be empty. 
If anything weird happened, @@ -4432,7 +4433,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) if (marker & PTE_MARKER_GUARD) return VM_FAULT_SIGSEGV; - if (pte_marker_entry_uffd_wp(entry)) + if (softleaf_is_uffd_wp_marker(entry)) return pte_marker_handle_uffd_wp(vmf); /* This is an unknown pte marker */ @@ -4680,7 +4681,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); diff --git a/mm/mincore.c b/mm/mincore.c index fb80becd6119..b3682488a65d 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -42,7 +42,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, } else { const pte_t ptep = huge_ptep_get(walk->mm, addr, pte); - if (huge_pte_none(ptep) || is_pte_marker(ptep)) + if (huge_pte_none(ptep) || pte_is_marker(ptep)) present = 0; else present = 1; @@ -187,7 +187,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, step = 1; /* We need to do cache lookup too for markers */ - if (pte_none(pte) || is_pte_marker(pte)) + if (pte_none(pte) || pte_is_marker(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); else if (pte_present(pte)) { diff --git a/mm/mprotect.c b/mm/mprotect.c index db93d3bb1a5e..918a64cc6033 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -326,14 +326,14 @@ static long change_pte_range(struct mmu_gather *tlb, newpte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { /* * Ignore error swap entries unconditionally, * because any access should sigbus/sigsegv * anyway. 
*/ - if (is_poisoned_swp_entry(entry) || - is_guard_swp_entry(entry)) + if (softleaf_is_poison_marker(entry) || + softleaf_is_guard_marker(entry)) continue; /* * If this is uffd-wp pte marker and we'd like diff --git a/mm/mremap.c b/mm/mremap.c index 8ad06cf50783..7c21b2ad13f6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -288,7 +288,7 @@ static int move_ptes(struct pagetable_move_control *pmc, pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); - if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) + if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) pte_clear(mm, new_addr, new_ptep); else { if (need_clear_uffd_wp) { diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 137ce27ff68c..be20468fb5a9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include "internal.h" @@ -107,15 +107,12 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr) pte_t ptent = ptep_get(pvmw->pte); if (pvmw->flags & PVMW_MIGRATION) { - swp_entry_t entry; - if (!is_swap_pte(ptent)) - return false; - entry = pte_to_swp_entry(ptent); + const softleaf_t entry = softleaf_from_pte(ptent); - if (!is_migration_entry(entry)) + if (!softleaf_is_migration(entry)) return false; - pfn = swp_offset_pfn(entry); + pfn = softleaf_to_pfn(entry); } else if (is_swap_pte(ptent)) { swp_entry_t entry; diff --git a/mm/shmem.c b/mm/shmem.c index 6580f3cd24bb..395ca58ac4a5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -66,7 +66,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #include #include #include -#include +#include #include #include #include @@ -2286,7 +2286,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); - swp_entry_t swap, index_entry; + swp_entry_t swap; + softleaf_t index_entry; struct swap_info_struct *si; struct folio *folio = NULL; bool skip_swapcache = false; @@ -2298,7 +2299,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = index_entry; *foliop = NULL; - if (is_poisoned_swp_entry(index_entry)) + if (softleaf_is_poison_marker(index_entry)) return -EIO; si = get_swap_device(index_entry); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index cc4ce205bbec..055ec1050776 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -208,7 +208,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * MISSING|WP registered, we firstly wr-protect a none pte which has no * page cache page backing it, then access the page. 
*/ - if (!pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep)) + if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) goto out_unlock; if (page_in_cache) { @@ -590,7 +590,7 @@ retry: if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); - if (!huge_pte_none(ptep) && !is_uffd_pte_marker(ptep)) { + if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) { err = -EEXIST; hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]);
From fb888710e26a8a8a37dc0f8ed09a3c908c63eb71 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:21 +0000 Subject: [PATCH 249/321] mm: avoid unnecessary uses of is_swap_pte()
There's an established convention in the kernel that we treat PTEs as containing swap entries (and the unfortunately named non-swap swap entries) should they be neither empty (i.e. pte_none() evaluating true) nor present (i.e. pte_present() evaluating true). However, there is some inconsistency in how this is applied, as we also have the is_swap_pte() helper which explicitly performs this check:
/* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { return !pte_none(pte) && !pte_present(pte); }
Since this is a predicate, it would seem logical that it must first be checked in order to establish that a PTE can correctly be manipulated as a swap/non-swap entry. Instead, however, we far more often utilise the established convention of checking pte_none() / pte_present() before operating on entries as if they were swap/non-swap.
This patch works towards correcting this inconsistency by removing all uses of is_swap_pte() where we already perform pte_none()/pte_present() checks anyway, or where it is otherwise clearly logical to do so. We also take advantage of the fact that pte_swp_uffd_wp() is only set on swap entries.
Additionally, update comments that reference is_swap_pte() and non_swap_entry(). No functional change intended.
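To illustrate the shape of the change (a sketch only - the handle_*() helpers are invented purely for illustration and do not exist in the tree):

	/* Before: the final branch re-tests what is already implied. */
	if (pte_present(pte))
		handle_present(pte);
	else if (is_swap_pte(pte))
		handle_softleaf(pte);
	else
		handle_none();

	/*
	 * After: order the checks so that the remaining case needs no
	 * predicate - a PTE that is neither none nor present necessarily
	 * encodes a swap or 'non-swap' swap entry.
	 */
	if (pte_none(pte))
		handle_none();
	else if (pte_present(pte))
		handle_present(pte);
	else
		handle_softleaf(pte);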
Link: https://lkml.kernel.org/r/17fd6d7f46a846517fd455fadd640af47fcd7c55.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 49 ++++++++++++++++++++++++----------- include/linux/userfaultfd_k.h | 3 +-- mm/hugetlb.c | 6 ++--- mm/internal.h | 6 ++--- mm/khugepaged.c | 29 +++++++++++---------- mm/migrate.c | 2 +- mm/mprotect.c | 43 ++++++++++++++---------------- mm/mremap.c | 7 +++-- mm/page_table_check.c | 13 ++++++---- mm/page_vma_mapped.c | 31 +++++++++++----------- 10 files changed, 104 insertions(+), 85 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5a1e897b0973..bf48fedaf128 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1017,7 +1017,9 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, young = pte_young(ptent); dirty = pte_dirty(ptent); present = true; - } else if (is_swap_pte(ptent)) { + } else if (pte_none(ptent)) { + smaps_pte_hole_lookup(addr, walk); + } else { swp_entry_t swpent = pte_to_swp_entry(ptent); if (!non_swap_entry(swpent)) { @@ -1038,9 +1040,6 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, present = true; page = pfn_swap_entry_to_page(swpent); } - } else { - smaps_pte_hole_lookup(addr, walk); - return; } if (!page) @@ -1612,6 +1611,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, */ pte_t ptent = ptep_get(pte); + if (pte_none(ptent)) + return; + if (pte_present(ptent)) { pte_t old_pte; @@ -1621,7 +1623,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); - } else if (is_swap_pte(ptent)) { + } else { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } @@ -1924,6 +1926,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, struct page *page = NULL; struct folio *folio; + if (pte_none(pte)) + goto out; + if (pte_present(pte)) { if (pm->show_pfn) frame = pte_pfn(pte); @@ -1933,8 +1938,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_SOFT_DIRTY; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; - } else if (is_swap_pte(pte)) { + } else { swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_swp_uffd_wp(pte)) @@ -1942,6 +1948,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, entry = pte_to_swp_entry(pte); if (pm->show_pfn) { pgoff_t offset; + /* * For PFN swap offsets, keeping the offset field * to 
be PFN only to be compatible with old smaps. @@ -1970,6 +1977,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, __folio_page_mapped_exclusively(folio, page)) flags |= PM_MMAP_EXCLUSIVE; } + +out: if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -2311,12 +2320,16 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - unsigned long categories = 0; + unsigned long categories; + + if (pte_none(pte)) + return 0; if (pte_present(pte)) { struct page *page; - categories |= PAGE_IS_PRESENT; + categories = PAGE_IS_PRESENT; + if (!pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; @@ -2330,10 +2343,11 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; - } else if (is_swap_pte(pte)) { + } else { softleaf_t entry; - categories |= PAGE_IS_SWAPPED; + categories = PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; @@ -2361,12 +2375,12 @@ static void make_uffd_wp_pte(struct vm_area_struct *vma, old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_mkuffd_wp(old_pte); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); - } else if (is_swap_pte(ptent)) { - ptent = pte_swp_mkuffd_wp(ptent); - set_pte_at(vma->vm_mm, addr, pte, ptent); - } else { + } else if (pte_none(ptent)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); + } else { + ptent = pte_swp_mkuffd_wp(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); } } @@ -2435,6 +2449,9 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) { unsigned long categories = PAGE_IS_HUGE; + if (pte_none(pte)) + return categories; + /* * According to pagemap_hugetlb_range(), file-backed HugeTLB * page cannot be swapped. 
So PAGE_IS_FILE is not checked for @@ -2442,6 +2459,7 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) */ if (pte_present(pte)) { categories |= PAGE_IS_PRESENT; + if (!huge_pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; if (!PageAnon(pte_page(pte))) @@ -2450,8 +2468,9 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; - } else if (is_swap_pte(pte)) { + } else { categories |= PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; if (pte_swp_soft_dirty(pte)) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 983c860a00f1..96b089dff4ef 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -441,9 +441,8 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) static inline bool pte_swp_uffd_wp_any(pte_t pte) { #ifdef CONFIG_PTE_MARKER_UFFD_WP - if (!is_swap_pte(pte)) + if (pte_present(pte)) return false; - if (pte_swp_uffd_wp(pte)) return true; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 12853cdefc9b..59d91c36770c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5092,13 +5092,13 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); - if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) + if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) { huge_pte_clear(mm, new_addr, dst_pte, sz); - else { + } else { if (need_clear_uffd_wp) { if (pte_present(pte)) pte = huge_pte_clear_uffd_wp(pte); - else if (is_swap_pte(pte)) + else pte = pte_swp_clear_uffd_wp(pte); } set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); diff --git a/mm/internal.h b/mm/internal.h index 2bad3971813b..a9b38cadb192 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -325,8 +325,7 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta - * @pte: The initial pte state; is_swap_pte(pte) must be true and - * non_swap_entry() must be false. + * @pte: The initial pte state; must be a swap entry * @delta: The direction and the offset we are moving; forward if delta * is positive; backward if delta is negative * @@ -352,8 +351,7 @@ static inline pte_t pte_move_swp_offset(pte_t pte, long delta) /** * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. - * @pte: The initial pte state; is_swap_pte(pte) must be true and - * non_swap_entry() must be false. + * @pte: The initial pte state; must be a swap entry. * * Increments the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index af1c162c9a94..d7e71c2e2571 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1019,7 +1019,8 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, } vmf.orig_pte = ptep_get_lockless(pte); - if (!is_swap_pte(vmf.orig_pte)) + if (pte_none(vmf.orig_pte) || + pte_present(vmf.orig_pte)) continue; vmf.pte = pte; @@ -1276,7 +1277,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); - if (is_swap_pte(pteval)) { + if (pte_none_or_zero(pteval)) { + ++none_or_zero; + if (!userfaultfd_armed(vma) && + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { + continue; + } else { + result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + goto out_unmap; + } + } + if (!pte_present(pteval)) { ++unmapped; if (!cc->is_khugepaged || unmapped <= khugepaged_max_ptes_swap) { @@ -1296,18 +1309,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, goto out_unmap; } } - if (pte_none_or_zero(pteval)) { - ++none_or_zero; - if (!userfaultfd_armed(vma) && - (!cc->is_khugepaged || - none_or_zero <= khugepaged_max_ptes_none)) { - continue; - } else { - result = SCAN_EXCEED_NONE_PTE; - count_vm_event(THP_SCAN_EXCEED_NONE_PTE); - goto out_unmap; - } - } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small diff --git a/mm/migrate.c b/mm/migrate.c index d8f6cd14cdb7..847c1ec17628 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -492,7 +492,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, pte = ptep_get(ptep); pte_unmap(ptep); - if (!is_swap_pte(pte)) + if (pte_none(pte) || pte_present(pte)) goto out; entry = pte_to_swp_entry(pte); diff --git a/mm/mprotect.c b/mm/mprotect.c index 918a64cc6033..aa555dfbdfc5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -297,7 +297,26 @@ static long change_pte_range(struct mmu_gather *tlb, prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent, nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb); pages += nr_ptes; - } else if (is_swap_pte(oldpte)) { + } else if (pte_none(oldpte)) { + /* + * Nobody plays with any none ptes besides + * userfaultfd when applying the protections. + */ + if (likely(!uffd_wp)) + continue; + + if (userfaultfd_wp_use_markers(vma)) { + /* + * For file-backed mem, we need to be able to + * wr-protect a none pte, because even if the + * pte is none, the page/swap cache could + * exist. Doing that by install a marker. + */ + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + pages++; + } + } else { swp_entry_t entry = pte_to_swp_entry(oldpte); pte_t newpte; @@ -358,28 +377,6 @@ static long change_pte_range(struct mmu_gather *tlb, set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } - } else { - /* It must be an none page, or what else?.. */ - WARN_ON_ONCE(!pte_none(oldpte)); - - /* - * Nobody plays with any none ptes besides - * userfaultfd when applying the protections. - */ - if (likely(!uffd_wp)) - continue; - - if (userfaultfd_wp_use_markers(vma)) { - /* - * For file-backed mem, we need to be able to - * wr-protect a none pte, because even if the - * pte is none, the page/swap cache could - * exist. Doing that by install a marker. 
- */ - set_pte_at(vma->vm_mm, addr, pte, - make_pte_marker(PTE_MARKER_UFFD_WP)); - pages++; - } } } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); diff --git a/mm/mremap.c b/mm/mremap.c index 7c21b2ad13f6..62b6827abacf 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -158,6 +158,9 @@ static void drop_rmap_locks(struct vm_area_struct *vma) static pte_t move_soft_dirty_pte(pte_t pte) { + if (pte_none(pte)) + return pte; + /* * Set soft dirty bit so we can notice * in userspace the ptes were moved. @@ -165,7 +168,7 @@ static pte_t move_soft_dirty_pte(pte_t pte) #ifdef CONFIG_MEM_SOFT_DIRTY if (pte_present(pte)) pte = pte_mksoft_dirty(pte); - else if (is_swap_pte(pte)) + else pte = pte_swp_mksoft_dirty(pte); #endif return pte; @@ -294,7 +297,7 @@ static int move_ptes(struct pagetable_move_control *pmc, if (need_clear_uffd_wp) { if (pte_present(pte)) pte = pte_clear_uffd_wp(pte); - else if (is_swap_pte(pte)) + else pte = pte_swp_clear_uffd_wp(pte); } set_ptes(mm, new_addr, new_ptep, pte, nr_ptes); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 4eeca782b888..43f75d2f7c36 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -185,12 +185,15 @@ static inline bool swap_cached_writable(swp_entry_t entry) is_writable_migration_entry(entry); } -static inline void page_table_check_pte_flags(pte_t pte) +static void page_table_check_pte_flags(pte_t pte) { - if (pte_present(pte) && pte_uffd_wp(pte)) - WARN_ON_ONCE(pte_write(pte)); - else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte)) - WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte))); + if (pte_present(pte)) { + WARN_ON_ONCE(pte_uffd_wp(pte) && pte_write(pte)); + } else if (pte_swp_uffd_wp(pte)) { + const swp_entry_t entry = pte_to_swp_entry(pte); + + WARN_ON_ONCE(swap_cached_writable(entry)); + } } void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index be20468fb5a9..a4e23818f37f 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -16,6 +16,7 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, spinlock_t **ptlp) { + bool is_migration; pte_t ptent; if (pvmw->flags & PVMW_SYNC) { @@ -26,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, return !!pvmw->pte; } + is_migration = pvmw->flags & PVMW_MIGRATION; again: /* * It is important to return the ptl corresponding to pte, @@ -41,11 +43,14 @@ again: ptent = ptep_get(pvmw->pte); - if (pvmw->flags & PVMW_MIGRATION) { - if (!is_swap_pte(ptent)) + if (pte_none(ptent)) { + return false; + } else if (pte_present(ptent)) { + if (is_migration) return false; - } else if (is_swap_pte(ptent)) { + } else if (!is_migration) { swp_entry_t entry; + /* * Handle un-addressable ZONE_DEVICE memory. 
* @@ -66,8 +71,6 @@ again: if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry)) return false; - } else if (!pte_present(ptent)) { - return false; } spin_lock(*ptlp); if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) { @@ -113,21 +116,17 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr) return false; pfn = softleaf_to_pfn(entry); - } else if (is_swap_pte(ptent)) { - swp_entry_t entry; + } else if (pte_present(ptent)) { + pfn = pte_pfn(ptent); + } else { + const softleaf_t entry = softleaf_from_pte(ptent); /* Handle un-addressable ZONE_DEVICE memory */ - entry = pte_to_swp_entry(ptent); - if (!is_device_private_entry(entry) && - !is_device_exclusive_entry(entry)) + if (!softleaf_is_device_private(entry) && + !softleaf_is_device_exclusive(entry)) return false; - pfn = swp_offset_pfn(entry); - } else { - if (!pte_present(ptent)) - return false; - - pfn = pte_pfn(ptent); + pfn = softleaf_to_pfn(entry); } if ((pfn + pte_nr - 1) < pvmw->pfn) From 06fb61462bdea3288e391487beca07cb52d6881a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:22 +0000 Subject: [PATCH 250/321] mm: eliminate is_swap_pte() when softleaf_from_pte() suffices In cases where we can simply utilise the fact that softleaf_from_pte() treats present entries as if they were none entries and thus eliminate spurious uses of is_swap_pte(), do so. No functional change intended. Link: https://lkml.kernel.org/r/92ebab9567978155116804c67babc3c64636c403.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/internal.h | 7 +++---- mm/madvise.c | 8 +++----- mm/swap_state.c | 11 +++++------ mm/swapfile.c | 9 ++++----- 4 files changed, 15 insertions(+), 20 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index a9b38cadb192..2ed041e6ebc3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include @@ -380,13 +380,12 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); pte_t *ptep = start_ptep + 1; unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); - VM_WARN_ON(!is_swap_pte(pte)); - VM_WARN_ON(non_swap_entry(entry)); + VM_WARN_ON(!softleaf_is_swap(entry)); cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { diff --git a/mm/madvise.c b/mm/madvise.c index 
c8381b954235..2d7dd7901bae 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -195,7 +195,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, for (addr = start; addr < end; addr += PAGE_SIZE) { pte_t pte; - swp_entry_t entry; + softleaf_t entry; struct folio *folio; if (!ptep++) { @@ -205,10 +205,8 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, } pte = ptep_get(ptep); - if (!is_swap_pte(pte)) - continue; - entry = pte_to_swp_entry(pte); - if (unlikely(non_swap_entry(entry))) + entry = softleaf_from_pte(pte); + if (unlikely(!softleaf_is_swap(entry))) continue; pte_unmap_unlock(ptep, ptl); diff --git a/mm/swap_state.c b/mm/swap_state.c index f4980dde5394..b2230f8a48fc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -736,7 +736,6 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, pte_t *pte = NULL, pentry; int win; unsigned long start, end, addr; - swp_entry_t entry; pgoff_t ilx; bool page_allocated; @@ -749,6 +748,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, blk_start_plug(&plug); for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) { struct swap_info_struct *si = NULL; + softleaf_t entry; if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); @@ -756,10 +756,9 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, break; } pentry = ptep_get_lockless(pte); - if (!is_swap_pte(pentry)) - continue; - entry = pte_to_swp_entry(pentry); - if (unlikely(non_swap_entry(entry))) + entry = softleaf_from_pte(pentry); + + if (!softleaf_is_swap(entry)) continue; pte_unmap(pte); pte = NULL; diff --git a/mm/swapfile.c b/mm/swapfile.c index 04435f7ae7bf..8c7f14061f5b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -44,7 +44,7 @@ #include #include -#include +#include #include #include "swap_table.h" #include "internal.h" @@ -2257,7 +2257,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, struct folio *folio; unsigned long offset; unsigned char swp_count; - swp_entry_t entry; + softleaf_t entry; int ret; pte_t ptent; @@ -2268,11 +2268,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } ptent = ptep_get_lockless(pte); + entry = softleaf_from_pte(ptent); - if (!is_swap_pte(ptent)) + if (!softleaf_is_swap(entry)) continue; - - entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; From fb410d8b89e89ef61b18326f07c477f563b631f6 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:23 +0000 Subject: [PATCH 251/321] mm: use leaf entries in debug pgtable + remove is_swap_pte() Remove invocations of is_swap_pte() in mm/debug_vm_pgtable.c and use softleaf_from_pte() and softleaf_is_swap() as necessary to replace this usage. We update the test code to use a 'true' swap entry throughout so we are guaranteed this is not a non-swap entry, so all asserts continue to operate correctly. With this change in place, we no longer use is_swap_pte() anywhere, so remove it. 
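As a rough sketch of what the converted assertions look like (shape taken from the hunks below; args is the pgtable_debug_args context already used throughout the file):

	pte_t pte = swp_entry_to_pte(args->swp_entry);
	softleaf_t entry = softleaf_from_pte(pte);

	/* Previously: WARN_ON(!is_swap_pte(pte)); */
	WARN_ON(!softleaf_is_swap(entry));

The test swap entry itself is now built as swp_entry(MAX_SWAPFILES - 1, max_swap_offset), i.e. with the maximum probe-able offset and the largest ordinary swap type, so it is guaranteed to remain a genuine swap entry rather than one of the special 'non-swap' types, and softleaf_is_swap() holds for it.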
Link: https://lkml.kernel.org/r/222f352e7a99191b4bdfa77e835f2fc0dd83fa72.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swapops.h | 6 ------ mm/debug_vm_pgtable.c | 39 ++++++++++++++++++++++++--------------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 0a4b3f51ecf5..a66ac4f2105c 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -120,12 +120,6 @@ static inline unsigned long swp_offset_pfn(swp_entry_t entry) return swp_offset(entry) & SWP_PFN_MASK; } -/* check whether a pte points to a swap entry */ -static inline int is_swap_pte(pte_t pte) -{ - return !pte_none(pte) && !pte_present(pte); -} - /* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. 
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 055e0e025b42..fff311830959 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -714,14 +714,16 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) { pte_t pte; + softleaf_t entry; if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return; pr_debug("Validating PTE swap soft dirty\n"); pte = swp_entry_to_pte(args->swp_entry); - WARN_ON(!is_swap_pte(pte)); + entry = softleaf_from_pte(pte); + WARN_ON(!softleaf_is_swap(entry)); WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte))); WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte))); } @@ -768,40 +770,47 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) { - swp_entry_t entry, entry2; + swp_entry_t entry; + softleaf_t softleaf; pte_t pte; pr_debug("Validating PTE swap exclusive\n"); entry = args->swp_entry; pte = swp_entry_to_pte(entry); + softleaf = softleaf_from_pte(pte); + WARN_ON(pte_swp_exclusive(pte)); - WARN_ON(!is_swap_pte(pte)); - entry2 = pte_to_swp_entry(pte); - WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + WARN_ON(!softleaf_is_swap(softleaf)); + WARN_ON(memcmp(&entry, &softleaf, sizeof(entry))); pte = pte_swp_mkexclusive(pte); + softleaf = softleaf_from_pte(pte); + WARN_ON(!pte_swp_exclusive(pte)); - WARN_ON(!is_swap_pte(pte)); + WARN_ON(!softleaf_is_swap(softleaf)); WARN_ON(pte_swp_soft_dirty(pte)); - entry2 = pte_to_swp_entry(pte); - WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + WARN_ON(memcmp(&entry, &softleaf, sizeof(entry))); pte = pte_swp_clear_exclusive(pte); + softleaf = softleaf_from_pte(pte); + WARN_ON(pte_swp_exclusive(pte)); - WARN_ON(!is_swap_pte(pte)); - entry2 = pte_to_swp_entry(pte); - WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + WARN_ON(!softleaf_is_swap(softleaf)); + WARN_ON(memcmp(&entry, &softleaf, sizeof(entry))); } static void __init pte_swap_tests(struct pgtable_debug_args *args) { swp_entry_t arch_entry; + softleaf_t entry; pte_t pte1, pte2; pr_debug("Validating PTE swap\n"); pte1 = swp_entry_to_pte(args->swp_entry); - WARN_ON(!is_swap_pte(pte1)); + entry = softleaf_from_pte(pte1); + + WARN_ON(!softleaf_is_swap(entry)); arch_entry = __pte_to_swp_entry(pte1); pte2 = __swp_entry_to_pte(arch_entry); @@ -1218,8 +1227,8 @@ static int __init init_args(struct pgtable_debug_args *args) /* See generic_max_swapfile_size(): probe the maximum offset */ max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); - /* Create a swp entry with all possible bits set */ - args->swp_entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset); + /* Create a swp entry with all possible bits set while still being swap. */ + args->swp_entry = swp_entry(MAX_SWAPFILES - 1, max_swap_offset); /* * Allocate (huge) pages because some of the tests need to access From de4d6c94914f3659f0b51725e23e637d4e9f78cc Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:24 +0000 Subject: [PATCH 252/321] fs/proc/task_mmu: refactor pagemap_pmd_range() Separate out THP logic so we can drop an indentation level and reduce the amount of noise in this function. We add pagemap_pmd_range_thp() for this purpose. While we're here, convert the VM_BUG_ON() to a VM_WARN_ON_ONCE() at the same time. 
No functional change intended. Link: https://lkml.kernel.org/r/f9ce7f3bb57e3627288225e23f2498cc5315f5ab.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 148 ++++++++++++++++++++++++--------------------- 1 file changed, 78 insertions(+), 70 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bf48fedaf128..8c35ea48a93e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1985,6 +1985,81 @@ out: return make_pme(frame, flags); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct vm_area_struct *vma, + struct pagemapread *pm) +{ + unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; + u64 flags = 0, frame = 0; + pmd_t pmd = *pmdp; + struct page *page = NULL; + struct folio *folio = NULL; + int err = 0; + + if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; + + if (pmd_present(pmd)) { + page = pmd_page(pmd); + + flags |= PM_PRESENT; + if (pmd_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + if (pmd_uffd_wp(pmd)) + flags |= PM_UFFD_WP; + if (pm->show_pfn) + frame = pmd_pfn(pmd) + idx; + } else if (thp_migration_supported() && is_swap_pmd(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + unsigned long offset; + + if (pm->show_pfn) { + if (is_pfn_swap_entry(entry)) + offset = swp_offset_pfn(entry) + idx; + else + offset = swp_offset(entry) + idx; + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); + } + flags |= PM_SWAP; + if (pmd_swp_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + if (pmd_swp_uffd_wp(pmd)) + flags |= PM_UFFD_WP; + VM_WARN_ON_ONCE(!is_pmd_migration_entry(pmd)); + page = pfn_swap_entry_to_page(entry); + } + + if (page) { + folio = page_folio(page); + if (!folio_test_anon(folio)) + flags |= PM_FILE; + } + + for (; addr != end; addr += PAGE_SIZE, idx++) { + u64 cur_flags = flags; + pagemap_entry_t pme; + + if (folio && (flags & PM_PRESENT) && + __folio_page_mapped_exclusively(folio, page)) + cur_flags |= PM_MMAP_EXCLUSIVE; + + pme = make_pme(frame, cur_flags); + err = add_to_pagemap(&pme, pm); + if (err) + break; + if (pm->show_pfn) { + if (flags & PM_PRESENT) + frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); + } + } + return err; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -1993,82 +2068,15 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, spinlock_t *ptl; pte_t *pte, 
*orig_pte; int err = 0; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmdp, vma); if (ptl) { - unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; - u64 flags = 0, frame = 0; - pmd_t pmd = *pmdp; - struct page *page = NULL; - struct folio *folio = NULL; - - if (vma->vm_flags & VM_SOFTDIRTY) - flags |= PM_SOFT_DIRTY; - - if (pmd_present(pmd)) { - page = pmd_page(pmd); - - flags |= PM_PRESENT; - if (pmd_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; - if (pmd_uffd_wp(pmd)) - flags |= PM_UFFD_WP; - if (pm->show_pfn) - frame = pmd_pfn(pmd) + idx; - } -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - else if (is_swap_pmd(pmd)) { - swp_entry_t entry = pmd_to_swp_entry(pmd); - unsigned long offset; - - if (pm->show_pfn) { - if (is_pfn_swap_entry(entry)) - offset = swp_offset_pfn(entry) + idx; - else - offset = swp_offset(entry) + idx; - frame = swp_type(entry) | - (offset << MAX_SWAPFILES_SHIFT); - } - flags |= PM_SWAP; - if (pmd_swp_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; - if (pmd_swp_uffd_wp(pmd)) - flags |= PM_UFFD_WP; - VM_BUG_ON(!is_pmd_migration_entry(pmd)); - page = pfn_swap_entry_to_page(entry); - } -#endif - - if (page) { - folio = page_folio(page); - if (!folio_test_anon(folio)) - flags |= PM_FILE; - } - - for (; addr != end; addr += PAGE_SIZE, idx++) { - u64 cur_flags = flags; - pagemap_entry_t pme; - - if (folio && (flags & PM_PRESENT) && - __folio_page_mapped_exclusively(folio, page)) - cur_flags |= PM_MMAP_EXCLUSIVE; - - pme = make_pme(frame, cur_flags); - err = add_to_pagemap(&pme, pm); - if (err) - break; - if (pm->show_pfn) { - if (flags & PM_PRESENT) - frame++; - else if (flags & PM_SWAP) - frame += (1 << MAX_SWAPFILES_SHIFT); - } - } + err = pagemap_pmd_range_thp(pmdp, addr, end, vma, pm); spin_unlock(ptl); return err; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* * We can assume that @vma always points to a valid one and @end never From aa62204cb680d8ff32497181fc9e0dac4956f7e5 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:25 +0000 Subject: [PATCH 253/321] mm: avoid unnecessary use of is_swap_pmd() PMD 'non-swap' swap entries are currently used for PMD-level migration entries and device private entries. To add to the confusion in this terminology we use is_swap_pmd() in an inconsistent way similar to how is_swap_pte() was being used - sometimes adopting the convention that !pmd_none(), !pmd_present() implies PMD 'swap' entry, sometimes not. This patch handles the low-hanging fruit of cases where we can simply substitute other predicates for is_swap_pmd(). No functional change intended. 
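The substitution takes the same shape as the PTE case. As a sketch (the helper name pmd_is_migration_sketch() is made up for illustration; the real conversions are in the hunks below):

	static inline bool pmd_is_migration_sketch(pmd_t pmd)
	{
		if (pmd_none(pmd))
			return false;	/* empty entry */
		if (pmd_present(pmd))
			return false;	/* hardware-decoded mapping */

		/*
		 * Neither none nor present: the PMD encodes a swap or
		 * 'non-swap' entry, which is all is_swap_pmd() ever
		 * established.
		 */
		return is_migration_entry(pmd_to_swp_entry(pmd));
	}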
Link: https://lkml.kernel.org/r/8a1704b36a009c18032d5bea4cb68e71448fbbe5.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 15 ++++++++++--- include/linux/swapops.h | 16 +++++++++++-- mm/huge_memory.c | 4 +++- mm/memory.c | 50 +++++++++++++++++++++++------------------ mm/page_table_check.c | 12 ++++++---- 5 files changed, 65 insertions(+), 32 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8c35ea48a93e..1bedf7fa5e79 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1059,10 +1059,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, bool present = false; struct folio *folio; + if (pmd_none(*pmd)) + return; if (pmd_present(*pmd)) { page = vm_normal_page_pmd(vma, addr, *pmd); present = true; - } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { + } else if (unlikely(thp_migration_supported())) { swp_entry_t entry = pmd_to_swp_entry(*pmd); if (is_pfn_swap_entry(entry)) @@ -2000,6 +2002,9 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; + if (pmd_none(pmd)) + goto populate_pagemap; + if (pmd_present(pmd)) { page = pmd_page(pmd); @@ -2010,7 +2015,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, flags |= PM_UFFD_WP; if (pm->show_pfn) frame = pmd_pfn(pmd) + idx; - } else if (thp_migration_supported() && is_swap_pmd(pmd)) { + } else if (thp_migration_supported()) { swp_entry_t entry = pmd_to_swp_entry(pmd); unsigned long offset; @@ -2037,6 +2042,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, flags |= PM_FILE; } +populate_pagemap: for (; addr != end; addr += PAGE_SIZE, idx++) { u64 cur_flags = flags; pagemap_entry_t pme; @@ -2399,6 +2405,9 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, { unsigned long categories = PAGE_IS_HUGE; + if (pmd_none(pmd)) + return categories; + if (pmd_present(pmd)) { struct page *page; @@ -2416,7 +2425,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, categories |= PAGE_IS_PFNZERO; if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; - } else if (is_swap_pmd(pmd)) { + } else { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; diff --git a/include/linux/swapops.h b/include/linux/swapops.h index a66ac4f2105c..3e8dd6ea94dd 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -509,7 +509,13 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) static inline int 
is_pmd_migration_entry(pmd_t pmd) { - return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); + swp_entry_t entry; + + if (pmd_present(pmd)) + return 0; + + entry = pmd_to_swp_entry(pmd); + return is_migration_entry(entry); } #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, @@ -557,7 +563,13 @@ static inline int is_pmd_migration_entry(pmd_t pmd) */ static inline int is_pmd_device_private_entry(pmd_t pmd) { - return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd)); + swp_entry_t entry; + + if (pmd_present(pmd)) + return 0; + + entry = pmd_to_swp_entry(pmd); + return is_device_private_entry(entry); } #else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d79a4bb363de..b88b4b866cb3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2354,9 +2354,11 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd) static pmd_t clear_uffd_wp_pmd(pmd_t pmd) { + if (pmd_none(pmd)) + return pmd; if (pmd_present(pmd)) pmd = pmd_clear_uffd_wp(pmd); - else if (is_swap_pmd(pmd)) + else pmd = pmd_swp_clear_uffd_wp(pmd); return pmd; diff --git a/mm/memory.c b/mm/memory.c index 0caf8c5c8c68..76c17feff88b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1376,6 +1376,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, next = pmd_addr_end(addr, end); if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) { int err; + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma); @@ -6340,35 +6341,40 @@ retry_pud: if (pmd_none(*vmf.pmd) && thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) + if (ret & VM_FAULT_FALLBACK) + goto fallback; + else return ret; - } else { - vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); + } - if (unlikely(is_swap_pmd(vmf.orig_pmd))) { - if (is_pmd_device_private_entry(vmf.orig_pmd)) - return do_huge_pmd_device_private(&vmf); + vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); + if (pmd_none(vmf.orig_pmd)) + goto fallback; - if (is_pmd_migration_entry(vmf.orig_pmd)) - pmd_migration_entry_wait(mm, vmf.pmd); + if (unlikely(!pmd_present(vmf.orig_pmd))) { + if (is_pmd_device_private_entry(vmf.orig_pmd)) + return do_huge_pmd_device_private(&vmf); + + if (is_pmd_migration_entry(vmf.orig_pmd)) + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } + if (pmd_trans_huge(vmf.orig_pmd)) { + if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) + return do_huge_pmd_numa_page(&vmf); + + if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && + !pmd_write(vmf.orig_pmd)) { + ret = wp_huge_pmd(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + huge_pmd_set_accessed(&vmf); return 0; } - if (pmd_trans_huge(vmf.orig_pmd)) { - if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&vmf); - - if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !pmd_write(vmf.orig_pmd)) { - ret = wp_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) - return ret; - } else { - huge_pmd_set_accessed(&vmf); - return 0; - } - } } +fallback: return handle_pte_fault(&vmf); } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 43f75d2f7c36..f5f25e120f69 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -215,10 +215,14 @@ EXPORT_SYMBOL(__page_table_check_ptes_set); static inline void page_table_check_pmd_flags(pmd_t pmd) 
{ - if (pmd_present(pmd) && pmd_uffd_wp(pmd)) - WARN_ON_ONCE(pmd_write(pmd)); - else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd)) - WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); + if (pmd_present(pmd)) { + if (pmd_uffd_wp(pmd)) + WARN_ON_ONCE(pmd_write(pmd)); + } else if (pmd_swp_uffd_wp(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + WARN_ON_ONCE(swap_cached_writable(entry)); + } } void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, From e244d82d0290340d5ba062f46eff2ede0bd50abe Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:26 +0000 Subject: [PATCH 254/321] mm/huge_memory: refactor copy_huge_pmd() non-present logic Right now we are inconsistent in our use of thp_migration_supported(): static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } And simply having arbitrary and ugly #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION blocks in code. This is exhibited in copy_huge_pmd(), which inserts a large #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION block and an if-branch which is difficult to follow It's difficult to follow the logic of such a large function and the non-present PMD logic is clearly separate as it sits in a giant if-branch. Therefore this patch both separates out the logic and utilises thp_migration_supported(). No functional change intended. Link: https://lkml.kernel.org/r/6eaadc23ed512d370ede65561e34e96241c54b9d.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 109 +++++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 50 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b88b4b866cb3..3bf8d2bf9374 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1699,6 +1699,62 @@ void touch_pmd(struct vm_area_struct *vma, unsigned long addr, update_mmu_cache_pmd(vma, addr, pmd); } +static void copy_huge_non_present_pmd( + struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pmd_t pmd, pgtable_t pgtable) +{ + swp_entry_t entry = pmd_to_swp_entry(pmd); + struct folio *src_folio; + + VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd)); + + if (is_writable_migration_entry(entry) || + is_readable_exclusive_migration_entry(entry)) { + entry = make_readable_migration_entry(swp_offset(entry)); + pmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + if 
(pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } else if (is_device_private_entry(entry)) { + /* + * For device private entries, since there are no + * read exclusive entries, writable = !readable + */ + if (is_writable_device_private_entry(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + pmd = swp_entry_to_pmd(entry); + + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + if (pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + + src_folio = pfn_swap_entry_folio(entry); + VM_WARN_ON(!folio_test_large(src_folio)); + + folio_get(src_folio); + /* + * folio_try_dup_anon_rmap_pmd does not fail for + * device private entries. + */ + folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page, + dst_vma, src_vma); + } + + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm_inc_nr_ptes(dst_mm); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_swp_clear_uffd_wp(pmd); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); +} + int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) @@ -1744,59 +1800,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - if (unlikely(is_swap_pmd(pmd))) { - swp_entry_t entry = pmd_to_swp_entry(pmd); - - VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd)); - - if (is_writable_migration_entry(entry) || - is_readable_exclusive_migration_entry(entry)) { - entry = make_readable_migration_entry(swp_offset(entry)); - pmd = swp_entry_to_pmd(entry); - if (pmd_swp_soft_dirty(*src_pmd)) - pmd = pmd_swp_mksoft_dirty(pmd); - if (pmd_swp_uffd_wp(*src_pmd)) - pmd = pmd_swp_mkuffd_wp(pmd); - set_pmd_at(src_mm, addr, src_pmd, pmd); - } else if (is_device_private_entry(entry)) { - /* - * For device private entries, since there are no - * read exclusive entries, writable = !readable - */ - if (is_writable_device_private_entry(entry)) { - entry = make_readable_device_private_entry(swp_offset(entry)); - pmd = swp_entry_to_pmd(entry); - - if (pmd_swp_soft_dirty(*src_pmd)) - pmd = pmd_swp_mksoft_dirty(pmd); - if (pmd_swp_uffd_wp(*src_pmd)) - pmd = pmd_swp_mkuffd_wp(pmd); - set_pmd_at(src_mm, addr, src_pmd, pmd); - } - - src_folio = pfn_swap_entry_folio(entry); - VM_WARN_ON(!folio_test_large(src_folio)); - - folio_get(src_folio); - /* - * folio_try_dup_anon_rmap_pmd does not fail for - * device private entries. 
- */ - folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page, - dst_vma, src_vma); - } - - add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm_inc_nr_ptes(dst_mm); - pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); - if (!userfaultfd_wp(dst_vma)) - pmd = pmd_swp_clear_uffd_wp(pmd); - set_pmd_at(dst_mm, addr, dst_pmd, pmd); + if (unlikely(thp_migration_supported() && is_swap_pmd(pmd))) { + copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, + dst_vma, src_vma, pmd, pgtable); ret = 0; goto out_unlock; } -#endif if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); From 5dfa7916050558b744c81c5d824649ff4e66e7e3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:27 +0000 Subject: [PATCH 255/321] mm/huge_memory: refactor change_huge_pmd() non-present logic Similar to copy_huge_pmd(), there is a large mass of open-coded logic for the CONFIG_ARCH_ENABLE_THP_MIGRATION non-present entry case that does not use thp_migration_supported() consistently. Resolve this by separating out this logic and introduce change_non_present_huge_pmd(). No functional change intended. Link: https://lkml.kernel.org/r/451b85636ad711e307fdfbff19af699fdab4d05f.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 72 ++++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3bf8d2bf9374..0fdb3be39e31 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2424,6 +2424,42 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, return false; } +static void change_non_present_huge_pmd(struct mm_struct *mm, + unsigned long addr, pmd_t *pmd, bool uffd_wp, + bool uffd_wp_resolve) +{ + swp_entry_t entry = pmd_to_swp_entry(*pmd); + struct folio *folio = pfn_swap_entry_folio(entry); + pmd_t newpmd; + + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd)); + if (is_writable_migration_entry(entry)) { + /* + * A protection check is difficult so + * just be safe and disable write + */ + if (folio_test_anon(folio)) + entry = make_readable_exclusive_migration_entry(swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); + newpmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*pmd)) + newpmd = pmd_swp_mksoft_dirty(newpmd); + } else if (is_writable_device_private_entry(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + newpmd = swp_entry_to_pmd(entry); + } else { + newpmd = *pmd; + } + + if 
(uffd_wp) + newpmd = pmd_swp_mkuffd_wp(newpmd); + else if (uffd_wp_resolve) + newpmd = pmd_swp_clear_uffd_wp(newpmd); + if (!pmd_same(*pmd, newpmd)) + set_pmd_at(mm, addr, pmd, newpmd); +} + /* * Returns * - 0 if PMD could not be locked @@ -2452,41 +2488,11 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - if (is_swap_pmd(*pmd)) { - swp_entry_t entry = pmd_to_swp_entry(*pmd); - struct folio *folio = pfn_swap_entry_folio(entry); - pmd_t newpmd; - - VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd)); - if (is_writable_migration_entry(entry)) { - /* - * A protection check is difficult so - * just be safe and disable write - */ - if (folio_test_anon(folio)) - entry = make_readable_exclusive_migration_entry(swp_offset(entry)); - else - entry = make_readable_migration_entry(swp_offset(entry)); - newpmd = swp_entry_to_pmd(entry); - if (pmd_swp_soft_dirty(*pmd)) - newpmd = pmd_swp_mksoft_dirty(newpmd); - } else if (is_writable_device_private_entry(entry)) { - entry = make_readable_device_private_entry(swp_offset(entry)); - newpmd = swp_entry_to_pmd(entry); - } else { - newpmd = *pmd; - } - - if (uffd_wp) - newpmd = pmd_swp_mkuffd_wp(newpmd); - else if (uffd_wp_resolve) - newpmd = pmd_swp_clear_uffd_wp(newpmd); - if (!pmd_same(*pmd, newpmd)) - set_pmd_at(mm, addr, pmd, newpmd); + if (thp_migration_supported() && is_swap_pmd(*pmd)) { + change_non_present_huge_pmd(mm, addr, pmd, uffd_wp, + uffd_wp_resolve); goto unlock; } -#endif if (prot_numa) { From 0ac881efe16468503e8c1e7d8a7210b75f027ce3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:28 +0000 Subject: [PATCH 256/321] mm: replace pmd_to_swp_entry() with softleaf_from_pmd() Introduce softleaf_from_pmd() to do the equivalent operation for PMDs that softleaf_from_pte() fulfils, and cascade changes through the code base accordingly, introducing helpers as necessary. We are then able to eliminate pmd_to_swp_entry(), is_pmd_migration_entry(), is_pmd_device_private_entry() and is_pmd_non_present_folio_entry(). This further establishes the use of leaf operations throughout the code base and lays the foundations for eliminating is_swap_pmd(). No functional change intended.
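For illustration only (this snippet is not part of the patch): with the helpers this series adds in include/linux/leafops.h, a caller that previously extracted a swp_entry_t via pmd_to_swp_entry() can instead ask the leaf-entry question directly. The wrapper below is hypothetical; only the softleaf_*() calls come from this series, and softleaf_from_pmd() returns an empty entry for present or none PMDs, as shown in the diff below.

	/* Hypothetical caller, shown only to sketch the intended usage. */
	static struct folio *folio_of_softleaf_pmd(pmd_t pmd)
	{
		const softleaf_t entry = softleaf_from_pmd(pmd);

		/* Only migration and device private entries are valid at PMD level. */
		if (softleaf_is_migration(entry) || softleaf_is_device_private(entry))
			return softleaf_to_folio(entry);

		return NULL;
	}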
[lorenzo.stoakes@oracle.com: check writable, not readable/writable, per Vlastimil] Link: https://lkml.kernel.org/r/cd97b6ec-00f9-45a4-9ae0-8f009c212a94@lucifer.local Link: https://lkml.kernel.org/r/3fb431699639ded8fdc63d2210aa77a38c8891f1.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: SeongJae Park \ Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 27 +++-- include/linux/leafops.h | 218 +++++++++++++++++++++++++++++++++++++++- include/linux/migrate.h | 2 +- include/linux/swapops.h | 100 ------------------ mm/damon/ops-common.c | 6 +- mm/filemap.c | 6 +- mm/hmm.c | 16 +-- mm/huge_memory.c | 98 +++++++++--------- mm/khugepaged.c | 4 +- mm/madvise.c | 2 +- mm/memory.c | 4 +- mm/mempolicy.c | 4 +- mm/migrate.c | 20 ++-- mm/migrate_device.c | 14 +-- mm/page_table_check.c | 16 +-- mm/page_vma_mapped.c | 15 +-- mm/pagewalk.c | 8 +- mm/rmap.c | 4 +- 18 files changed, 339 insertions(+), 225 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1bedf7fa5e79..898df952b6bc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1065,10 +1065,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, page = vm_normal_page_pmd(vma, addr, *pmd); present = true; } else if (unlikely(thp_migration_supported())) { - swp_entry_t entry = pmd_to_swp_entry(*pmd); + const softleaf_t entry = softleaf_from_pmd(*pmd); - if (is_pfn_swap_entry(entry)) - page = pfn_swap_entry_to_page(entry); + if (softleaf_has_pfn(entry)) + page = softleaf_to_page(entry); } if (IS_ERR_OR_NULL(page)) return; @@ -1655,7 +1655,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, pmd = pmd_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); - } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + } else if (pmd_is_migration_entry(pmd)) { pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } @@ -2016,12 +2016,12 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, if (pm->show_pfn) frame = pmd_pfn(pmd) + idx; } else if (thp_migration_supported()) { - swp_entry_t entry = pmd_to_swp_entry(pmd); + const softleaf_t entry = softleaf_from_pmd(pmd); unsigned long offset; if (pm->show_pfn) { - if (is_pfn_swap_entry(entry)) - offset = swp_offset_pfn(entry) + idx; + if (softleaf_has_pfn(entry)) + offset = softleaf_to_pfn(entry) + idx; else offset = swp_offset(entry) + idx; frame = swp_type(entry) | @@ -2032,7 +2032,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, flags |= PM_SOFT_DIRTY; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; - 
VM_WARN_ON_ONCE(!is_pmd_migration_entry(pmd)); + VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd)); page = pfn_swap_entry_to_page(entry); } @@ -2426,8 +2426,6 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; } else { - swp_entry_t swp; - categories |= PAGE_IS_SWAPPED; if (!pmd_swp_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; @@ -2435,9 +2433,10 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, categories |= PAGE_IS_SOFT_DIRTY; if (p->masks_of_interest & PAGE_IS_FILE) { - swp = pmd_to_swp_entry(pmd); - if (is_pfn_swap_entry(swp) && - !folio_test_anon(pfn_swap_entry_folio(swp))) + const softleaf_t entry = softleaf_from_pmd(pmd); + + if (softleaf_has_pfn(entry) && + !folio_test_anon(softleaf_to_folio(entry))) categories |= PAGE_IS_FILE; } } @@ -2454,7 +2453,7 @@ static void make_uffd_wp_pmd(struct vm_area_struct *vma, old = pmdp_invalidate_ad(vma, addr, pmdp); pmd = pmd_mkuffd_wp(old); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); - } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + } else if (pmd_is_migration_entry(pmd)) { pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } diff --git a/include/linux/leafops.h b/include/linux/leafops.h index cff9d94fd5d1..f5ea9b0385ff 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -61,6 +61,57 @@ static inline softleaf_t softleaf_from_pte(pte_t pte) return pte_to_swp_entry(pte); } +/** + * softleaf_to_pte() - Obtain a PTE entry from a leaf entry. + * @entry: Leaf entry. + * + * This generates an architecture-specific PTE entry that can be utilised to + * encode the metadata the leaf entry encodes. + * + * Returns: Architecture-specific PTE entry encoding leaf entry. + */ +static inline pte_t softleaf_to_pte(softleaf_t entry) +{ + /* Temporary until swp_entry_t eliminated. */ + return swp_entry_to_pte(entry); +} + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +/** + * softleaf_from_pmd() - Obtain a leaf entry from a PMD entry. + * @pmd: PMD entry. + * + * If @pmd is present (therefore not a leaf entry) the function returns an empty + * leaf entry. Otherwise, it returns a leaf entry. + * + * Returns: Leaf entry. + */ +static inline softleaf_t softleaf_from_pmd(pmd_t pmd) +{ + softleaf_t arch_entry; + + if (pmd_present(pmd) || pmd_none(pmd)) + return softleaf_mk_none(); + + if (pmd_swp_soft_dirty(pmd)) + pmd = pmd_swp_clear_soft_dirty(pmd); + if (pmd_swp_uffd_wp(pmd)) + pmd = pmd_swp_clear_uffd_wp(pmd); + arch_entry = __pmd_to_swp_entry(pmd); + + /* Temporary until swp_entry_t eliminated. */ + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); +} + +#else + +static inline softleaf_t softleaf_from_pmd(pmd_t pmd) +{ + return softleaf_mk_none(); +} + +#endif + /** * softleaf_is_none() - Is the leaf entry empty? * @entry: Leaf entry. @@ -134,6 +185,43 @@ static inline bool softleaf_is_swap(softleaf_t entry) return softleaf_type(entry) == SOFTLEAF_SWAP; } +/** + * softleaf_is_migration_write() - Is this leaf entry a writable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a writable migration entry, otherwise + * false. + */ +static inline bool softleaf_is_migration_write(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_WRITE; +} + +/** + * softleaf_is_migration_read() - Is this leaf entry a readable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a readable migration entry, otherwise + * false. 
+ */ +static inline bool softleaf_is_migration_read(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ; +} + +/** + * softleaf_is_migration_read_exclusive() - Is this leaf entry an exclusive + * readable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is an exclusive readable migration entry, + * otherwise false. + */ +static inline bool softleaf_is_migration_read_exclusive(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ_EXCLUSIVE; +} + /** * softleaf_is_migration() - Is this leaf entry a migration entry? * @entry: Leaf entry. @@ -152,6 +240,19 @@ static inline bool softleaf_is_migration(softleaf_t entry) } } +/** + * softleaf_is_device_private_write() - Is this leaf entry a device private + * writable entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device private writable entry, otherwise + * false. + */ +static inline bool softleaf_is_device_private_write(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_DEVICE_PRIVATE_WRITE; +} + /** * softleaf_is_device_private() - Is this leaf entry a device private entry? * @entry: Leaf entry. @@ -170,10 +271,10 @@ static inline bool softleaf_is_device_private(softleaf_t entry) } /** - * softleaf_is_device_exclusive() - Is this leaf entry a device exclusive entry? + * softleaf_is_device_exclusive() - Is this leaf entry a device-exclusive entry? * @entry: Leaf entry. * - * Returns: true if the leaf entry is a device exclusive entry, otherwise false. + * Returns: true if the leaf entry is a device-exclusive entry, otherwise false. */ static inline bool softleaf_is_device_exclusive(softleaf_t entry) { @@ -332,6 +433,61 @@ static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry) return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP; } +#ifdef CONFIG_MIGRATION + +/** + * softleaf_is_migration_young() - Does this migration entry contain an accessed + * bit? + * @entry: Leaf entry. + * + * If the architecture can support storing A/D bits in migration entries, this + * determines whether the accessed (or 'young') bit was set on the migrated page + * table entry. + * + * Returns: true if the entry contains an accessed bit, otherwise false. + */ +static inline bool softleaf_is_migration_young(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); + + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_YOUNG; + /* Keep the old behavior of aging page after migration */ + return false; +} + +/** + * softleaf_is_migration_dirty() - Does this migration entry contain a dirty bit? + * @entry: Leaf entry. + * + * If the architecture can support storing A/D bits in migration entries, this + * determines whether the dirty bit was set on the migrated page table entry. + * + * Returns: true if the entry contains a dirty bit, otherwise false. + */ +static inline bool softleaf_is_migration_dirty(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); + + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_DIRTY; + /* Keep the old behavior of clean page after migration */ + return false; +} + +#else /* CONFIG_MIGRATION */ + +static inline bool softleaf_is_migration_young(softleaf_t entry) +{ + return false; +} + +static inline bool softleaf_is_migration_dirty(softleaf_t entry) +{ + return false; +} +#endif /* CONFIG_MIGRATION */ + /** * pte_is_marker() - Does the PTE entry encode a marker leaf entry? * @pte: PTE entry. 
@@ -383,5 +539,63 @@ static inline bool pte_is_uffd_marker(pte_t pte) return false; } +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) + +/** + * pmd_is_device_private_entry() - Check if PMD contains a device private swap + * entry. + * @pmd: The PMD to check. + * + * Returns true if the PMD contains a swap entry that represents a device private + * page mapping. This is used for zone device private pages that have been + * swapped out but still need special handling during various memory management + * operations. + * + * Return: true if PMD contains device private entry, false otherwise + */ +static inline bool pmd_is_device_private_entry(pmd_t pmd) +{ + return softleaf_is_device_private(softleaf_from_pmd(pmd)); +} + +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +static inline bool pmd_is_device_private_entry(pmd_t pmd) +{ + return false; +} + +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +/** + * pmd_is_migration_entry() - Does this PMD entry encode a migration entry? + * @pmd: PMD entry. + * + * Returns: true if the PMD encodes a migration entry, otherwise false. + */ +static inline bool pmd_is_migration_entry(pmd_t pmd) +{ + return softleaf_is_migration(softleaf_from_pmd(pmd)); +} + +/** + * pmd_is_valid_softleaf() - Is this PMD entry a valid leaf entry? + * @pmd: PMD entry. + * + * PMD leaf entries are valid only if they are device private or migration + * entries. This function asserts that a PMD leaf entry is valid in this + * respect. + * + * Returns: true if the PMD entry is a valid leaf entry, otherwise false. + */ +static inline bool pmd_is_valid_softleaf(pmd_t pmd) +{ + const softleaf_t entry = softleaf_from_pmd(pmd); + + /* Only device private, migration entries valid for PMD. 
*/ + return softleaf_is_device_private(entry) || + softleaf_is_migration(entry); +} + #endif /* CONFIG_MMU */ #endif /* _LINUX_LEAFOPS_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 41b4cc05a450..26ca00c325d9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) +void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3e8dd6ea94dd..f1277647262d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -283,14 +283,6 @@ static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_young(swp_entry_t entry) -{ - if (migration_entry_supports_ad()) - return swp_offset(entry) & SWP_MIG_YOUNG; - /* Keep the old behavior of aging page after migration */ - return false; -} - static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) @@ -299,14 +291,6 @@ static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_dirty(swp_entry_t entry) -{ - if (migration_entry_supports_ad()) - return swp_offset(entry) & SWP_MIG_DIRTY; - /* Keep the old behavior of clean page after migration */ - return false; -} - extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); @@ -349,20 +333,11 @@ static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_young(swp_entry_t entry) -{ - return false; -} - static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { return entry; } -static inline bool is_migration_entry_dirty(swp_entry_t entry) -{ - return false; -} #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE @@ -487,18 +462,6 @@ extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); -static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) -{ - swp_entry_t arch_entry; - - if (pmd_swp_soft_dirty(pmd)) - pmd = pmd_swp_clear_soft_dirty(pmd); - if (pmd_swp_uffd_wp(pmd)) - pmd = pmd_swp_clear_uffd_wp(pmd); - arch_entry = __pmd_to_swp_entry(pmd); - return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); -} - static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { swp_entry_t arch_entry; @@ -507,23 +470,7 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) return __swp_entry_to_pmd(arch_entry); } -static inline int is_pmd_migration_entry(pmd_t pmd) -{ - swp_entry_t entry; - - if (pmd_present(pmd)) - return 0; - - entry = pmd_to_swp_entry(pmd); - return is_migration_entry(entry); -} #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, - struct page *page) -{ - BUILD_BUG(); -} - static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { @@ -532,64 +479,17 @@ static inline void 
remove_migration_pmd(struct page_vma_mapped_walk *pvmw, static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } -static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) -{ - return swp_entry(0, 0); -} - static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { return __pmd(0); } -static inline int is_pmd_migration_entry(pmd_t pmd) -{ - return 0; -} #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) - -/** - * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry - * @pmd: The PMD to check - * - * Returns true if the PMD contains a swap entry that represents a device private - * page mapping. This is used for zone device private pages that have been - * swapped out but still need special handling during various memory management - * operations. - * - * Return: 1 if PMD contains device private entry, 0 otherwise - */ -static inline int is_pmd_device_private_entry(pmd_t pmd) -{ - swp_entry_t entry; - - if (pmd_present(pmd)) - return 0; - - entry = pmd_to_swp_entry(pmd); - return is_device_private_entry(entry); -} - -#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ - -static inline int is_pmd_device_private_entry(pmd_t pmd) -{ - return 0; -} - -#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ - static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } -static inline int is_pmd_non_present_folio_entry(pmd_t pmd) -{ - return is_pmd_migration_entry(pmd) || is_pmd_device_private_entry(pmd); -} - #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 971df8a16ba4..a218d9922234 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include "../internal.h" #include "ops-common.h" @@ -51,7 +51,7 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr if (likely(pte_present(pteval))) pfn = pte_pfn(pteval); else - pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + pfn = softleaf_to_pfn(softleaf_from_pte(pteval)); folio = damon_get_folio(pfn); if (!folio) @@ -83,7 +83,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr if (likely(pmd_present(pmdval))) pfn = pmd_pfn(pmdval); else - pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval)); + pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval)); folio = damon_get_folio(pfn); if (!folio) diff --git a/mm/filemap.c b/mm/filemap.c index f0c36df1def7..02355aa46324 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -1402,7 +1402,7 @@ repeat: * This follows the same logic as folio_wait_bit_common() so see the comments * there. 
*/ -void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) +void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; @@ -1411,7 +1411,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; - struct folio *folio = pfn_swap_entry_folio(entry); + struct folio *folio = softleaf_to_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { diff --git a/mm/hmm.c b/mm/hmm.c index e350d0cc9d41..e9735a9b6102 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -339,19 +339,19 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long npages = (end - start) >> PAGE_SHIFT; + const softleaf_t entry = softleaf_from_pmd(pmd); unsigned long addr = start; - swp_entry_t entry = pmd_to_swp_entry(pmd); unsigned int required_fault; - if (is_device_private_entry(entry) && - pfn_swap_entry_folio(entry)->pgmap->owner == + if (softleaf_is_device_private(entry) && + softleaf_to_folio(entry)->pgmap->owner == range->dev_private_owner) { unsigned long cpu_flags = HMM_PFN_VALID | hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); - unsigned long pfn = swp_offset_pfn(entry); + unsigned long pfn = softleaf_to_pfn(entry); unsigned long i; - if (is_writable_device_private_entry(entry)) + if (softleaf_is_device_private_write(entry)) cpu_flags |= HMM_PFN_WRITE; /* @@ -370,7 +370,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start, required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0); if (required_fault) { - if (is_device_private_entry(entry)) + if (softleaf_is_device_private(entry)) return hmm_vma_fault(addr, end, required_fault, walk); else return -EFAULT; @@ -412,7 +412,7 @@ again: if (pmd_none(pmd)) return hmm_vma_walk_hole(start, end, -1, walk); - if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { + if (thp_migration_supported() && pmd_is_migration_entry(pmd)) { if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(walk->mm, pmdp); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0fdb3be39e31..9aa933723355 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1299,7 +1299,7 @@ vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; spinlock_t *ptl; - swp_entry_t swp_entry; + softleaf_t entry; struct page *page; struct folio *folio; @@ -1314,8 +1314,8 @@ vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) return 0; } - swp_entry = pmd_to_swp_entry(vmf->orig_pmd); - page = pfn_swap_entry_to_page(swp_entry); + entry = softleaf_from_pmd(vmf->orig_pmd); + page = softleaf_to_page(entry); folio = page_folio(page); vmf->page = page; vmf->pte = NULL; @@ -1705,13 +1705,13 @@ static void copy_huge_non_present_pmd( struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t pmd, pgtable_t pgtable) { - swp_entry_t entry = pmd_to_swp_entry(pmd); + softleaf_t entry = softleaf_from_pmd(pmd); struct folio *src_folio; - VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd)); + VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd)); - if (is_writable_migration_entry(entry) || - is_readable_exclusive_migration_entry(entry)) 
{ + if (softleaf_is_migration_write(entry) || + softleaf_is_migration_read_exclusive(entry)) { entry = make_readable_migration_entry(swp_offset(entry)); pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) @@ -1719,12 +1719,12 @@ static void copy_huge_non_present_pmd( if (pmd_swp_uffd_wp(*src_pmd)) pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); - } else if (is_device_private_entry(entry)) { + } else if (softleaf_is_device_private(entry)) { /* * For device private entries, since there are no * read exclusive entries, writable = !readable */ - if (is_writable_device_private_entry(entry)) { + if (softleaf_is_device_private_write(entry)) { entry = make_readable_device_private_entry(swp_offset(entry)); pmd = swp_entry_to_pmd(entry); @@ -1735,7 +1735,7 @@ static void copy_huge_non_present_pmd( set_pmd_at(src_mm, addr, src_pmd, pmd); } - src_folio = pfn_swap_entry_folio(entry); + src_folio = softleaf_to_folio(entry); VM_WARN_ON(!folio_test_large(src_folio)); folio_get(src_folio); @@ -2195,7 +2195,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); + !pmd_is_migration_entry(orig_pmd)); goto out; } @@ -2293,11 +2293,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_remove_rmap_pmd(folio, page, vma); WARN_ON_ONCE(folio_mapcount(folio) < 0); VM_BUG_ON_PAGE(!PageHead(page), page); - } else if (is_pmd_non_present_folio_entry(orig_pmd)) { - swp_entry_t entry; + } else if (pmd_is_valid_softleaf(orig_pmd)) { + const softleaf_t entry = softleaf_from_pmd(orig_pmd); - entry = pmd_to_swp_entry(orig_pmd); - folio = pfn_swap_entry_folio(entry); + folio = softleaf_to_folio(entry); flush_needed = 0; if (!thp_migration_supported()) @@ -2353,7 +2352,7 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, static pmd_t move_soft_dirty_pmd(pmd_t pmd) { #ifdef CONFIG_MEM_SOFT_DIRTY - if (unlikely(is_pmd_migration_entry(pmd))) + if (unlikely(pmd_is_migration_entry(pmd))) pmd = pmd_swp_mksoft_dirty(pmd); else if (pmd_present(pmd)) pmd = pmd_mksoft_dirty(pmd); @@ -2428,12 +2427,12 @@ static void change_non_present_huge_pmd(struct mm_struct *mm, unsigned long addr, pmd_t *pmd, bool uffd_wp, bool uffd_wp_resolve) { - swp_entry_t entry = pmd_to_swp_entry(*pmd); - struct folio *folio = pfn_swap_entry_folio(entry); + softleaf_t entry = softleaf_from_pmd(*pmd); + const struct folio *folio = softleaf_to_folio(entry); pmd_t newpmd; - VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd)); - if (is_writable_migration_entry(entry)) { + VM_WARN_ON(!pmd_is_valid_softleaf(*pmd)); + if (softleaf_is_migration_write(entry)) { /* * A protection check is difficult so * just be safe and disable write @@ -2445,7 +2444,7 @@ static void change_non_present_huge_pmd(struct mm_struct *mm, newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); - } else if (is_writable_device_private_entry(entry)) { + } else if (softleaf_is_device_private_write(entry)) { entry = make_readable_device_private_entry(swp_offset(entry)); newpmd = swp_entry_to_pmd(entry); } else { @@ -2643,7 +2642,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm if (!pmd_trans_huge(src_pmdval)) { spin_unlock(src_ptl); - if (is_pmd_migration_entry(src_pmdval)) { + if (pmd_is_migration_entry(src_pmdval)) { pmd_migration_entry_wait(mm, &src_pmdval); return -EAGAIN; } @@ -2908,13 
+2907,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr; pte_t *pte; int i; - swp_entry_t entry; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd)); + VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(*pmd) && !pmd_trans_huge(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -2928,11 +2926,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, zap_deposited_table(mm, pmd); if (!vma_is_dax(vma) && vma_is_special_huge(vma)) return; - if (unlikely(is_pmd_migration_entry(old_pmd))) { - swp_entry_t entry; + if (unlikely(pmd_is_migration_entry(old_pmd))) { + const softleaf_t old_entry = softleaf_from_pmd(old_pmd); - entry = pmd_to_swp_entry(old_pmd); - folio = pfn_swap_entry_folio(entry); + folio = softleaf_to_folio(old_entry); } else if (is_huge_zero_pmd(old_pmd)) { return; } else { @@ -2962,31 +2959,34 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } + if (pmd_is_migration_entry(*pmd)) { + softleaf_t entry; - if (is_pmd_migration_entry(*pmd)) { old_pmd = *pmd; - entry = pmd_to_swp_entry(old_pmd); - page = pfn_swap_entry_to_page(entry); + entry = softleaf_from_pmd(old_pmd); + page = softleaf_to_page(entry); folio = page_folio(page); soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); - write = is_writable_migration_entry(entry); + write = softleaf_is_migration_write(entry); if (PageAnon(page)) - anon_exclusive = is_readable_exclusive_migration_entry(entry); - young = is_migration_entry_young(entry); - dirty = is_migration_entry_dirty(entry); - } else if (is_pmd_device_private_entry(*pmd)) { + anon_exclusive = softleaf_is_migration_read_exclusive(entry); + young = softleaf_is_migration_young(entry); + dirty = softleaf_is_migration_dirty(entry); + } else if (pmd_is_device_private_entry(*pmd)) { + softleaf_t entry; + old_pmd = *pmd; - entry = pmd_to_swp_entry(old_pmd); - page = pfn_swap_entry_to_page(entry); + entry = softleaf_from_pmd(old_pmd); + page = softleaf_to_page(entry); folio = page_folio(page); soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); - write = is_writable_device_private_entry(entry); + write = softleaf_is_device_private_write(entry); anon_exclusive = PageAnonExclusive(page); /* @@ -3090,7 +3090,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * Note that NUMA hinting access restrictions are not transferred to * avoid any possibility of altering permissions across VMAs. 
*/ - if (freeze || is_pmd_migration_entry(old_pmd)) { + if (freeze || pmd_is_migration_entry(old_pmd)) { pte_t entry; swp_entry_t swp_entry; @@ -3116,7 +3116,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, VM_WARN_ON(!pte_none(ptep_get(pte + i))); set_pte_at(mm, addr, pte + i, entry); } - } else if (is_pmd_device_private_entry(old_pmd)) { + } else if (pmd_is_device_private_entry(old_pmd)) { pte_t entry; swp_entry_t swp_entry; @@ -3166,7 +3166,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } pte_unmap(pte); - if (!is_pmd_migration_entry(*pmd)) + if (!pmd_is_migration_entry(*pmd)) folio_remove_rmap_pmd(folio, page, vma); if (freeze) put_page(page); @@ -3179,7 +3179,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze) { VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); - if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd)) + if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd)) __split_huge_pmd_locked(vma, pmd, address, freeze); } @@ -4749,25 +4749,25 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) unsigned long address = pvmw->address; unsigned long haddr = address & HPAGE_PMD_MASK; pmd_t pmde; - swp_entry_t entry; + softleaf_t entry; if (!(pvmw->pmd && !pvmw->pte)) return; - entry = pmd_to_swp_entry(*pvmw->pmd); + entry = softleaf_from_pmd(*pvmw->pmd); folio_get(folio); pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); - if (is_writable_migration_entry(entry)) + if (softleaf_is_migration_write(entry)) pmde = pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_mkuffd_wp(pmde); - if (!is_migration_entry_young(entry)) + if (!softleaf_is_migration_young(entry)) pmde = pmd_mkold(pmde); /* NOTE: this may contain setting soft-dirty on some archs */ - if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) + if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry)) pmde = pmd_mkdirty(pmde); if (folio_is_device_private(folio)) { @@ -4790,7 +4790,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (folio_test_anon(folio)) { rmap_t rmap_flags = RMAP_NONE; - if (!is_readable_migration_entry(entry)) + if (!softleaf_is_migration_read(entry)) rmap_flags |= RMAP_EXCLUSIVE; folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d7e71c2e2571..7e8cb181d5bd 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -941,7 +941,7 @@ static inline int check_pmd_state(pmd_t *pmd) * collapse it. Migration success or failure will eventually end * up with a present PMD mapping a folio again. 
*/ - if (is_pmd_migration_entry(pmde)) + if (pmd_is_migration_entry(pmde)) return SCAN_PMD_MAPPED; if (!pmd_present(pmde)) return SCAN_PMD_NULL; diff --git a/mm/madvise.c b/mm/madvise.c index 2d7dd7901bae..5979a4a39738 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -390,7 +390,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); + !pmd_is_migration_entry(orig_pmd)); goto huge_unlock; } diff --git a/mm/memory.c b/mm/memory.c index 76c17feff88b..9d0d527e95a8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6352,10 +6352,10 @@ retry_pud: goto fallback; if (unlikely(!pmd_present(vmf.orig_pmd))) { - if (is_pmd_device_private_entry(vmf.orig_pmd)) + if (pmd_is_device_private_entry(vmf.orig_pmd)) return do_huge_pmd_device_private(&vmf); - if (is_pmd_migration_entry(vmf.orig_pmd)) + if (pmd_is_migration_entry(vmf.orig_pmd)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7ae3f5e2dee6..01c3b98f87a6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -110,7 +110,7 @@ #include #include #include -#include +#include #include #include @@ -647,7 +647,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) struct folio *folio; struct queue_pages *qp = walk->private; - if (unlikely(is_pmd_migration_entry(*pmd))) { + if (unlikely(pmd_is_migration_entry(*pmd))) { qp->nr_failed++; return; } diff --git a/mm/migrate.c b/mm/migrate.c index 847c1ec17628..ca4ec170a89b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include @@ -353,7 +353,7 @@ static bool remove_migration_pte(struct folio *folio, rmap_t rmap_flags = RMAP_NONE; pte_t old_pte; pte_t pte; - swp_entry_t entry; + softleaf_t entry; struct page *new; unsigned long idx = 0; @@ -379,22 +379,22 @@ static bool remove_migration_pte(struct folio *folio, folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); - entry = pte_to_swp_entry(old_pte); - if (!is_migration_entry_young(entry)) + entry = softleaf_from_pte(old_pte); + if (!softleaf_is_migration_young(entry)) pte = pte_mkold(pte); - if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) + if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry)) pte = pte_mkdirty(pte); if (pte_swp_soft_dirty(old_pte)) pte = pte_mksoft_dirty(pte); else pte = pte_clear_soft_dirty(pte); - if (is_writable_migration_entry(entry)) + if (softleaf_is_migration_write(entry)) pte = pte_mkwrite(pte, vma); else if (pte_swp_uffd_wp(old_pte)) pte = pte_mkuffd_wp(pte); - if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) + if (folio_test_anon(folio) && !softleaf_is_migration_read(entry)) rmap_flags |= RMAP_EXCLUSIVE; if (unlikely(is_device_private_page(new))) { @@ -404,7 +404,7 @@ static bool remove_migration_pte(struct folio *folio, else entry = make_readable_device_private_entry( page_to_pfn(new)); - pte = swp_entry_to_pte(entry); + pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(old_pte)) @@ -543,9 +543,9 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) spinlock_t *ptl; ptl = pmd_lock(mm, pmd); - if (!is_pmd_migration_entry(*pmd)) + if (!pmd_is_migration_entry(*pmd)) goto unlock; - migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl); + migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl); return; unlock: spin_unlock(ptl); 
diff --git a/mm/migrate_device.c b/mm/migrate_device.c index ab373fd38961..592b4561507c 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include "internal.h" @@ -141,7 +141,6 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, struct folio *folio; struct migrate_vma *migrate = walk->private; spinlock_t *ptl; - swp_entry_t entry; int ret; unsigned long write = 0; @@ -165,23 +164,24 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, if (pmd_write(*pmdp)) write = MIGRATE_PFN_WRITE; } else if (!pmd_present(*pmdp)) { - entry = pmd_to_swp_entry(*pmdp); - folio = pfn_swap_entry_folio(entry); + const softleaf_t entry = softleaf_from_pmd(*pmdp); - if (!is_device_private_entry(entry) || + folio = softleaf_to_folio(entry); + + if (!softleaf_is_device_private(entry) || !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || (folio->pgmap->owner != migrate->pgmap_owner)) { spin_unlock(ptl); return migrate_vma_collect_skip(start, end, walk); } - if (is_migration_entry(entry)) { + if (softleaf_is_migration(entry)) { migration_entry_wait_on_locked(entry, ptl); spin_unlock(ptl); return -EAGAIN; } - if (is_writable_device_private_entry(entry)) + if (softleaf_is_device_private_write(entry)) write = MIGRATE_PFN_WRITE; } else { spin_unlock(ptl); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index f5f25e120f69..741884645ab0 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #undef pr_fmt #define pr_fmt(fmt) "page_table_check: " fmt @@ -179,10 +179,10 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) EXPORT_SYMBOL(__page_table_check_pud_clear); /* Whether the swap entry cached writable information */ -static inline bool swap_cached_writable(swp_entry_t entry) +static inline bool softleaf_cached_writable(softleaf_t entry) { - return is_writable_device_private_entry(entry) || - is_writable_migration_entry(entry); + return softleaf_is_device_private_write(entry) || + softleaf_is_migration_write(entry); } static void page_table_check_pte_flags(pte_t pte) @@ -190,9 +190,9 @@ static void page_table_check_pte_flags(pte_t pte) if (pte_present(pte)) { WARN_ON_ONCE(pte_uffd_wp(pte) && pte_write(pte)); } else if (pte_swp_uffd_wp(pte)) { - const swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); - WARN_ON_ONCE(swap_cached_writable(entry)); + WARN_ON_ONCE(softleaf_cached_writable(entry)); } } @@ -219,9 +219,9 @@ static inline void page_table_check_pmd_flags(pmd_t pmd) if (pmd_uffd_wp(pmd)) WARN_ON_ONCE(pmd_write(pmd)); } else if (pmd_swp_uffd_wp(pmd)) { - swp_entry_t entry = pmd_to_swp_entry(pmd); + const softleaf_t entry = softleaf_from_pmd(pmd); - WARN_ON_ONCE(swap_cached_writable(entry)); + WARN_ON_ONCE(softleaf_cached_writable(entry)); } } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index a4e23818f37f..8137d2366722 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -242,18 +242,19 @@ restart: */ pmde = pmdp_get_lockless(pvmw->pmd); - if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { + if (pmd_trans_huge(pmde) || pmd_is_migration_entry(pmde)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); pmde = *pvmw->pmd; if (!pmd_present(pmde)) { - swp_entry_t entry; + softleaf_t entry; if (!thp_migration_supported() || !(pvmw->flags & PVMW_MIGRATION)) return not_found(pvmw); - entry = pmd_to_swp_entry(pmde); - if 
(!is_migration_entry(entry) || - !check_pmd(swp_offset_pfn(entry), pvmw)) + entry = softleaf_from_pmd(pmde); + + if (!softleaf_is_migration(entry) || + !check_pmd(softleaf_to_pfn(entry), pvmw)) return not_found(pvmw); return true; } @@ -273,9 +274,9 @@ restart: * cannot return prematurely, while zap_huge_pmd() has * cleared *pmd but not decremented compound_mapcount(). */ - swp_entry_t entry = pmd_to_swp_entry(pmde); + const softleaf_t entry = softleaf_from_pmd(pmde); - if (is_device_private_entry(entry)) { + if (softleaf_is_device_private(entry)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); return true; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8a29b7237bc6..378c774795fc 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include @@ -973,10 +973,10 @@ pmd_table: goto found; } } else if ((flags & FW_MIGRATION) && - is_pmd_migration_entry(pmd)) { - swp_entry_t entry = pmd_to_swp_entry(pmd); + pmd_is_migration_entry(pmd)) { + const softleaf_t entry = softleaf_from_pmd(pmd); - page = pfn_swap_entry_to_page(entry); + page = softleaf_to_page(entry); expose_page = false; goto found; } diff --git a/mm/rmap.c b/mm/rmap.c index 1954c538a991..775710115a41 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include #include #include @@ -2341,7 +2341,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pmd_present(pmdval))) pfn = pmd_pfn(pmdval); else - pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval)); + pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval)); subpage = folio_page(folio, pfn - folio_pfn(folio)); From 15eabc898dc58c9e97eb9ddd56dc6b893e7d0d0e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:29 +0000 Subject: [PATCH 257/321] mm: introduce pmd_is_huge() and use where appropriate The leaf entry PMD case is confusing as only migration entries and device private entries are valid at PMD level, not true swap entries. We repeatedly perform checks of the form is_swap_pmd() || pmd_trans_huge() which is itself confusing - it implies that leaf entries at PMD level exist and are different from huge entries. Address this confusion by introducing pmd_is_huge() which checks for either case. Sadly, due to header dependency issues (huge_mm.h is included very early on in headers and cannot really rely on much else) we cannot use pmd_is_valid_softleaf() here. However, since these are the only valid, handled cases the function is still achieving what it intends to do. We then replace all instances of is_swap_pmd() || pmd_trans_huge() with pmd_is_huge() invocations and adjust logic accordingly to accommodate this. No functional change intended.
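To make the conversion concrete, here is the before/after shape of the check as it appears at the existing call sites (for example the split_huge_pmd() macro and zap_pmd_range()); this is a restatement of hunks in the diff below, not an extra change:

	/* Before: two predicates, implying PMD-level swap entries are distinct. */
	if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))
		__split_huge_pmd(vma, pmd, addr, false);

	/* After: one predicate covering present huge PMDs and softleaf PMDs. */
	if (pmd_is_huge(*pmd))
		__split_huge_pmd(vma, pmd, addr, false);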
Link: https://lkml.kernel.org/r/00f79db3b15293cac8f7040a48d69c52d00117e4.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 39 +++++++++++++++++++++++++++++++++++---- include/linux/swapops.h | 6 ++++++ mm/huge_memory.c | 3 ++- mm/memory.c | 4 ++-- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- 6 files changed, 47 insertions(+), 9 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 19d4a5f52ca2..5ab240d61dcc 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -419,10 +419,36 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); +/** + * pmd_is_huge() - Is this PMD either a huge PMD entry or a software leaf entry? + * @pmd: The PMD to check. + * + * A huge PMD entry is a non-empty entry which is present and marked huge or a + * software leaf entry. This check be performed without the appropriate locks + * held, in which case the condition should be rechecked after they are + * acquired. + * + * Returns: true if this PMD is huge, false otherwise. + */ +static inline bool pmd_is_huge(pmd_t pmd) +{ + if (pmd_present(pmd)) { + return pmd_trans_huge(pmd); + } else if (!pmd_none(pmd)) { + /* + * Non-present PMDs must be valid huge non-present entries. We + * cannot assert that here due to header dependency issues. 
+ */ + return true; + } + + return false; +} + #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ - if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)) \ + if (pmd_is_huge(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false); \ } while (0) @@ -469,10 +495,10 @@ static inline int is_swap_pmd(pmd_t pmd) static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) + if (pmd_is_huge(*pmd)) return __pmd_trans_huge_lock(pmd, vma); - else - return NULL; + + return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) @@ -743,6 +769,11 @@ static inline struct folio *get_persistent_huge_zero_folio(void) { return NULL; } + +static inline bool pmd_is_huge(pmd_t pmd) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/include/linux/swapops.h b/include/linux/swapops.h index f1277647262d..41cfc6d59054 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -471,6 +471,12 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) } #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ +static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page *page) +{ + BUILD_BUG(); +} + static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9aa933723355..71dc6e41f0c8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2735,8 +2735,9 @@ unlock_ptls: spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; + ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))) + if (likely(pmd_is_huge(*pmd))) return ptl; spin_unlock(ptl); return NULL; diff --git a/mm/memory.c b/mm/memory.c index 9d0d527e95a8..95dac6a1cbc4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1374,7 +1374,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) { + if (pmd_is_huge(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); @@ -1917,7 +1917,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) { + if (pmd_is_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { diff --git a/mm/mprotect.c b/mm/mprotect.c index aa555dfbdfc5..f910cbf41442 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -474,7 +474,7 @@ again: goto next; _pmd = pmdp_get_lockless(pmd); - if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd)) { + if (pmd_is_huge(_pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || pgtable_split_needed(vma, cp_flags)) { __split_huge_pmd(vma, pmd, addr, false); diff --git a/mm/mremap.c b/mm/mremap.c index 62b6827abacf..fdb0485ede74 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -850,7 +850,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc) if (!new_pmd) break; again: - if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { + if (pmd_is_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE && move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd)) continue; From c0a80c2ce68d3a04daa52497fbf524ffb3a376e0 Mon Sep 17 
00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:30 +0000 Subject: [PATCH 258/321] mm: remove remaining is_swap_pmd() users and is_swap_pmd() Update copy_huge_pmd() and change_huge_pmd() to use pmd_is_valid_softleaf() - as this checks for the only valid non-present huge PMD states. Also update mm/debug_vm_pgtable.c to explicitly test for a valid leaf PMD entry (which it was not before, which was incorrect), and have it test against pmd_is_huge() and pmd_is_valid_softleaf() rather than is_swap_pmd(). With these changes done there are no further users of is_swap_pmd(), so remove it. Link: https://lkml.kernel.org/r/1628b00b00c8498bbd2c20b82117ee87845fb738.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 9 --------- mm/debug_vm_pgtable.c | 25 +++++++++++++++---------- mm/huge_memory.c | 5 +++-- 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 5ab240d61dcc..525624c285a6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -486,11 +486,6 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); -static inline int is_swap_pmd(pmd_t pmd) -{ - return !pmd_none(pmd) && !pmd_present(pmd); -} - /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) @@ -692,10 +687,6 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, struct vm_area_struct *next) { } -static inline int is_swap_pmd(pmd_t pmd) -{ - return 0; -} static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index fff311830959..608d1011ce03 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -74,6 +74,7 @@ struct pgtable_debug_args { unsigned long fixed_pte_pfn; swp_entry_t swp_entry; + swp_entry_t leaf_entry; }; static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) @@ -745,7 +746,7 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd))); } -static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) +static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -757,15 +758,16 @@ static void __init 
pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PMD swap soft dirty\n"); - pmd = swp_entry_to_pmd(args->swp_entry); - WARN_ON(!is_swap_pmd(pmd)); + pmd = swp_entry_to_pmd(args->leaf_entry); + WARN_ON(!pmd_is_huge(pmd)); + WARN_ON(!pmd_is_valid_softleaf(pmd)); WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd))); WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd))); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { } -static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { } +static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) @@ -818,7 +820,7 @@ static void __init pte_swap_tests(struct pgtable_debug_args *args) } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION -static void __init pmd_swap_tests(struct pgtable_debug_args *args) +static void __init pmd_softleaf_tests(struct pgtable_debug_args *args) { swp_entry_t arch_entry; pmd_t pmd1, pmd2; @@ -827,15 +829,16 @@ static void __init pmd_swap_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PMD swap\n"); - pmd1 = swp_entry_to_pmd(args->swp_entry); - WARN_ON(!is_swap_pmd(pmd1)); + pmd1 = swp_entry_to_pmd(args->leaf_entry); + WARN_ON(!pmd_is_huge(pmd1)); + WARN_ON(!pmd_is_valid_softleaf(pmd1)); arch_entry = __pmd_to_swp_entry(pmd1); pmd2 = __swp_entry_to_pmd(arch_entry); WARN_ON(memcmp(&pmd1, &pmd2, sizeof(pmd1))); } #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static void __init pmd_swap_tests(struct pgtable_debug_args *args) { } +static void __init pmd_softleaf_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static void __init swap_migration_tests(struct pgtable_debug_args *args) @@ -1229,6 +1232,8 @@ static int __init init_args(struct pgtable_debug_args *args) max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); /* Create a swp entry with all possible bits set while still being swap. */ args->swp_entry = swp_entry(MAX_SWAPFILES - 1, max_swap_offset); + /* Create a non-present migration entry. 
*/ + args->leaf_entry = make_writable_migration_entry(~0UL); /* * Allocate (huge) pages because some of the tests need to access @@ -1318,12 +1323,12 @@ static int __init debug_vm_pgtable(void) pte_soft_dirty_tests(&args); pmd_soft_dirty_tests(&args); pte_swap_soft_dirty_tests(&args); - pmd_swap_soft_dirty_tests(&args); + pmd_leaf_soft_dirty_tests(&args); pte_swap_exclusive_tests(&args); pte_swap_tests(&args); - pmd_swap_tests(&args); + pmd_softleaf_tests(&args); swap_migration_tests(&args); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 71dc6e41f0c8..e38b0d5e3102 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1800,7 +1800,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; - if (unlikely(thp_migration_supported() && is_swap_pmd(pmd))) { + if (unlikely(thp_migration_supported() && + pmd_is_valid_softleaf(pmd))) { copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma, pmd, pgtable); ret = 0; @@ -2487,7 +2488,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; - if (thp_migration_supported() && is_swap_pmd(*pmd)) { + if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) { change_non_present_huge_pmd(mm, addr, pmd, uffd_wp, uffd_wp_resolve); goto unlock; From 9ff30bb9ab40b34908eefd661f12f99aa00d04c3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:31 +0000 Subject: [PATCH 259/321] mm: remove non_swap_entry() and use softleaf helpers instead There is simply no need for the hugely confusing concept of 'non-swap' swap entries now we have the concept of softleaf entries and relevant softleaf_xxx() helpers. Adjust all callers to use these instead and remove non_swap_entry() altogether. No functional change intended. 
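To illustrate the conversion pattern applied at each call site (a minimal sketch only; handle_swap() and handle_migration() are hypothetical placeholders standing in for whatever the individual caller actually does):

    /* Before: the "non-swap" double negative at each call site. */
    swp_entry_t entry = pte_to_swp_entry(pte);

    if (!non_swap_entry(entry))
        handle_swap(entry);
    else if (is_migration_entry(entry))
        handle_migration(entry);

    /* After: positive softleaf predicates, no "non-swap" concept. */
    softleaf_t entry = softleaf_from_pte(pte);

    if (softleaf_is_swap(entry))
        handle_swap(entry);
    else if (softleaf_is_migration(entry))
        handle_migration(entry);
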
Link: https://lkml.kernel.org/r/2562093f37f4a9cffea0447058014485eb50aaaf.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/s390/mm/gmap_helpers.c | 20 ++++++++++---------- arch/s390/mm/pgtable.c | 12 ++++++------ fs/proc/task_mmu.c | 12 ++++++------ include/linux/swapops.h | 5 ----- mm/filemap.c | 2 +- mm/hmm.c | 16 ++++++++-------- mm/madvise.c | 2 +- mm/memory.c | 36 ++++++++++++++++++------------------ mm/mincore.c | 2 +- mm/userfaultfd.c | 24 ++++++++++++------------ 10 files changed, 63 insertions(+), 68 deletions(-) diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index d4c3c36855e2..549f14ad08af 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -11,27 +11,27 @@ #include #include #include -#include +#include #include #include #include #include /** - * ptep_zap_swap_entry() - discard a swap entry. + * ptep_zap_softleaf_entry() - discard a software leaf entry. * @mm: the mm - * @entry: the swap entry that needs to be zapped + * @entry: the software leaf entry that needs to be zapped * - * Discards the given swap entry. If the swap entry was an actual swap - * entry (and not a migration entry, for example), the actual swapped + * Discards the given software leaf entry. If the leaf entry was an actual + * swap entry (and not a migration entry, for example), the actual swapped * page is also discarded from swap. 
*/ -static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) { - if (!non_swap_entry(entry)) + if (softleaf_is_swap(entry)) dec_mm_counter(mm, MM_SWAPENTS); - else if (is_migration_entry(entry)) - dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry))); + else if (softleaf_is_migration(entry)) + dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); free_swap_and_cache(entry); } @@ -66,7 +66,7 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) preempt_disable(); pgste = pgste_get_lock(ptep); - ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep)); + ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); pte_clear(mm, vmaddr, ptep); pgste_set_unlock(ptep, pgste); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 0fde20bbc50b..d670bfb47d9b 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include @@ -683,12 +683,12 @@ void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) pgste_set_unlock(ptep, pgste); } -static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) { - if (!non_swap_entry(entry)) + if (softleaf_is_swap(entry)) dec_mm_counter(mm, MM_SWAPENTS); - else if (is_migration_entry(entry)) { - struct folio *folio = pfn_swap_entry_folio(entry); + else if (softleaf_is_migration(entry)) { + struct folio *folio = softleaf_to_folio(entry); dec_mm_counter(mm, mm_counter(folio)); } @@ -710,7 +710,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, if (!reset && pte_swap(pte) && ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || (pgstev & _PGSTE_GPS_ZERO))) { - ptep_zap_swap_entry(mm, pte_to_swp_entry(pte)); + ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte)); pte_clear(mm, addr, ptep); } if (reset) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 898df952b6bc..1f49c81b3591 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1020,13 +1020,13 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else if (pte_none(ptent)) { smaps_pte_hole_lookup(addr, walk); } else { - swp_entry_t swpent = pte_to_swp_entry(ptent); + const softleaf_t entry = softleaf_from_pte(ptent); - if (!non_swap_entry(swpent)) { + if (softleaf_is_swap(entry)) { int mapcount; mss->swap += PAGE_SIZE; - mapcount = swp_swapcount(swpent); + mapcount = swp_swapcount(entry); if (mapcount >= 2) { u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; @@ -1035,10 +1035,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_pfn_swap_entry(swpent)) { - if (is_device_private_entry(swpent)) + } else if (softleaf_has_pfn(entry)) { + if (softleaf_is_device_private(entry)) present = true; - page = pfn_swap_entry_to_page(swpent); + page = softleaf_to_page(entry); } } diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 41cfc6d59054..c8e6f927da48 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -492,10 +492,5 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static inline int non_swap_entry(swp_entry_t entry) -{ - return swp_type(entry) >= MAX_SWAPFILES; -} - #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ diff --git a/mm/filemap.c b/mm/filemap.c index 
02355aa46324..07634b7d9934 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4567,7 +4567,7 @@ static void filemap_cachestat(struct address_space *mapping, swp_entry_t swp = radix_to_swp_entry(folio); /* swapin error results in poisoned entry */ - if (non_swap_entry(swp)) + if (!softleaf_is_swap(swp)) goto resched; /* diff --git a/mm/hmm.c b/mm/hmm.c index e9735a9b6102..0158f2d1e027 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -258,17 +258,17 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, } if (!pte_present(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); /* * Don't fault in device private pages owned by the caller, * just report the PFN. */ - if (is_device_private_entry(entry) && - page_pgmap(pfn_swap_entry_to_page(entry))->owner == + if (softleaf_is_device_private(entry) && + page_pgmap(softleaf_to_page(entry))->owner == range->dev_private_owner) { cpu_flags = HMM_PFN_VALID; - if (is_writable_device_private_entry(entry)) + if (softleaf_is_device_private_write(entry)) cpu_flags |= HMM_PFN_WRITE; new_pfn_flags = swp_offset_pfn(entry) | cpu_flags; goto out; @@ -279,16 +279,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (!required_fault) goto out; - if (!non_swap_entry(entry)) + if (softleaf_is_swap(entry)) goto fault; - if (is_device_private_entry(entry)) + if (softleaf_is_device_private(entry)) goto fault; - if (is_device_exclusive_entry(entry)) + if (softleaf_is_device_exclusive(entry)) goto fault; - if (is_migration_entry(entry)) { + if (softleaf_is_migration(entry)) { pte_unmap(ptep); hmm_vma_walk->last = addr; migration_entry_wait(walk->mm, pmdp, addr); diff --git a/mm/madvise.c b/mm/madvise.c index 5979a4a39738..d8bc51e1bea7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -249,7 +249,7 @@ static void shmem_swapin_range(struct vm_area_struct *vma, continue; entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. 
*/ - if (non_swap_entry(entry)) + if (!softleaf_is_swap(entry)) continue; addr = vma->vm_start + diff --git a/mm/memory.c b/mm/memory.c index 95dac6a1cbc4..a3f001a47ecf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -932,7 +932,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct folio *folio; struct page *page; - if (likely(!non_swap_entry(entry))) { + if (likely(softleaf_is_swap(entry))) { if (swap_duplicate(entry) < 0) return -EIO; @@ -950,12 +950,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); + } else if (softleaf_is_migration(entry)) { + folio = softleaf_to_folio(entry); rss[mm_counter(folio)]++; - if (!is_readable_migration_entry(entry) && + if (!softleaf_is_migration_read(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent and child @@ -964,15 +964,15 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ entry = make_readable_migration_entry( swp_offset(entry)); - pte = swp_entry_to_pte(entry); + pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } - } else if (is_device_private_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + } else if (softleaf_is_device_private(entry)) { + page = softleaf_to_page(entry); folio = page_folio(page); /* @@ -996,7 +996,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * when a device driver is involved (you cannot easily * save and restore device driver state). */ - if (is_writable_device_private_entry(entry) && + if (softleaf_is_device_private_write(entry) && is_cow_mapping(vm_flags)) { entry = make_readable_device_private_entry( swp_offset(entry)); @@ -1005,7 +1005,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } - } else if (is_device_exclusive_entry(entry)) { + } else if (softleaf_is_device_exclusive(entry)) { /* * Make device exclusive entries present by restoring the * original entry then copying as for a present pte. 
Device @@ -4625,7 +4625,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) rmap_t rmap_flags = RMAP_NONE; bool need_clear_cache = false; bool exclusive = false; - swp_entry_t entry; + softleaf_t entry; pte_t pte; vm_fault_t ret = 0; void *shadow = NULL; @@ -4637,15 +4637,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!pte_unmap_same(vmf)) goto out; - entry = pte_to_swp_entry(vmf->orig_pte); - if (unlikely(non_swap_entry(entry))) { - if (is_migration_entry(entry)) { + entry = softleaf_from_pte(vmf->orig_pte); + if (unlikely(!softleaf_is_swap(entry))) { + if (softleaf_is_migration(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); - } else if (is_device_exclusive_entry(entry)) { - vmf->page = pfn_swap_entry_to_page(entry); + } else if (softleaf_is_device_exclusive(entry)) { + vmf->page = softleaf_to_page(entry); ret = remove_device_exclusive_entry(vmf); - } else if (is_device_private_entry(entry)) { + } else if (softleaf_is_device_private(entry)) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* * migrate_to_ram is not yet ready to operate @@ -4656,7 +4656,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out; } - vmf->page = pfn_swap_entry_to_page(entry); + vmf->page = softleaf_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || @@ -4680,7 +4680,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } else { pte_unmap_unlock(vmf->pte, vmf->ptl); } - } else if (is_hwpoison_entry(entry)) { + } else if (softleaf_is_hwpoison(entry)) { ret = VM_FAULT_HWPOISON; } else if (softleaf_is_marker(entry)) { ret = handle_pte_marker(vmf); diff --git a/mm/mincore.c b/mm/mincore.c index b3682488a65d..9a908d8bb706 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -74,7 +74,7 @@ static unsigned char mincore_swap(swp_entry_t entry, bool shmem) * absent. Page table may contain migration or hwpoison * entries which are always uptodate. 
*/ - if (non_swap_entry(entry)) + if (!softleaf_is_swap(entry)) return !shmem; /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 055ec1050776..bd1f74a7a5ac 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1256,7 +1256,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd unsigned long dst_addr, unsigned long src_addr, unsigned long len, __u64 mode) { - swp_entry_t entry; struct swap_info_struct *si = NULL; pte_t orig_src_pte, orig_dst_pte; pte_t src_folio_pte; @@ -1430,19 +1429,20 @@ retry: orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, &src_folio, len); - } else { + } else { /* !pte_present() */ struct folio *folio = NULL; + const softleaf_t entry = softleaf_from_pte(orig_src_pte); - entry = pte_to_swp_entry(orig_src_pte); - if (non_swap_entry(entry)) { - if (is_migration_entry(entry)) { - pte_unmap(src_pte); - pte_unmap(dst_pte); - src_pte = dst_pte = NULL; - migration_entry_wait(mm, src_pmd, src_addr); - ret = -EAGAIN; - } else - ret = -EFAULT; + if (softleaf_is_migration(entry)) { + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + migration_entry_wait(mm, src_pmd, src_addr); + + ret = -EAGAIN; + goto out; + } else if (!softleaf_is_swap(entry)) { + ret = -EFAULT; goto out; } From 03bfbc3ad6e496fb576ca9ace08211943232fdf9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:32 +0000 Subject: [PATCH 260/321] mm: remove is_hugetlb_entry_[migration, hwpoisoned]() We do not need to have explicit helper functions for these, it adds a level of confusion and indirection when we can simply use software leaf entry logic here instead and spell out the special huge_pte_none() case we must consider. No functional change intended. Link: https://lkml.kernel.org/r/0e92d6924d3de88cd014ce1c53e20edc08fc152e.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 27 ++++++------ include/linux/hugetlb.h | 2 - mm/hugetlb.c | 91 +++++++++++++++++------------------------ mm/mempolicy.c | 17 +++++--- mm/migrate.c | 15 +++++-- 5 files changed, 73 insertions(+), 79 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1f49c81b3591..92ada14eabc0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2500,22 +2500,23 @@ static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t ptent) { - unsigned long psize; + const unsigned long psize = huge_page_size(hstate_vma(vma)); + softleaf_t entry; - if (is_hugetlb_entry_hwpoisoned(ptent) || 
pte_is_marker(ptent)) - return; - - psize = huge_page_size(hstate_vma(vma)); - - if (is_hugetlb_entry_migration(ptent)) - set_huge_pte_at(vma->vm_mm, addr, ptep, - pte_swp_mkuffd_wp(ptent), psize); - else if (!huge_pte_none(ptent)) - huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, - huge_pte_mkuffd_wp(ptent)); - else + if (huge_pte_none(ptent)) set_huge_pte_at(vma->vm_mm, addr, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), psize); + + entry = softleaf_from_pte(ptent); + if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry)) + return; + + if (softleaf_is_migration(entry)) + set_huge_pte_at(vma->vm_mm, addr, ptep, + pte_swp_mkuffd_wp(ptent), psize); + else + huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, + huge_pte_mkuffd_wp(ptent)); } #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2387513d6ae5..457d48ac7bcd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -274,8 +274,6 @@ void hugetlb_vma_lock_release(struct kref *kref); long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); -bool is_hugetlb_entry_migration(pte_t pte); -bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 59d91c36770c..311c5d601310 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4846,32 +4846,6 @@ static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, set_huge_ptep_writable(vma, address, ptep); } -bool is_hugetlb_entry_migration(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return false; - swp = pte_to_swp_entry(pte); - if (is_migration_entry(swp)) - return true; - else - return false; -} - -bool is_hugetlb_entry_hwpoisoned(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return false; - swp = pte_to_swp_entry(pte); - if (is_hwpoison_entry(swp)) - return true; - else - return false; -} - static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) @@ -4900,6 +4874,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, unsigned long npages = pages_per_huge_page(h); struct mmu_notifier_range range; unsigned long last_addr_mask; + softleaf_t softleaf; int ret = 0; if (cow) { @@ -4947,16 +4922,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); again: if (huge_pte_none(entry)) { - /* - * Skip if src entry none. - */ - ; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { + /* Skip if src entry none. 
*/ + goto next; + } + + softleaf = softleaf_from_pte(entry); + if (unlikely(softleaf_is_hwpoison(softleaf))) { if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); - } else if (unlikely(is_hugetlb_entry_migration(entry))) { - softleaf_t softleaf = softleaf_from_pte(entry); + } else if (unlikely(softleaf_is_migration(softleaf))) { bool uffd_wp = pte_swp_uffd_wp(entry); if (!is_readable_migration_entry(softleaf) && cow) { @@ -4975,7 +4950,6 @@ again: entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(pte_is_marker(entry))) { - const softleaf_t softleaf = softleaf_from_pte(entry); const pte_marker marker = copy_pte_marker(softleaf, dst_vma); if (marker) @@ -5033,9 +5007,7 @@ again: } hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio, src_pte_old, sz); - spin_unlock(src_ptl); - spin_unlock(dst_ptl); - continue; + goto next; } if (cow) { @@ -5056,6 +5028,8 @@ again: set_huge_pte_at(dst, addr, dst_pte, entry, sz); hugetlb_count_add(npages, dst); } + +next: spin_unlock(src_ptl); spin_unlock(dst_ptl); } @@ -6064,8 +6038,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; /* Not present, either a migration or a hwpoisoned entry */ - if (!pte_present(vmf.orig_pte)) { - if (is_hugetlb_entry_migration(vmf.orig_pte)) { + if (!pte_present(vmf.orig_pte) && !huge_pte_none(vmf.orig_pte)) { + const softleaf_t softleaf = softleaf_from_pte(vmf.orig_pte); + + if (softleaf_is_migration(softleaf)) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the @@ -6076,9 +6052,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_unlock(&hugetlb_fault_mutex_table[hash]); migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; - } else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte)) + } + if (softleaf_is_hwpoison(softleaf)) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } + goto out_mutex; } @@ -6460,7 +6439,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, i_mmap_lock_write(vma->vm_file->f_mapping); last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { + softleaf_t entry; spinlock_t *ptl; + ptep = hugetlb_walk(vma, address, psize); if (!ptep) { if (!uffd_wp) { @@ -6492,15 +6473,23 @@ long hugetlb_change_protection(struct vm_area_struct *vma, continue; } pte = huge_ptep_get(mm, address, ptep); - if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { - /* Nothing to do. */ - } else if (unlikely(is_hugetlb_entry_migration(pte))) { - softleaf_t entry = softleaf_from_pte(pte); + if (huge_pte_none(pte)) { + if (unlikely(uffd_wp)) + /* Safe to modify directly (none->non-present). */ + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP), + psize); + goto next; + } + entry = softleaf_from_pte(pte); + if (unlikely(softleaf_is_hwpoison(entry))) { + /* Nothing to do. */ + } else if (unlikely(softleaf_is_migration(entry))) { struct folio *folio = softleaf_to_folio(entry); pte_t newpte = pte; - if (is_writable_migration_entry(entry)) { + if (softleaf_is_migration_write(entry)) { if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); @@ -6527,7 +6516,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). 
*/ huge_pte_clear(mm, address, ptep, psize); - } else if (!huge_pte_none(pte)) { + } else { pte_t old_pte; unsigned int shift = huge_page_shift(hstate_vma(vma)); @@ -6540,16 +6529,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; - } else { - /* None pte */ - if (unlikely(uffd_wp)) - /* Safe to modify directly (none->non-present). */ - set_huge_pte_at(mm, address, ptep, - make_pte_marker(PTE_MARKER_UFFD_WP), - psize); } - spin_unlock(ptl); +next: + spin_unlock(ptl); cond_resched(); } /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01c3b98f87a6..dee95d5ecfd4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -768,16 +768,21 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; - pte_t entry; + pte_t ptep; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); - entry = huge_ptep_get(walk->mm, addr, pte); - if (!pte_present(entry)) { - if (unlikely(is_hugetlb_entry_migration(entry))) - qp->nr_failed++; + ptep = huge_ptep_get(walk->mm, addr, pte); + if (!pte_present(ptep)) { + if (!huge_pte_none(ptep)) { + const softleaf_t entry = softleaf_from_pte(ptep); + + if (unlikely(softleaf_is_migration(entry))) + qp->nr_failed++; + } + goto unlock; } - folio = pfn_folio(pte_pfn(entry)); + folio = pfn_folio(pte_pfn(ptep)); if (!queue_folio_required(folio, qp)) goto unlock; if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || diff --git a/mm/migrate.c b/mm/migrate.c index ca4ec170a89b..5edfd0b2f63d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -515,16 +515,18 @@ out: void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep); + softleaf_t entry; pte_t pte; hugetlb_vma_assert_locked(vma); spin_lock(ptl); pte = huge_ptep_get(vma->vm_mm, addr, ptep); - if (unlikely(!is_hugetlb_entry_migration(pte))) { - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - } else { + if (huge_pte_none(pte)) + goto fail; + + entry = softleaf_from_pte(pte); + if (softleaf_is_migration(entry)) { /* * If migration entry existed, safe to release vma lock * here because the pgtable page won't be freed without the @@ -533,7 +535,12 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p */ hugetlb_vma_unlock_read(vma); migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl); + return; } + +fail: + spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); } #endif From 93976a20345b4aff1ac7598ec1223d65ca33d49c Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:33 +0000 Subject: [PATCH 261/321] mm: eliminate further swapops predicates Having converted so much of the code base to software leaf entries, we can mop up some remaining cases. We replace is_pfn_swap_entry(), pfn_swap_entry_to_page(), is_writable_device_private_entry(), is_device_exclusive_entry(), is_migration_entry(), is_writable_migration_entry(), is_readable_migration_entry(), swp_offset_pfn() and pfn_swap_entry_folio() with softleaf equivalents. No functional change intended. 
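For reference, the old-to-new helper mapping this patch applies (taken directly from the hunks below):

    is_pfn_swap_entry(entry)                 ->  softleaf_has_pfn(entry)
    swp_offset_pfn(entry)                    ->  softleaf_to_pfn(entry)
    pfn_swap_entry_to_page(entry)            ->  softleaf_to_page(entry)
    pfn_swap_entry_folio(entry)              ->  softleaf_to_folio(entry)
    is_migration_entry(entry)                ->  softleaf_is_migration(entry)
    is_writable_migration_entry(entry)       ->  softleaf_is_migration_write(entry)
    is_readable_migration_entry(entry)       ->  softleaf_is_migration_read(entry)
    is_device_private_entry(entry)           ->  softleaf_is_device_private(entry)
    is_writable_device_private_entry(entry)  ->  softleaf_is_device_private_write(entry)
    is_device_exclusive_entry(entry)         ->  softleaf_is_device_exclusive(entry)
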
Link: https://lkml.kernel.org/r/956bc9c031604811c0070d2f4bf2f1373f230213.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 14 ++--- include/linux/leafops.h | 27 ++++++--- include/linux/swapops.h | 121 +--------------------------------------- mm/debug_vm_pgtable.c | 20 +++---- mm/hmm.c | 2 +- mm/hugetlb.c | 2 +- mm/ksm.c | 6 +- mm/memory-failure.c | 6 +- mm/memory.c | 3 +- mm/mempolicy.c | 4 +- mm/migrate.c | 6 +- mm/migrate_device.c | 10 ++-- mm/mprotect.c | 8 +-- mm/page_vma_mapped.c | 8 +-- mm/pagewalk.c | 7 +-- mm/rmap.c | 9 ++- 16 files changed, 76 insertions(+), 177 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 92ada14eabc0..41b062ce6ad8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1941,13 +1941,13 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else { - swp_entry_t entry; + softleaf_t entry; if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; - entry = pte_to_swp_entry(pte); + entry = softleaf_from_pte(pte); if (pm->show_pfn) { pgoff_t offset; @@ -1955,16 +1955,16 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, * For PFN swap offsets, keeping the offset field * to be PFN only to be compatible with old smaps. */ - if (is_pfn_swap_entry(entry)) - offset = swp_offset_pfn(entry); + if (softleaf_has_pfn(entry)) + offset = softleaf_to_pfn(entry); else offset = swp_offset(entry); frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; - if (is_pfn_swap_entry(entry)) - page = pfn_swap_entry_to_page(entry); + if (softleaf_has_pfn(entry)) + page = softleaf_to_page(entry); if (softleaf_is_uffd_wp_marker(entry)) flags |= PM_UFFD_WP; if (softleaf_is_guard_marker(entry)) @@ -2033,7 +2033,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd)); - page = pfn_swap_entry_to_page(entry); + page = softleaf_to_page(entry); } if (page) { diff --git a/include/linux/leafops.h b/include/linux/leafops.h index f5ea9b0385ff..d282fab866a1 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -355,7 +355,7 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); /* Temporary until swp_entry_t eliminated. 
*/ - return swp_offset_pfn(entry); + return swp_offset(entry) & SWP_PFN_MASK; } /** @@ -366,10 +366,16 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) */ static inline struct page *softleaf_to_page(softleaf_t entry) { - VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + struct page *page = pfn_to_page(softleaf_to_pfn(entry)); - /* Temporary until swp_entry_t eliminated. */ - return pfn_swap_entry_to_page(entry); + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page)); + + return page; } /** @@ -380,10 +386,17 @@ static inline struct page *softleaf_to_page(softleaf_t entry) */ static inline struct folio *softleaf_to_folio(softleaf_t entry) { - VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); - /* Temporary until swp_entry_t eliminated. */ - return pfn_swap_entry_folio(entry); + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding folio is locked. + */ + VM_WARN_ON_ONCE(softleaf_is_migration(entry) && + !folio_test_locked(folio)); + + return folio; } /** diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c8e6f927da48..3d02b288c15e 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -28,7 +28,7 @@ #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* - * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To + * Definitions only for PFN swap entries (see leafeant_has_pfn()). To * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries * can use the extra bits to store other information besides PFN. */ @@ -66,8 +66,6 @@ #define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) #define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) -static inline bool is_pfn_swap_entry(swp_entry_t entry); - /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { @@ -109,17 +107,6 @@ static inline pgoff_t swp_offset(swp_entry_t entry) return entry.val & SWP_OFFSET_MASK; } -/* - * This should only be called upon a pfn swap entry to get the PFN stored - * in the swap entry. Please refers to is_pfn_swap_entry() for definition - * of pfn swap entry. - */ -static inline unsigned long swp_offset_pfn(swp_entry_t entry) -{ - VM_BUG_ON(!is_pfn_swap_entry(entry)); - return swp_offset(entry) & SWP_PFN_MASK; -} - /* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. 
@@ -169,27 +156,11 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) return swp_entry(SWP_DEVICE_WRITE, offset); } -static inline bool is_device_private_entry(swp_entry_t entry) -{ - int type = swp_type(entry); - return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; -} - -static inline bool is_writable_device_private_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); -} - static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } -static inline bool is_device_exclusive_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; -} - #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -201,50 +172,14 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) return swp_entry(0, 0); } -static inline bool is_device_private_entry(swp_entry_t entry) -{ - return false; -} - -static inline bool is_writable_device_private_entry(swp_entry_t entry) -{ - return false; -} - static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } -static inline bool is_device_exclusive_entry(swp_entry_t entry) -{ - return false; -} - #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION -static inline int is_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ || - swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE || - swp_type(entry) == SWP_MIGRATION_WRITE); -} - -static inline int is_writable_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); -} - -static inline int is_readable_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ); -} - -static inline int is_readable_exclusive_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE); -} static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -310,23 +245,10 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) return swp_entry(0, 0); } -static inline int is_migration_entry(swp_entry_t swp) -{ - return 0; -} - static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } -static inline int is_writable_migration_entry(swp_entry_t entry) -{ - return 0; -} -static inline int is_readable_migration_entry(swp_entry_t entry) -{ - return 0; -} static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { @@ -410,47 +332,6 @@ static inline swp_entry_t make_guard_swp_entry(void) return make_pte_marker_entry(PTE_MARKER_GUARD); } -static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) -{ - struct page *p = pfn_to_page(swp_offset_pfn(entry)); - - /* - * Any use of migration entries may only occur while the - * corresponding page is locked - */ - BUG_ON(is_migration_entry(entry) && !PageLocked(p)); - - return p; -} - -static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) -{ - struct folio *folio = pfn_folio(swp_offset_pfn(entry)); - - /* - * Any use of migration entries may only occur while the - * corresponding folio is locked - */ - BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); - - return folio; -} - -/* - * A pfn swap entry is a special type of swap entry that always has a pfn stored - * 
in the swap offset. They can either be used to represent unaddressable device - * memory, to restrict access to a page undergoing migration or to represent a - * pfn which has been hwpoisoned and unmapped. - */ -static inline bool is_pfn_swap_entry(swp_entry_t entry) -{ - /* Make sure the swp offset can always store the needed fields */ - BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); - - return is_migration_entry(entry) || is_device_private_entry(entry) || - is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); -} - struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 608d1011ce03..64db85a80558 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -844,7 +844,7 @@ static void __init pmd_softleaf_tests(struct pgtable_debug_args *args) { } static void __init swap_migration_tests(struct pgtable_debug_args *args) { struct page *page; - swp_entry_t swp; + softleaf_t entry; if (!IS_ENABLED(CONFIG_MIGRATION)) return; @@ -867,17 +867,17 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) * be locked, otherwise it stumbles upon a BUG_ON(). */ __SetPageLocked(page); - swp = make_writable_migration_entry(page_to_pfn(page)); - WARN_ON(!is_migration_entry(swp)); - WARN_ON(!is_writable_migration_entry(swp)); + entry = make_writable_migration_entry(page_to_pfn(page)); + WARN_ON(!softleaf_is_migration(entry)); + WARN_ON(!softleaf_is_migration_write(entry)); - swp = make_readable_migration_entry(swp_offset(swp)); - WARN_ON(!is_migration_entry(swp)); - WARN_ON(is_writable_migration_entry(swp)); + entry = make_readable_migration_entry(swp_offset(entry)); + WARN_ON(!softleaf_is_migration(entry)); + WARN_ON(softleaf_is_migration_write(entry)); - swp = make_readable_migration_entry(page_to_pfn(page)); - WARN_ON(!is_migration_entry(swp)); - WARN_ON(is_writable_migration_entry(swp)); + entry = make_readable_migration_entry(page_to_pfn(page)); + WARN_ON(!softleaf_is_migration(entry)); + WARN_ON(softleaf_is_migration_write(entry)); __ClearPageLocked(page); } diff --git a/mm/hmm.c b/mm/hmm.c index 0158f2d1e027..3912d92a2b9a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -270,7 +270,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, cpu_flags = HMM_PFN_VALID; if (softleaf_is_device_private_write(entry)) cpu_flags |= HMM_PFN_WRITE; - new_pfn_flags = swp_offset_pfn(entry) | cpu_flags; + new_pfn_flags = softleaf_to_pfn(entry) | cpu_flags; goto out; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 311c5d601310..9e7815b4f058 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4934,7 +4934,7 @@ again: } else if (unlikely(softleaf_is_migration(softleaf))) { bool uffd_wp = pte_swp_uffd_wp(entry); - if (!is_readable_migration_entry(softleaf) && cow) { + if (!softleaf_is_migration_read(softleaf) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. diff --git a/mm/ksm.c b/mm/ksm.c index f9a1a3658ead..cfc182255c7b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -632,14 +632,14 @@ static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long en if (pte_present(pte)) { folio = vm_normal_folio(walk->vma, addr, pte); } else if (!pte_none(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); /* * As KSM pages remain KSM pages until freed, no need to wait * here for migration to end. 
*/ - if (is_migration_entry(entry)) - folio = pfn_swap_entry_folio(entry); + if (softleaf_is_migration(entry)) + folio = softleaf_to_folio(entry); } /* return 1 if the page is an normal ksm page or KSM-placed zero page */ found = (folio && folio_test_ksm(folio)) || diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1f7fb9bf287a..71652cfedcdf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -693,10 +693,10 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, if (pte_present(pte)) { pfn = pte_pfn(pte); } else { - swp_entry_t swp = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); - if (is_hwpoison_entry(swp)) - pfn = swp_offset_pfn(swp); + if (softleaf_is_hwpoison(entry)) + pfn = softleaf_to_pfn(entry); } if (!pfn || pfn != poisoned_pfn) diff --git a/mm/memory.c b/mm/memory.c index a3f001a47ecf..525da4479228 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -902,7 +902,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, static int try_restore_exclusive_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t orig_pte) { - struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte)); + const softleaf_t entry = softleaf_from_pte(orig_pte); + struct page *page = softleaf_to_page(entry); struct folio *folio = page_folio(page); if (folio_trylock(folio)) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dee95d5ecfd4..acb9bf89f619 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -705,7 +705,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, if (pte_none(ptent)) continue; if (!pte_present(ptent)) { - if (is_migration_entry(pte_to_swp_entry(ptent))) + const softleaf_t entry = softleaf_from_pte(ptent); + + if (softleaf_is_migration(entry)) qp->nr_failed++; continue; } diff --git a/mm/migrate.c b/mm/migrate.c index 5edfd0b2f63d..c39dfea1a925 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -483,7 +483,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, spinlock_t *ptl; pte_t *ptep; pte_t pte; - swp_entry_t entry; + softleaf_t entry; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) @@ -495,8 +495,8 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, if (pte_none(pte) || pte_present(pte)) goto out; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) + entry = softleaf_from_pte(pte); + if (!softleaf_is_migration(entry)) goto out; migration_entry_wait_on_locked(entry, ptl); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 592b4561507c..b1ce6e3478d6 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -279,7 +279,7 @@ again: unsigned long mpfn = 0, pfn; struct folio *folio; struct page *page; - swp_entry_t entry; + softleaf_t entry; pte_t pte; pte = ptep_get(ptep); @@ -298,11 +298,11 @@ again: * page table entry. Other special swap entries are not * migratable, and we ignore regular swapped page. 
*/ - entry = pte_to_swp_entry(pte); - if (!is_device_private_entry(entry)) + entry = softleaf_from_pte(pte); + if (!softleaf_is_device_private(entry)) goto next; - page = pfn_swap_entry_to_page(entry); + page = softleaf_to_page(entry); pgmap = page_pgmap(page); if (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || @@ -330,7 +330,7 @@ again: mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; - if (is_writable_device_private_entry(entry)) + if (softleaf_is_device_private_write(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { pfn = pte_pfn(pte); diff --git a/mm/mprotect.c b/mm/mprotect.c index f910cbf41442..283889e4f1ce 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -317,11 +317,11 @@ static long change_pte_range(struct mmu_gather *tlb, pages++; } } else { - swp_entry_t entry = pte_to_swp_entry(oldpte); + softleaf_t entry = softleaf_from_pte(oldpte); pte_t newpte; - if (is_writable_migration_entry(entry)) { - struct folio *folio = pfn_swap_entry_folio(entry); + if (softleaf_is_migration_write(entry)) { + const struct folio *folio = softleaf_to_folio(entry); /* * A protection check is difficult so @@ -335,7 +335,7 @@ static long change_pte_range(struct mmu_gather *tlb, newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); - } else if (is_writable_device_private_entry(entry)) { + } else if (softleaf_is_device_private_write(entry)) { /* * We do not preserve soft-dirtiness. See * copy_nonpresent_pte() for explanation. diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 8137d2366722..b38a1d00c971 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -49,7 +49,7 @@ again: if (is_migration) return false; } else if (!is_migration) { - swp_entry_t entry; + softleaf_t entry; /* * Handle un-addressable ZONE_DEVICE memory. @@ -67,9 +67,9 @@ again: * For more details on device private memory see HMM * (include/linux/hmm.h or mm/hmm.c). 
*/ - entry = pte_to_swp_entry(ptent); - if (!is_device_private_entry(entry) && - !is_device_exclusive_entry(entry)) + entry = softleaf_from_pte(ptent); + if (!softleaf_is_device_private(entry) && + !softleaf_is_device_exclusive(entry)) return false; } spin_lock(*ptlp); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 378c774795fc..90cc346a6ecf 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -1007,11 +1007,10 @@ pte_table: goto found; } } else if (!pte_none(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); - if ((flags & FW_MIGRATION) && - is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + if ((flags & FW_MIGRATION) && softleaf_is_migration(entry)) { + page = softleaf_to_page(entry); expose_page = false; goto found; } diff --git a/mm/rmap.c b/mm/rmap.c index 775710115a41..345466ad396b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1969,7 +1969,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { - pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + pfn = softleaf_to_pfn(pte_to_swp_entry(pteval)); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } @@ -2368,7 +2368,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { - pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + pfn = softleaf_to_pfn(pte_to_swp_entry(pteval)); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } @@ -2453,8 +2453,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, folio_mark_dirty(folio); writable = pte_write(pteval); } else { + const softleaf_t entry = softleaf_from_pte(pteval); + pte_clear(mm, address, pvmw.pte); - writable = is_writable_device_private_entry(pte_to_swp_entry(pteval)); + + writable = softleaf_is_device_private_write(entry); } VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) && From a3a3e215c9c140c08760d4d96ba4e8bc485d0f14 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:34 +0000 Subject: [PATCH 262/321] mm: replace remaining pte_to_swp_entry() with softleaf_from_pte() There are straggler invocations of pte_to_swp_entry() lying around, replace all of these with the software leaf entry equivalent - softleaf_from_pte(). With those removed, eliminate pte_to_swp_entry() altogether. No functional change intended. 
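The arch-dependent decoding that pte_to_swp_entry() used to perform is folded into softleaf_from_pte() itself; the resulting helper (restated from the leafops.h hunk below, comments added) looks like:

    static inline softleaf_t softleaf_from_pte(pte_t pte)
    {
        softleaf_t arch_entry;

        /* Present and none PTEs encode no software leaf entry. */
        if (pte_present(pte) || pte_none(pte))
            return softleaf_mk_none();

        /* Drop soft-dirty/uffd-wp bits, then decode the arch format. */
        pte = pte_swp_clear_flags(pte);
        arch_entry = __pte_to_swp_entry(pte);

        /* Temporary until swp_entry_t is eliminated. */
        return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
    }
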
Link: https://lkml.kernel.org/r/d8ee5ccefe4c42d7c4fe1a2e46f285ac40421cd3.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/leafops.h | 7 ++++++- include/linux/swapops.h | 13 ------------- mm/debug_vm_pgtable.c | 2 +- mm/internal.h | 7 +++++-- mm/memory-failure.c | 2 +- mm/memory.c | 16 ++++++++-------- mm/migrate.c | 2 +- mm/mincore.c | 4 +++- mm/rmap.c | 8 ++++++-- mm/swapfile.c | 13 +++++++++++-- 10 files changed, 42 insertions(+), 32 deletions(-) diff --git a/include/linux/leafops.h b/include/linux/leafops.h index d282fab866a1..cfafe7a5e7b1 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -54,11 +54,16 @@ static inline softleaf_t softleaf_mk_none(void) */ static inline softleaf_t softleaf_from_pte(pte_t pte) { + softleaf_t arch_entry; + if (pte_present(pte) || pte_none(pte)) return softleaf_mk_none(); + pte = pte_swp_clear_flags(pte); + arch_entry = __pte_to_swp_entry(pte); + /* Temporary until swp_entry_t eliminated. */ - return pte_to_swp_entry(pte); + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } /** diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3d02b288c15e..8cfc966eae48 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -107,19 +107,6 @@ static inline pgoff_t swp_offset(swp_entry_t entry) return entry.val & SWP_OFFSET_MASK; } -/* - * Convert the arch-dependent pte representation of a swp_entry_t into an - * arch-independent swp_entry_t. - */ -static inline swp_entry_t pte_to_swp_entry(pte_t pte) -{ - swp_entry_t arch_entry; - - pte = pte_swp_clear_flags(pte); - arch_entry = __pte_to_swp_entry(pte); - return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); -} - /* * Convert the arch-independent representation of a swp_entry_t into the * arch-dependent pte representation. diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 64db85a80558..1eae87dbef73 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1229,7 +1229,7 @@ static int __init init_args(struct pgtable_debug_args *args) init_fixed_pfns(args); /* See generic_max_swapfile_size(): probe the maximum offset */ - max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); + max_swap_offset = swp_offset(softleaf_from_pte(softleaf_to_pte(swp_entry(0, ~0UL)))); /* Create a swp entry with all possible bits set while still being swap. */ args->swp_entry = swp_entry(MAX_SWAPFILES - 1, max_swap_offset); /* Create a non-present migration entry. 
*/ diff --git a/mm/internal.h b/mm/internal.h index 2ed041e6ebc3..929bc4a5dd98 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -334,7 +334,7 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, */ static inline pte_t pte_move_swp_offset(pte_t pte, long delta) { - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), (swp_offset(entry) + delta))); @@ -389,11 +389,14 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { + softleaf_t entry; + pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; - if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id) + entry = softleaf_from_pte(pte); + if (lookup_swap_cgroup_id(entry) != cgroup_id) break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 71652cfedcdf..7f908ad795ad 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -51,7 +51,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/memory.c b/mm/memory.c index 525da4479228..50b93b45b174 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1218,7 +1218,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; - swp_entry_t entry = (swp_entry_t){0}; + softleaf_t entry = softleaf_mk_none(); struct folio *prealloc = NULL; int nr; @@ -1282,7 +1282,7 @@ again: dst_vma, src_vma, addr, rss); if (ret == -EIO) { - entry = pte_to_swp_entry(ptep_get(src_pte)); + entry = softleaf_from_pte(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; @@ -4446,13 +4446,13 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; - swp_entry_t entry; + softleaf_t entry; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); if (!folio) return NULL; - entry = pte_to_swp_entry(vmf->orig_pte); + entry = softleaf_from_pte(vmf->orig_pte); if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)) { folio_put(folio); @@ -4470,7 +4470,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) { unsigned long addr; - swp_entry_t entry; + softleaf_t entry; int idx; pte_t pte; @@ -4480,7 +4480,7 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) return false; - entry = pte_to_swp_entry(pte); + entry = softleaf_from_pte(pte); if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) return false; @@ -4526,7 +4526,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) unsigned long orders; struct folio *folio; unsigned long addr; - swp_entry_t entry; + softleaf_t entry; spinlock_t *ptl; pte_t *pte; gfp_t gfp; @@ -4547,7 +4547,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) if (!zswap_never_enabled()) goto fallback; - entry = pte_to_swp_entry(vmf->orig_pte); + entry = softleaf_from_pte(vmf->orig_pte); /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. 
diff --git a/mm/migrate.c b/mm/migrate.c index c39dfea1a925..b2ad78bf85d5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -534,7 +534,7 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p * lock release in migration_entry_wait_on_locked(). */ hugetlb_vma_unlock_read(vma); - migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl); + migration_entry_wait_on_locked(entry, ptl); return; } diff --git a/mm/mincore.c b/mm/mincore.c index 9a908d8bb706..e5d13eea9234 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -202,7 +202,9 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (i = 0; i < step; i++) vec[i] = 1; } else { /* pte is a swap entry */ - *vec = mincore_swap(pte_to_swp_entry(pte), false); + const softleaf_t entry = softleaf_from_pte(pte); + + *vec = mincore_swap(entry, false); } vec += step; } diff --git a/mm/rmap.c b/mm/rmap.c index 345466ad396b..d871f2eb821c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1969,7 +1969,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { - pfn = softleaf_to_pfn(pte_to_swp_entry(pteval)); + const softleaf_t entry = softleaf_from_pte(pteval); + + pfn = softleaf_to_pfn(entry); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } @@ -2368,7 +2370,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { - pfn = softleaf_to_pfn(pte_to_swp_entry(pteval)); + const softleaf_t entry = softleaf_from_pte(pteval); + + pfn = softleaf_to_pfn(entry); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 8c7f14061f5b..94e0f0c54168 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3202,8 +3202,17 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) */ unsigned long generic_max_swapfile_size(void) { - return swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + swp_entry_t entry = swp_entry(0, ~0UL); + const pte_t pte = softleaf_to_pte(entry); + + /* + * Since the PTE can be an invalid softleaf entry (e.g. the none PTE), + * we need to do this manually. + */ + entry = __pte_to_swp_entry(pte); + entry = swp_entry(__swp_type(entry), __swp_offset(entry)); + + return swp_offset(entry) + 1; } /* Can be overridden by an architecture for additional checks. */ From fd603ae11e72fcaeec9266522f416c37b43bd1c4 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 10 Nov 2025 16:17:55 +0800 Subject: [PATCH 263/321] mm: thp: replace folio_memcg() with folio_memcg_charged() Patch series "reparent the THP split queue", v6. In the future, we will reparent LRU folios during memcg offline to eliminate dying memory cgroups, which requires reparenting the THP split queue to its parent memcg. Similar to list_lru, the split queue is relatively independent and does not need to be reparented along with objcg and LRU folios (holding objcg lock and lru lock). Therefore, we can apply the same mechanism as list_lru to reparent the split queue first when memcg is offine. The first three patches in this series are separated from the series "Eliminate Dying Memory Cgroup" [1], mainly to do some cleanup and preparatory work. The last patch reparents the THP split queue to its parent memcg during memcg offline. This patch (of 4): folio_memcg_charged() is intended for use when the user is unconcerned about the returned memcg pointer. 
It is more efficient than folio_memcg(). Therefore, replace folio_memcg() with folio_memcg_charged(). Link: https://lkml.kernel.org/r/56624d537520e33e5a6b3755238b3dfb959a52ee.1762762324.git.zhengqi.arch@bytedance.com Link: https://lore.kernel.org/all/20250415024532.26632-1-songmuchun@bytedance.com/ [1] Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: David Hildenbrand Reviewed-by: Roman Gushchin Reviewed-by: Harry Yoo Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Hugh Dickins Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Nico Pache Cc: Ryan Roberts Cc: Wei Yang Cc: Zi Yan Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e38b0d5e3102..9758171d49c9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4127,7 +4127,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) bool unqueued = false; WARN_ON_ONCE(folio_ref_count(folio)); - WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); + WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio)); ds_queue = get_deferred_split_queue(folio); spin_lock_irqsave(&ds_queue->split_queue_lock, flags); From ad7c7f4576a5977b4ec4ac5dd090ab3f81ca7c6f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 10 Nov 2025 16:17:56 +0800 Subject: [PATCH 264/321] mm: thp: introduce folio_split_queue_lock and its variants In future memcg removal, the binding between a folio and a memcg may change, making the split lock within the memcg unstable when held. A new approach is required to reparent the split queue to its parent. This patch starts introducing a unified way to acquire the split lock for future work. It's a code-only refactoring with no functional changes. 
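To illustrate the calling convention this introduces, here is a minimal sketch (the example_* functions are hypothetical stand-ins for real call sites; the lock helpers are the ones added by the diff below):

/* Before: each call site resolves the queue itself and locks it. */
static void example_before(struct folio *folio)
{
	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
	unsigned long flags;

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	/* ... manipulate folio->_deferred_list ... */
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}

/* After: the queue is resolved and locked in one step. */
static void example_after(struct folio *folio)
{
	struct deferred_split *ds_queue;
	unsigned long flags;

	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
	/* ... manipulate folio->_deferred_list ... */
	split_queue_unlock_irqrestore(ds_queue, flags);
}

Keeping the queue lookup and the locking in one helper is what later lets the lookup retry against the parent memcg without touching the call sites.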
Link: https://lkml.kernel.org/r/a31a90bcac04dc754f775e87ae3205be3170b571.1762762324.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Johannes Weiner Reviewed-by: Zi Yan Acked-by: Shakeel Butt Acked-by: David Hildenbrand Reviewed-by: Harry Yoo Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Hugh Dickins Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ++++ mm/huge_memory.c | 119 ++++++++++++++++++++++++++----------- 2 files changed, 94 insertions(+), 35 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 966f7c1a0128..b0c6a4635c67 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1647,6 +1647,11 @@ int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); + +static inline int shrinker_id(struct shrinker *shrinker) +{ + return shrinker->id; +} #else #define mem_cgroup_sockets_enabled 0 @@ -1678,6 +1683,11 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } + +static inline int shrinker_id(struct shrinker *shrinker) +{ + return -1; +} #endif #ifdef CONFIG_MEMCG diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9758171d49c9..bfcb5d895f67 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1077,28 +1077,86 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } +static struct deferred_split *split_queue_node(int nid) +{ + struct pglist_data *pgdata = NODE_DATA(nid); + + return &pgdata->deferred_split_queue; +} + #ifdef CONFIG_MEMCG static inline -struct deferred_split *get_deferred_split_queue(struct folio *folio) +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct mem_cgroup *memcg = folio_memcg(folio); - struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); + if (mem_cgroup_disabled()) + return NULL; + if (split_queue_node(folio_nid(folio)) == queue) + return NULL; + return container_of(queue, struct mem_cgroup, deferred_split_queue); +} - if (memcg) - return &memcg->deferred_split_queue; - else - return &pgdat->deferred_split_queue; +static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) +{ + return memcg ? 
&memcg->deferred_split_queue : split_queue_node(nid); } #else static inline -struct deferred_split *get_deferred_split_queue(struct folio *folio) +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); + return NULL; +} - return &pgdat->deferred_split_queue; +static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) +{ + return split_queue_node(nid); } #endif +static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg) +{ + struct deferred_split *queue; + + queue = memcg_split_queue(nid, memcg); + spin_lock(&queue->split_queue_lock); + + return queue; +} + +static struct deferred_split * +split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags) +{ + struct deferred_split *queue; + + queue = memcg_split_queue(nid, memcg); + spin_lock_irqsave(&queue->split_queue_lock, *flags); + + return queue; +} + +static struct deferred_split *folio_split_queue_lock(struct folio *folio) +{ + return split_queue_lock(folio_nid(folio), folio_memcg(folio)); +} + +static struct deferred_split * +folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) +{ + return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); +} + +static inline void split_queue_unlock(struct deferred_split *queue) +{ + spin_unlock(&queue->split_queue_lock); +} + +static inline void split_queue_unlock_irqrestore(struct deferred_split *queue, + unsigned long flags) +{ + spin_unlock_irqrestore(&queue->split_queue_lock, flags); +} + static inline bool is_transparent_hugepage(const struct folio *folio) { if (!folio_test_large(folio)) @@ -3690,7 +3748,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct page *lock_at, struct list_head *list, enum split_type split_type, bool unmapped) { - struct deferred_split *ds_queue = get_deferred_split_queue(folio); + struct deferred_split *ds_queue; XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct folio *end_folio = folio_next(folio); bool is_anon = folio_test_anon(folio); @@ -3824,7 +3882,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, } /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock(&ds_queue->split_queue_lock); + ds_queue = folio_split_queue_lock(folio); if (folio_ref_freeze(folio, 1 + extra_pins)) { struct swap_cluster_info *ci = NULL; struct lruvec *lruvec; @@ -3846,7 +3904,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, */ list_del_init(&folio->_deferred_list); } - spin_unlock(&ds_queue->split_queue_lock); + split_queue_unlock(ds_queue); if (mapping) { int nr = folio_nr_pages(folio); @@ -3946,7 +4004,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (ci) swap_cluster_unlock(ci); } else { - spin_unlock(&ds_queue->split_queue_lock); + split_queue_unlock(ds_queue); ret = -EAGAIN; } fail: @@ -4129,8 +4187,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) WARN_ON_ONCE(folio_ref_count(folio)); WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio)); - ds_queue = get_deferred_split_queue(folio); - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { @@ -4141,7 +4198,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) 
list_del_init(&folio->_deferred_list); unqueued = true; } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); return unqueued; /* useful for debug warnings */ } @@ -4149,10 +4206,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) /* partially_mapped=false won't clear PG_partially_mapped folio flag */ void deferred_split_folio(struct folio *folio, bool partially_mapped) { - struct deferred_split *ds_queue = get_deferred_split_queue(folio); -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = folio_memcg(folio); -#endif + struct deferred_split *ds_queue; unsigned long flags; /* @@ -4175,7 +4229,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) if (folio_test_swapcache(folio)) return; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { folio_set_partially_mapped(folio); @@ -4190,15 +4244,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); } if (list_empty(&folio->_deferred_list)) { + struct mem_cgroup *memcg; + + memcg = folio_split_queue_memcg(folio, ds_queue); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; -#ifdef CONFIG_MEMCG if (memcg) set_shrinker_bit(memcg, folio_nid(folio), - deferred_split_shrinker->id); -#endif + shrinker_id(deferred_split_shrinker)); } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, @@ -4244,19 +4299,13 @@ static bool thp_underused(struct folio *folio) static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct pglist_data *pgdata = NODE_DATA(sc->nid); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + struct deferred_split *ds_queue; unsigned long flags; LIST_HEAD(list); struct folio *folio, *next, *prev = NULL; int split = 0, removed = 0; -#ifdef CONFIG_MEMCG - if (sc->memcg) - ds_queue = &sc->memcg->deferred_split_queue; -#endif - - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_entry_safe(folio, next, &ds_queue->split_queue, _deferred_list) { @@ -4275,7 +4324,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, if (!--sc->nr_to_scan) break; } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); list_for_each_entry_safe(folio, next, &list, _deferred_list) { bool did_split = false; From 776bde7caf80f6af72b087cafe7d9f607b14716d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 10 Nov 2025 16:17:57 +0800 Subject: [PATCH 265/321] mm: thp: use folio_batch to handle THP splitting in deferred_split_scan() The maintenance of the folio->_deferred_list is intricate because it's reused in a local list. Here are some peculiarities: 1) When a folio is removed from its split queue and added to a local on-stack list in deferred_split_scan(), the ->split_queue_len isn't updated, leading to an inconsistency between it and the actual number of folios in the split queue. 2) When the folio is split via split_folio() later, it's removed from the local list while holding the split queue lock. 
At this time, the lock is not needed as it is not protecting anything. 3) To handle the race condition with a third-party freeing or migrating the preceding folio, we must ensure there's always one safe (with raised refcount) folio before by delaying its folio_put(). More details can be found in commit e66f3185fa04 ("mm/thp: fix deferred split queue not partially_mapped"). It's rather tricky. We can use the folio_batch infrastructure to handle this clearly. In this case, ->split_queue_len will be consistent with the real number of folios in the split queue. If list_empty(&folio->_deferred_list) returns false, it's clear the folio must be in its split queue (not in a local list anymore). In the future, we will reparent LRU folios during memcg offline to eliminate dying memory cgroups, which requires reparenting the split queue to its parent first. So this patch prepares for using folio_split_queue_lock_irqsave() as the memcg may change then. Link: https://lkml.kernel.org/r/59cb6b6fb5ffcff9d23b81890b252960139ad8e7.1762762324.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Reviewed-by: Zi Yan Acked-by: David Hildenbrand Acked-by: Shakeel Butt Reviewed-by: Wei Yang Reviewed-by: Harry Yoo Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/huge_memory.c | 87 +++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 46 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bfcb5d895f67..13684e5376e8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3888,21 +3888,22 @@ static int __folio_split(struct folio *folio, unsigned int new_order, struct lruvec *lruvec; int expected_refs; - if (old_order > 1 && - !list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; + if (old_order > 1) { + if (!list_empty(&folio->_deferred_list)) { + ds_queue->split_queue_len--; + /* + * Reinitialize page_deferred_list after removing the + * page from the split_queue, otherwise a subsequent + * split will see list corruption when checking the + * page_deferred_list. + */ + list_del_init(&folio->_deferred_list); + } if (folio_test_partially_mapped(folio)) { folio_clear_partially_mapped(folio); mod_mthp_stat(old_order, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } - /* - * Reinitialize page_deferred_list after removing the - * page from the split_queue, otherwise a subsequent - * split will see list corruption when checking the - * page_deferred_list. 
- */ - list_del_init(&folio->_deferred_list); } split_queue_unlock(ds_queue); if (mapping) { @@ -4301,35 +4302,40 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, { struct deferred_split *ds_queue; unsigned long flags; - LIST_HEAD(list); - struct folio *folio, *next, *prev = NULL; - int split = 0, removed = 0; + struct folio *folio, *next; + int split = 0, i; + struct folio_batch fbatch; + folio_batch_init(&fbatch); + +retry: ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_entry_safe(folio, next, &ds_queue->split_queue, _deferred_list) { if (folio_try_get(folio)) { - list_move(&folio->_deferred_list, &list); - } else { + folio_batch_add(&fbatch, folio); + } else if (folio_test_partially_mapped(folio)) { /* We lost race with folio_put() */ - if (folio_test_partially_mapped(folio)) { - folio_clear_partially_mapped(folio); - mod_mthp_stat(folio_order(folio), - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); - } - list_del_init(&folio->_deferred_list); - ds_queue->split_queue_len--; + folio_clear_partially_mapped(folio); + mod_mthp_stat(folio_order(folio), + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } + list_del_init(&folio->_deferred_list); + ds_queue->split_queue_len--; if (!--sc->nr_to_scan) break; + if (!folio_batch_space(&fbatch)) + break; } split_queue_unlock_irqrestore(ds_queue, flags); - list_for_each_entry_safe(folio, next, &list, _deferred_list) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { bool did_split = false; bool underused = false; + struct deferred_split *fqueue; + folio = fbatch.folios[i]; if (!folio_test_partially_mapped(folio)) { /* * See try_to_map_unused_to_zeropage(): we cannot @@ -4352,38 +4358,27 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } folio_unlock(folio); next: + if (did_split || !folio_test_partially_mapped(folio)) + continue; /* - * split_folio() removes folio from list on success. * Only add back to the queue if folio is partially mapped. * If thp_underused returns false, or if split_folio fails * in the case it was underused, then consider it used and * don't add it back to split_queue. */ - if (did_split) { - ; /* folio already removed from list */ - } else if (!folio_test_partially_mapped(folio)) { - list_del_init(&folio->_deferred_list); - removed++; - } else { - /* - * That unlocked list_del_init() above would be unsafe, - * unless its folio is separated from any earlier folios - * left on the list (which may be concurrently unqueued) - * by one safe folio with refcount still raised. - */ - swap(folio, prev); + fqueue = folio_split_queue_lock_irqsave(folio, &flags); + if (list_empty(&folio->_deferred_list)) { + list_add_tail(&folio->_deferred_list, &fqueue->split_queue); + fqueue->split_queue_len++; } - if (folio) - folio_put(folio); + split_queue_unlock_irqrestore(fqueue, flags); } + folios_put(&fbatch); - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - list_splice_tail(&list, &ds_queue->split_queue); - ds_queue->split_queue_len -= removed; - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - - if (prev) - folio_put(prev); + if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) { + cond_resched(); + goto retry; + } /* * Stop shrinker if we didn't split any page, but the queue is empty. 
From 46156dba32cb68537d36877a97d672227f3e8134 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 10 Nov 2025 16:17:58 +0800 Subject: [PATCH 266/321] mm: thp: reparent the split queue during memcg offline Similar to list_lru, the split queue is relatively independent and does not need to be reparented along with objcg and LRU folios (holding objcg lock and lru lock). So let's apply the similar mechanism as list_lru to reparent the split queue separately when memcg is offine. This is also a preparation for reparenting LRU folios. Link: https://lkml.kernel.org/r/8703f907c4d1f7e8a2ef2bfed3036a84fa53028b.1762762324.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Zi Yan Reviewed-by: Muchun Song Acked-by: David Hildenbrand Acked-by: Shakeel Butt Reviewed-by: Harry Yoo Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++++ include/linux/memcontrol.h | 11 ++++++++++ mm/huge_memory.c | 44 ++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 1 + 4 files changed, 60 insertions(+) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 525624c285a6..e2e91aa1a042 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -415,6 +415,9 @@ static inline int split_huge_page(struct page *page) return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio, bool partially_mapped); +#ifdef CONFIG_MEMCG +void reparent_deferred_split_queue(struct mem_cgroup *memcg); +#endif void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); @@ -647,6 +650,7 @@ static inline int try_folio_split_to_order(struct folio *folio, } static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} +static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b0c6a4635c67..cc6db20d7dca 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1775,6 +1775,12 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg); + +static inline bool memcg_is_dying(struct mem_cgroup *memcg) +{ + return memcg ? 
css_is_dying(&memcg->css) : false; +} + #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1845,6 +1851,11 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) { } + +static inline bool memcg_is_dying(struct mem_cgroup *memcg) +{ + return false; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 13684e5376e8..d17d3810a882 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1118,8 +1118,19 @@ static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg { struct deferred_split *queue; +retry: queue = memcg_split_queue(nid, memcg); spin_lock(&queue->split_queue_lock); + /* + * There is a period between setting memcg to dying and reparenting + * deferred split queue, and during this period the THPs in the deferred + * split queue will be hidden from the shrinker side. + */ + if (unlikely(memcg_is_dying(memcg))) { + spin_unlock(&queue->split_queue_lock); + memcg = parent_mem_cgroup(memcg); + goto retry; + } return queue; } @@ -1129,8 +1140,14 @@ split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags { struct deferred_split *queue; +retry: queue = memcg_split_queue(nid, memcg); spin_lock_irqsave(&queue->split_queue_lock, *flags); + if (unlikely(memcg_is_dying(memcg))) { + spin_unlock_irqrestore(&queue->split_queue_lock, *flags); + memcg = parent_mem_cgroup(memcg); + goto retry; + } return queue; } @@ -4389,6 +4406,33 @@ next: return split; } +#ifdef CONFIG_MEMCG +void reparent_deferred_split_queue(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct deferred_split *ds_queue = &memcg->deferred_split_queue; + struct deferred_split *parent_ds_queue = &parent->deferred_split_queue; + int nid; + + spin_lock_irq(&ds_queue->split_queue_lock); + spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING); + + if (!ds_queue->split_queue_len) + goto unlock; + + list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue); + parent_ds_queue->split_queue_len += ds_queue->split_queue_len; + ds_queue->split_queue_len = 0; + + for_each_node(nid) + set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker)); + +unlock: + spin_unlock(&parent_ds_queue->split_queue_lock); + spin_unlock_irq(&ds_queue->split_queue_lock); +} +#endif + #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bfc986da3289..623446821b00 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3920,6 +3920,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) zswap_memcg_offline_cleanup(memcg); memcg_offline_kmem(memcg); + reparent_deferred_split_queue(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); lru_gen_offline_memcg(memcg); From eaa4c8063f7c3b78617c3f5af14d35a782c88144 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 14 Nov 2025 03:00:26 +0000 Subject: [PATCH 267/321] mm/khugepaged: remove redundant clearing of struct collapse_control Patch series "unify PMD scan results and remove redundant cleanup", v2. This small series addresses two minor cleanup opportunities in the hugepage collapse logic. The initial motivation arose during a code review of madvise_collapse(), where it was noted that the function was missing a handler for SCAN_PMD_NONE. 
This oversight exposed the inconsistent handling of SCAN_PMD_NULL and SCAN_PMD_NONE. Since both scan results are functionally identical (they indicate the absence of a PTE table), the primary patch unifies them into a single, clearer identifier, SCAN_NO_PTE_TABLE. The series also takes the opportunity to remove a redundant clearing of the struct collapse_control. This patch (of 3): The structure struct collapse_control is being unnecessarily cleared twice during the huge page collapse process. Both hpage_collapse_scan_file() and hpage_collapse_scan_pmd() currently perform a clear operation on this structure. Remove the redundant clear operation. Link: https://lkml.kernel.org/r/20251114030028.7035-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20251114030028.7035-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Dev Jain Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Baolin Wang Reviewed-by: Nico Pache Cc: Barry Song Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7e8cb181d5bd..1fc8986a28b3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2803,8 +2803,6 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } mmap_assert_locked(mm); - memset(cc->node_load, 0, sizeof(cc->node_load)); - nodes_clear(cc->alloc_nmask); if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); From f1040f889882dda62b50ae948409f21afc7d894d Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 14 Nov 2025 03:00:27 +0000 Subject: [PATCH 268/321] mm/khugepaged: continue to collapse on SCAN_PMD_NONE SCAN_PMD_NONE means current pmd is empty, but we can still continue collapse next pmd range. Link: https://lkml.kernel.org/r/20251114030028.7035-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Lance Yang Reviewed-by: Dev Jain Reviewed-by: Baolin Wang Acked-by: David Hildenbrand (Red Hat) Cc: Barry Song Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1fc8986a28b3..2ee5048b764e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2833,6 +2833,7 @@ handle_result: goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_PMD_NULL: + case SCAN_PMD_NONE: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: case SCAN_LACK_REFERENCED_PAGE: From 9e014077083753461938312d565e4ac7119570d1 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 14 Nov 2025 03:00:28 +0000 Subject: [PATCH 269/321] mm/khugepaged: unify SCAN_PMD_NONE and SCAN_PMD_NULL into SCAN_NO_PTE_TABLE The current hugepage collapse scan results include two separate values, SCAN_PMD_NONE and SCAN_PMD_NULL, which are handled identically by the consuming code. 
To reduce confusion and improve long-term maintenance, this commit merges these two functionally equivalent states into a single, clearer identifier: SCAN_NO_PTE_TABLE Link: https://lkml.kernel.org/r/20251114030028.7035-4-richard.weiyang@gmail.com Suggested-by: "David Hildenbrand (Red Hat)" Signed-off-by: Wei Yang Reviewed-by: Dev Jain Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Baolin Wang Reviewed-by: Nico Pache Cc: Barry Song Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 3 +-- mm/khugepaged.c | 23 ++++++++++------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index dd94d14a2427..4cde53b45a85 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -10,8 +10,7 @@ #define SCAN_STATUS \ EM( SCAN_FAIL, "failed") \ EM( SCAN_SUCCEED, "succeeded") \ - EM( SCAN_PMD_NULL, "pmd_null") \ - EM( SCAN_PMD_NONE, "pmd_none") \ + EM( SCAN_NO_PTE_TABLE, "no_pte_table") \ EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2ee5048b764e..40f9d5939aa5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -30,8 +30,7 @@ enum scan_result { SCAN_FAIL, SCAN_SUCCEED, - SCAN_PMD_NULL, - SCAN_PMD_NONE, + SCAN_NO_PTE_TABLE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, @@ -934,7 +933,7 @@ static inline int check_pmd_state(pmd_t *pmd) pmd_t pmde = pmdp_get_lockless(pmd); if (pmd_none(pmde)) - return SCAN_PMD_NONE; + return SCAN_NO_PTE_TABLE; /* * The folio may be under migration when khugepaged is trying to @@ -944,11 +943,11 @@ static inline int check_pmd_state(pmd_t *pmd) if (pmd_is_migration_entry(pmde)) return SCAN_PMD_MAPPED; if (!pmd_present(pmde)) - return SCAN_PMD_NULL; + return SCAN_NO_PTE_TABLE; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; if (pmd_bad(pmde)) - return SCAN_PMD_NULL; + return SCAN_NO_PTE_TABLE; return SCAN_SUCCEED; } @@ -958,7 +957,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, { *pmd = mm_find_pmd(mm, address); if (!*pmd) - return SCAN_PMD_NULL; + return SCAN_NO_PTE_TABLE; return check_pmd_state(*pmd); } @@ -1013,7 +1012,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); if (!pte) { mmap_read_unlock(mm); - result = SCAN_PMD_NULL; + result = SCAN_NO_PTE_TABLE; goto out; } } @@ -1187,7 +1186,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, &compound_pagelist); spin_unlock(pte_ptl); } else { - result = SCAN_PMD_NULL; + result = SCAN_NO_PTE_TABLE; } if (unlikely(result != SCAN_SUCCEED)) { @@ -1270,7 +1269,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { - result = SCAN_PMD_NULL; + result = SCAN_NO_PTE_TABLE; goto out; } @@ -1544,8 +1543,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, switch (result) { case SCAN_SUCCEED: break; - case SCAN_PMD_NULL: - case SCAN_PMD_NONE: + case SCAN_NO_PTE_TABLE: /* * All pte entries have been removed and pmd cleared. * Skip all the pte checks and just update the pmd mapping. 
@@ -2832,8 +2830,7 @@ handle_result: mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ - case SCAN_PMD_NULL: - case SCAN_PMD_NONE: + case SCAN_NO_PTE_TABLE: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: case SCAN_LACK_REFERENCED_PAGE: From f405066a1f0db818270f49a5e96be329bcabde1e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 Nov 2025 16:40:24 +0900 Subject: [PATCH 270/321] zram: introduce writeback bio batching Patch series "zram: introduce writeback bio batching", v6. As writeback is becoming more and more common the longstanding limitations of zram writeback throughput are becoming more visible. Introduce writeback bio batching so that multiple writeback bios can be processed simultaneously. This patch (of 6): As was stated in a comment [1] a single page writeback IO is not efficient, but it works. It's time to address this throughput limitation as writeback becomes used more often. Introduce batched (multiple) bio writeback support to take advantage of parallel requests processing and better requests scheduling. Approach used in this patch doesn't use a dedicated kthread like in [2], or blk-plug like in [3]. Dedicated kthread adds complexity, which can be avoided. Apart from that not all zram setups use writeback, so having numerous per-device kthreads (on systems that create multiple zram devices) hanging around is not the most optimal thing to do. blk-plug, on the other hand, works best when request are sequential, which doesn't particularly fit zram writebck IO patterns: zram writeback IO patterns are expected to be random, due to how bdev block reservation/release are handled. blk-plug approach also works in cycles: idle IO, when zram sets up requests in a batch, is followed by bursts of IO, when zram submits the entire batch. Instead we use a batch of requests and submit new bio as soon as one of the in-flight requests completes. For the time being the writeback batch size (maximum number of in-flight bio requests) is set to 32 for all devices. A follow up patch adds a writeback_batch_size device attribute, so the batch size becomes run-time configurable. 
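The core of the change is the shape of the submission loop. A condensed sketch, using the helpers added by the diff below (example_writeback_loop is a hypothetical stand-in for zram_writeback_slots(); slot handling and the wb_limit check are elided):

static int example_writeback_loop(struct zram *zram, struct zram_pp_ctl *ctl,
				  struct zram_wb_ctl *wb_ctl)
{
	struct zram_pp_slot *pps;
	int ret = 0, err;

	while ((pps = select_pp_slot(ctl))) {
		struct zram_wb_req *req;

		/* Grab an idle request; recycle completed ones if none. */
		while (!(req = zram_select_idle_req(wb_ctl))) {
			wait_event(wb_ctl->done_wait,
				   !list_empty(&wb_ctl->done_reqs));
			err = zram_complete_done_reqs(zram, wb_ctl);
			if (err)
				ret = err;
		}

		/* ... copy the slot's data into req->page, pick blk_idx ... */
		zram_submit_wb_request(zram, wb_ctl, req);
	}

	/* Drain the remaining in-flight requests before returning. */
	while (atomic_read(&wb_ctl->num_inflight) > 0) {
		wait_event(wb_ctl->done_wait, !list_empty(&wb_ctl->done_reqs));
		err = zram_complete_done_reqs(zram, wb_ctl);
		if (err)
			ret = err;
	}
	return ret;
}

A new bio therefore goes out as soon as an idle request becomes available, and completions are reaped opportunistically instead of one submit_bio_wait() at a time.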
Link: https://lkml.kernel.org/r/20251122074029.3948921-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20251122074029.3948921-2-senozhatsky@chromium.org Link: https://lore.kernel.org/all/20181203024045.153534-6-minchan@kernel.org/ [1] Link: https://lore.kernel.org/all/20250731064949.1690732-1-richardycc@google.com/ [2] Link: https://lore.kernel.org/all/tencent_78FC2C4FE16BA1EBAF0897DB60FCD675ED05@qq.com/ [3] Signed-off-by: Sergey Senozhatsky Co-developed-by: Yuwen Chen Co-developed-by: Richard Chang Suggested-by: Minchan Kim Cc: Brian Geffon Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 371 +++++++++++++++++++++++++++------- 1 file changed, 302 insertions(+), 69 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a43074657531..06ea56f0a00f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -500,6 +500,26 @@ out: } #ifdef CONFIG_ZRAM_WRITEBACK +struct zram_wb_ctl { + /* idle list is accessed only by the writeback task, no concurency */ + struct list_head idle_reqs; + /* done list is accessed concurrently, protect by done_lock */ + struct list_head done_reqs; + wait_queue_head_t done_wait; + spinlock_t done_lock; + atomic_t num_inflight; +}; + +struct zram_wb_req { + unsigned long blk_idx; + struct page *page; + struct zram_pp_slot *pps; + struct bio_vec bio_vec; + struct bio bio; + + struct list_head entry; +}; + static ssize_t writeback_limit_enable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -734,19 +754,221 @@ static void read_from_bdev_async(struct zram *zram, struct page *page, submit_bio(bio); } -static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl) +static void release_wb_req(struct zram_wb_req *req) { - unsigned long blk_idx = 0; - struct page *page = NULL; - struct zram_pp_slot *pps; - struct bio_vec bio_vec; - struct bio bio; - int ret = 0, err; - u32 index; + __free_page(req->page); + kfree(req); +} - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; +static void release_wb_ctl(struct zram_wb_ctl *wb_ctl) +{ + if (!wb_ctl) + return; + + /* We should never have inflight requests at this point */ + WARN_ON(atomic_read(&wb_ctl->num_inflight)); + WARN_ON(!list_empty(&wb_ctl->done_reqs)); + + while (!list_empty(&wb_ctl->idle_reqs)) { + struct zram_wb_req *req; + + req = list_first_entry(&wb_ctl->idle_reqs, + struct zram_wb_req, entry); + list_del(&req->entry); + release_wb_req(req); + } + + kfree(wb_ctl); +} + +/* XXX: should be a per-device sysfs attr */ +#define ZRAM_WB_REQ_CNT 32 + +static struct zram_wb_ctl *init_wb_ctl(void) +{ + struct zram_wb_ctl *wb_ctl; + int i; + + wb_ctl = kmalloc(sizeof(*wb_ctl), GFP_KERNEL); + if (!wb_ctl) + return NULL; + + INIT_LIST_HEAD(&wb_ctl->idle_reqs); + INIT_LIST_HEAD(&wb_ctl->done_reqs); + atomic_set(&wb_ctl->num_inflight, 0); + init_waitqueue_head(&wb_ctl->done_wait); + spin_lock_init(&wb_ctl->done_lock); + + for (i = 0; i < ZRAM_WB_REQ_CNT; i++) { + struct zram_wb_req *req; + + /* + * This is fatal condition only if we couldn't allocate + * any requests at all. Otherwise we just work with the + * requests that we have successfully allocated, so that + * writeback can still proceed, even if there is only one + * request on the idle list. 
+ */ + req = kzalloc(sizeof(*req), GFP_KERNEL | __GFP_NOWARN); + if (!req) + break; + + req->page = alloc_page(GFP_KERNEL | __GFP_NOWARN); + if (!req->page) { + kfree(req); + break; + } + + list_add(&req->entry, &wb_ctl->idle_reqs); + } + + /* We couldn't allocate any requests, so writeabck is not possible */ + if (list_empty(&wb_ctl->idle_reqs)) + goto release_wb_ctl; + + return wb_ctl; + +release_wb_ctl: + release_wb_ctl(wb_ctl); + return NULL; +} + +static void zram_account_writeback_rollback(struct zram *zram) +{ + spin_lock(&zram->wb_limit_lock); + if (zram->wb_limit_enable) + zram->bd_wb_limit += 1UL << (PAGE_SHIFT - 12); + spin_unlock(&zram->wb_limit_lock); +} + +static void zram_account_writeback_submit(struct zram *zram) +{ + spin_lock(&zram->wb_limit_lock); + if (zram->wb_limit_enable && zram->bd_wb_limit > 0) + zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12); + spin_unlock(&zram->wb_limit_lock); +} + +static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req) +{ + u32 index = req->pps->index; + int err; + + err = blk_status_to_errno(req->bio.bi_status); + if (err) { + /* + * Failed wb requests should not be accounted in wb_limit + * (if enabled). + */ + zram_account_writeback_rollback(zram); + free_block_bdev(zram, req->blk_idx); + return err; + } + + atomic64_inc(&zram->stats.bd_writes); + zram_slot_lock(zram, index); + /* + * We release slot lock during writeback so slot can change under us: + * slot_free() or slot_free() and zram_write_page(). In both cases + * slot loses ZRAM_PP_SLOT flag. No concurrent post-processing can + * set ZRAM_PP_SLOT on such slots until current post-processing + * finishes. + */ + if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) { + free_block_bdev(zram, req->blk_idx); + goto out; + } + + zram_free_page(zram, index); + zram_set_flag(zram, index, ZRAM_WB); + zram_set_handle(zram, index, req->blk_idx); + atomic64_inc(&zram->stats.pages_stored); + +out: + zram_slot_unlock(zram, index); + return 0; +} + +static void zram_writeback_endio(struct bio *bio) +{ + struct zram_wb_req *req = container_of(bio, struct zram_wb_req, bio); + struct zram_wb_ctl *wb_ctl = bio->bi_private; + unsigned long flags; + + spin_lock_irqsave(&wb_ctl->done_lock, flags); + list_add(&req->entry, &wb_ctl->done_reqs); + spin_unlock_irqrestore(&wb_ctl->done_lock, flags); + + wake_up(&wb_ctl->done_wait); +} + +static void zram_submit_wb_request(struct zram *zram, + struct zram_wb_ctl *wb_ctl, + struct zram_wb_req *req) +{ + /* + * wb_limit (if enabled) should be adjusted before submission, + * so that we don't over-submit. 
+ */ + zram_account_writeback_submit(zram); + atomic_inc(&wb_ctl->num_inflight); + req->bio.bi_private = wb_ctl; + submit_bio(&req->bio); +} + +static int zram_complete_done_reqs(struct zram *zram, + struct zram_wb_ctl *wb_ctl) +{ + struct zram_wb_req *req; + unsigned long flags; + int ret = 0, err; + + while (atomic_read(&wb_ctl->num_inflight) > 0) { + spin_lock_irqsave(&wb_ctl->done_lock, flags); + req = list_first_entry_or_null(&wb_ctl->done_reqs, + struct zram_wb_req, entry); + if (req) + list_del(&req->entry); + spin_unlock_irqrestore(&wb_ctl->done_lock, flags); + + /* ->num_inflight > 0 doesn't mean we have done requests */ + if (!req) + break; + + err = zram_writeback_complete(zram, req); + if (err) + ret = err; + + atomic_dec(&wb_ctl->num_inflight); + release_pp_slot(zram, req->pps); + req->pps = NULL; + + list_add(&req->entry, &wb_ctl->idle_reqs); + } + + return ret; +} + +static struct zram_wb_req *zram_select_idle_req(struct zram_wb_ctl *wb_ctl) +{ + struct zram_wb_req *req; + + req = list_first_entry_or_null(&wb_ctl->idle_reqs, + struct zram_wb_req, entry); + if (req) + list_del(&req->entry); + return req; +} + +static int zram_writeback_slots(struct zram *zram, + struct zram_pp_ctl *ctl, + struct zram_wb_ctl *wb_ctl) +{ + struct zram_wb_req *req = NULL; + unsigned long blk_idx = 0; + struct zram_pp_slot *pps; + int ret = 0, err = 0; + u32 index = 0; while ((pps = select_pp_slot(ctl))) { spin_lock(&zram->wb_limit_lock); @@ -757,6 +979,27 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl) } spin_unlock(&zram->wb_limit_lock); + while (!req) { + req = zram_select_idle_req(wb_ctl); + if (req) + break; + + wait_event(wb_ctl->done_wait, + !list_empty(&wb_ctl->done_reqs)); + + err = zram_complete_done_reqs(zram, wb_ctl); + /* + * BIO errors are not fatal, we continue and simply + * attempt to writeback the remaining objects (pages). + * At the same time we need to signal user-space that + * some writes (at least one, but also could be all of + * them) were not successful and we do so by returning + * the most recent BIO error. + */ + if (err) + ret = err; + } + if (!blk_idx) { blk_idx = alloc_block_bdev(zram); if (!blk_idx) { @@ -775,67 +1018,47 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl) */ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) goto next; - if (zram_read_from_zspool(zram, page, index)) + if (zram_read_from_zspool(zram, req->page, index)) goto next; zram_slot_unlock(zram, index); - bio_init(&bio, zram->bdev, &bio_vec, 1, - REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); - __bio_add_page(&bio, page, PAGE_SIZE, 0); - /* - * XXX: A single page IO would be inefficient for write - * but it would be not bad as starter. + * From now on pp-slot is owned by the req, remove it from + * its pp bucket. */ - err = submit_bio_wait(&bio); - if (err) { - release_pp_slot(zram, pps); - /* - * BIO errors are not fatal, we continue and simply - * attempt to writeback the remaining objects (pages). - * At the same time we need to signal user-space that - * some writes (at least one, but also could be all of - * them) were not successful and we do so by returning - * the most recent BIO error. - */ - ret = err; - continue; - } + list_del_init(&pps->entry); - atomic64_inc(&zram->stats.bd_writes); - zram_slot_lock(zram, index); - /* - * Same as above, we release slot lock during writeback so - * slot can change under us: slot_free() or slot_free() and - * reallocation (zram_write_page()). 
In both cases slot loses - * ZRAM_PP_SLOT flag. No concurrent post-processing can set - * ZRAM_PP_SLOT on such slots until current post-processing - * finishes. - */ - if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) - goto next; + req->blk_idx = blk_idx; + req->pps = pps; + bio_init(&req->bio, zram->bdev, &req->bio_vec, 1, REQ_OP_WRITE); + req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9); + req->bio.bi_end_io = zram_writeback_endio; + __bio_add_page(&req->bio, req->page, PAGE_SIZE, 0); - zram_free_page(zram, index); - zram_set_flag(zram, index, ZRAM_WB); - zram_set_handle(zram, index, blk_idx); + zram_submit_wb_request(zram, wb_ctl, req); blk_idx = 0; - atomic64_inc(&zram->stats.pages_stored); - spin_lock(&zram->wb_limit_lock); - if (zram->wb_limit_enable && zram->bd_wb_limit > 0) - zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12); - spin_unlock(&zram->wb_limit_lock); + req = NULL; + cond_resched(); + continue; + next: zram_slot_unlock(zram, index); release_pp_slot(zram, pps); - - cond_resched(); } - if (blk_idx) - free_block_bdev(zram, blk_idx); - if (page) - __free_page(page); + /* + * Selected idle req, but never submitted it due to some error or + * wb limit. + */ + if (req) + release_wb_req(req); + + while (atomic_read(&wb_ctl->num_inflight) > 0) { + wait_event(wb_ctl->done_wait, !list_empty(&wb_ctl->done_reqs)); + err = zram_complete_done_reqs(zram, wb_ctl); + if (err) + ret = err; + } return ret; } @@ -948,7 +1171,8 @@ static ssize_t writeback_store(struct device *dev, struct zram *zram = dev_to_zram(dev); u64 nr_pages = zram->disksize >> PAGE_SHIFT; unsigned long lo = 0, hi = nr_pages; - struct zram_pp_ctl *ctl = NULL; + struct zram_pp_ctl *pp_ctl = NULL; + struct zram_wb_ctl *wb_ctl = NULL; char *args, *param, *val; ssize_t ret = len; int err, mode = 0; @@ -970,8 +1194,14 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - ctl = init_pp_ctl(); - if (!ctl) { + pp_ctl = init_pp_ctl(); + if (!pp_ctl) { + ret = -ENOMEM; + goto release_init_lock; + } + + wb_ctl = init_wb_ctl(); + if (!wb_ctl) { ret = -ENOMEM; goto release_init_lock; } @@ -1000,7 +1230,7 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - scan_slots_for_writeback(zram, mode, lo, hi, ctl); + scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); break; } @@ -1011,7 +1241,7 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - scan_slots_for_writeback(zram, mode, lo, hi, ctl); + scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); break; } @@ -1022,7 +1252,7 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - scan_slots_for_writeback(zram, mode, lo, hi, ctl); + scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); continue; } @@ -1033,17 +1263,18 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - scan_slots_for_writeback(zram, mode, lo, hi, ctl); + scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); continue; } } - err = zram_writeback_slots(zram, ctl); + err = zram_writeback_slots(zram, pp_ctl, wb_ctl); if (err) ret = err; release_init_lock: - release_pp_ctl(zram, ctl); + release_pp_ctl(zram, pp_ctl); + release_wb_ctl(wb_ctl); atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); @@ -1112,7 +1343,9 @@ static int read_from_bdev(struct zram *zram, struct page *page, return -EIO; } -static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {}; +static void free_block_bdev(struct zram *zram, unsigned long blk_idx) +{ +} #endif #ifdef 
CONFIG_ZRAM_MEMORY_TRACKING From e828cccb72ed8661b5d778baae9442cd06da4e0e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 Nov 2025 16:40:25 +0900 Subject: [PATCH 271/321] zram: add writeback batch size device attr Introduce writeback_batch_size device attribute so that the maximum number of in-flight writeback bio requests can be configured at run-time per-device. This essentially enables batched bio writeback. Link: https://lkml.kernel.org/r/20251122074029.3948921-3-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Brian Geffon Cc: Minchan Kim Cc: Richard Chang Cc: Yuwen Chen Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 46 ++++++++++++++++++++++++++++++----- drivers/block/zram/zram_drv.h | 1 + 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 06ea56f0a00f..5906ba061165 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -590,6 +590,40 @@ static ssize_t writeback_limit_show(struct device *dev, return sysfs_emit(buf, "%llu\n", val); } +static ssize_t writeback_batch_size_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + u32 val; + + if (kstrtouint(buf, 10, &val)) + return -EINVAL; + + if (!val) + return -EINVAL; + + down_write(&zram->init_lock); + zram->wb_batch_size = val; + up_write(&zram->init_lock); + + return len; +} + +static ssize_t writeback_batch_size_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + u32 val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->wb_batch_size; + up_read(&zram->init_lock); + + return sysfs_emit(buf, "%u\n", val); +} + static void reset_bdev(struct zram *zram) { if (!zram->backing_dev) @@ -781,10 +815,7 @@ static void release_wb_ctl(struct zram_wb_ctl *wb_ctl) kfree(wb_ctl); } -/* XXX: should be a per-device sysfs attr */ -#define ZRAM_WB_REQ_CNT 32 - -static struct zram_wb_ctl *init_wb_ctl(void) +static struct zram_wb_ctl *init_wb_ctl(struct zram *zram) { struct zram_wb_ctl *wb_ctl; int i; @@ -799,7 +830,7 @@ static struct zram_wb_ctl *init_wb_ctl(void) init_waitqueue_head(&wb_ctl->done_wait); spin_lock_init(&wb_ctl->done_lock); - for (i = 0; i < ZRAM_WB_REQ_CNT; i++) { + for (i = 0; i < zram->wb_batch_size; i++) { struct zram_wb_req *req; /* @@ -1200,7 +1231,7 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - wb_ctl = init_wb_ctl(); + wb_ctl = init_wb_ctl(zram); if (!wb_ctl) { ret = -ENOMEM; goto release_init_lock; @@ -2843,6 +2874,7 @@ static DEVICE_ATTR_RW(backing_dev); static DEVICE_ATTR_WO(writeback); static DEVICE_ATTR_RW(writeback_limit); static DEVICE_ATTR_RW(writeback_limit_enable); +static DEVICE_ATTR_RW(writeback_batch_size); #endif #ifdef CONFIG_ZRAM_MULTI_COMP static DEVICE_ATTR_RW(recomp_algorithm); @@ -2864,6 +2896,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_writeback.attr, &dev_attr_writeback_limit.attr, &dev_attr_writeback_limit_enable.attr, + &dev_attr_writeback_batch_size.attr, #endif &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, @@ -2925,6 +2958,7 @@ static int zram_add(void) init_rwsem(&zram->init_lock); #ifdef CONFIG_ZRAM_WRITEBACK + zram->wb_batch_size = 32; spin_lock_init(&zram->wb_limit_lock); #endif diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 6cee93f9c0d0..1a647f42c1a4 100644 --- a/drivers/block/zram/zram_drv.h +++ 
b/drivers/block/zram/zram_drv.h @@ -129,6 +129,7 @@ struct zram { struct file *backing_dev; spinlock_t wb_limit_lock; bool wb_limit_enable; + u32 wb_batch_size; u64 bd_wb_limit; struct block_device *bdev; unsigned long *bitmap; From 7c929664fddfdaaa4afe5ae833d0f3044709d95c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 Nov 2025 16:40:26 +0900 Subject: [PATCH 272/321] zram: take write lock in wb limit store handlers Write device attrs handlers should take write zram init_lock. While at it, fixup coding styles. Link: https://lkml.kernel.org/r/20251122074029.3948921-4-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Brian Geffon Cc: Minchan Kim Cc: Richard Chang Cc: Yuwen Chen Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 5906ba061165..8dd733707a40 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -521,7 +521,8 @@ struct zram_wb_req { }; static ssize_t writeback_limit_enable_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) + struct device_attribute *attr, + const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); u64 val; @@ -530,18 +531,19 @@ static ssize_t writeback_limit_enable_store(struct device *dev, if (kstrtoull(buf, 10, &val)) return ret; - down_read(&zram->init_lock); + down_write(&zram->init_lock); spin_lock(&zram->wb_limit_lock); zram->wb_limit_enable = val; spin_unlock(&zram->wb_limit_lock); - up_read(&zram->init_lock); + up_write(&zram->init_lock); ret = len; return ret; } static ssize_t writeback_limit_enable_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, + char *buf) { bool val; struct zram *zram = dev_to_zram(dev); @@ -556,7 +558,8 @@ static ssize_t writeback_limit_enable_show(struct device *dev, } static ssize_t writeback_limit_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) + struct device_attribute *attr, + const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); u64 val; @@ -565,11 +568,11 @@ static ssize_t writeback_limit_store(struct device *dev, if (kstrtoull(buf, 10, &val)) return ret; - down_read(&zram->init_lock); + down_write(&zram->init_lock); spin_lock(&zram->wb_limit_lock); zram->bd_wb_limit = val; spin_unlock(&zram->wb_limit_lock); - up_read(&zram->init_lock); + up_write(&zram->init_lock); ret = len; return ret; From a4f506c569e1320c2db4a32955e47961fcf02b05 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 Nov 2025 16:40:27 +0900 Subject: [PATCH 273/321] zram: drop wb_limit_lock We don't need wb_limit_lock. Writeback limit setters take an exclusive write zram init_lock, while wb_limit modifications happen only from a single task and under zram read init_lock. No concurrent wb_limit modifications are possible (we permit only one post-processing task at a time). Add lockdep assertions to wb_limit mutators. While at it, fixup coding styles. 
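The locking rule this relies on, sketched below (the example_* names are illustrative; the fields and the lockdep assertion are the ones in the diff that follows):

/* Sysfs setters: exclusive init_lock, so no writeback can be running. */
static void example_set_wb_limit(struct zram *zram, u64 val)
{
	down_write(&zram->init_lock);
	zram->bd_wb_limit = val;
	up_write(&zram->init_lock);
}

/* Writeback path: single task, runs under down_read(&zram->init_lock). */
static void example_charge_wb_limit(struct zram *zram)
{
	lockdep_assert_held_read(&zram->init_lock);

	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
		zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
}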
Link: https://lkml.kernel.org/r/20251122074029.3948921-5-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Brian Geffon Cc: Minchan Kim Cc: Richard Chang Cc: Yuwen Chen Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 22 +++++----------------- drivers/block/zram/zram_drv.h | 1 - 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8dd733707a40..806497225603 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -532,9 +532,7 @@ static ssize_t writeback_limit_enable_store(struct device *dev, return ret; down_write(&zram->init_lock); - spin_lock(&zram->wb_limit_lock); zram->wb_limit_enable = val; - spin_unlock(&zram->wb_limit_lock); up_write(&zram->init_lock); ret = len; @@ -549,9 +547,7 @@ static ssize_t writeback_limit_enable_show(struct device *dev, struct zram *zram = dev_to_zram(dev); down_read(&zram->init_lock); - spin_lock(&zram->wb_limit_lock); val = zram->wb_limit_enable; - spin_unlock(&zram->wb_limit_lock); up_read(&zram->init_lock); return sysfs_emit(buf, "%d\n", val); @@ -569,9 +565,7 @@ static ssize_t writeback_limit_store(struct device *dev, return ret; down_write(&zram->init_lock); - spin_lock(&zram->wb_limit_lock); zram->bd_wb_limit = val; - spin_unlock(&zram->wb_limit_lock); up_write(&zram->init_lock); ret = len; @@ -579,15 +573,13 @@ static ssize_t writeback_limit_store(struct device *dev, } static ssize_t writeback_limit_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { u64 val; struct zram *zram = dev_to_zram(dev); down_read(&zram->init_lock); - spin_lock(&zram->wb_limit_lock); val = zram->bd_wb_limit; - spin_unlock(&zram->wb_limit_lock); up_read(&zram->init_lock); return sysfs_emit(buf, "%llu\n", val); @@ -869,18 +861,18 @@ release_wb_ctl: static void zram_account_writeback_rollback(struct zram *zram) { - spin_lock(&zram->wb_limit_lock); + lockdep_assert_held_read(&zram->init_lock); + if (zram->wb_limit_enable) zram->bd_wb_limit += 1UL << (PAGE_SHIFT - 12); - spin_unlock(&zram->wb_limit_lock); } static void zram_account_writeback_submit(struct zram *zram) { - spin_lock(&zram->wb_limit_lock); + lockdep_assert_held_read(&zram->init_lock); + if (zram->wb_limit_enable && zram->bd_wb_limit > 0) zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12); - spin_unlock(&zram->wb_limit_lock); } static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req) @@ -1005,13 +997,10 @@ static int zram_writeback_slots(struct zram *zram, u32 index = 0; while ((pps = select_pp_slot(ctl))) { - spin_lock(&zram->wb_limit_lock); if (zram->wb_limit_enable && !zram->bd_wb_limit) { - spin_unlock(&zram->wb_limit_lock); ret = -EIO; break; } - spin_unlock(&zram->wb_limit_lock); while (!req) { req = zram_select_idle_req(wb_ctl); @@ -2962,7 +2951,6 @@ static int zram_add(void) init_rwsem(&zram->init_lock); #ifdef CONFIG_ZRAM_WRITEBACK zram->wb_batch_size = 32; - spin_lock_init(&zram->wb_limit_lock); #endif /* gendisk structure */ diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 1a647f42c1a4..c6d94501376c 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -127,7 +127,6 @@ struct zram { bool claim; /* Protected by disk->open_mutex */ #ifdef CONFIG_ZRAM_WRITEBACK struct file *backing_dev; - spinlock_t wb_limit_lock; bool wb_limit_enable; u32 wb_batch_size; u64 bd_wb_limit; From e87ddea34567dd4e5cb1f2c9e02778485b3c9757 Mon Sep 17 
00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 Nov 2025 16:40:28 +0900 Subject: [PATCH 274/321] zram: rework bdev block allocation First, writeback bdev ->bitmap bits are set only from one context, as we can have only one single task performing writeback, so we cannot race with anything else. Remove retry path. Second, we always check ZRAM_WB flag to distinguish writtenback slots, so we should not confuse 0 bdev block index and 0 handle. We can use first bdev block (0 bit) for writeback as well. While at it, give functions slightly more accurate names, as we don't alloc/free anything there, we reserve a block for async writeback or release the block. Link: https://lkml.kernel.org/r/20251122074029.3948921-6-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Brian Geffon Cc: Minchan Kim Cc: Richard Chang Cc: Yuwen Chen Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 37 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 806497225603..1f7e9e914d34 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -500,6 +500,8 @@ out: } #ifdef CONFIG_ZRAM_WRITEBACK +#define INVALID_BDEV_BLOCK (~0UL) + struct zram_wb_ctl { /* idle list is accessed only by the writeback task, no concurency */ struct list_head idle_reqs; @@ -746,23 +748,20 @@ out: return err; } -static unsigned long alloc_block_bdev(struct zram *zram) +static unsigned long zram_reserve_bdev_block(struct zram *zram) { - unsigned long blk_idx = 1; -retry: - /* skip 0 bit to confuse zram.handle = 0 */ - blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx); + unsigned long blk_idx; + + blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, 0); if (blk_idx == zram->nr_pages) - return 0; - - if (test_and_set_bit(blk_idx, zram->bitmap)) - goto retry; + return INVALID_BDEV_BLOCK; + set_bit(blk_idx, zram->bitmap); atomic64_inc(&zram->stats.bd_count); return blk_idx; } -static void free_block_bdev(struct zram *zram, unsigned long blk_idx) +static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx) { int was_set; @@ -887,7 +886,7 @@ static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req) * (if enabled). */ zram_account_writeback_rollback(zram); - free_block_bdev(zram, req->blk_idx); + zram_release_bdev_block(zram, req->blk_idx); return err; } @@ -901,7 +900,7 @@ static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req) * finishes. 
*/ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) { - free_block_bdev(zram, req->blk_idx); + zram_release_bdev_block(zram, req->blk_idx); goto out; } @@ -990,8 +989,8 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl, struct zram_wb_ctl *wb_ctl) { + unsigned long blk_idx = INVALID_BDEV_BLOCK; struct zram_wb_req *req = NULL; - unsigned long blk_idx = 0; struct zram_pp_slot *pps; int ret = 0, err = 0; u32 index = 0; @@ -1023,9 +1022,9 @@ static int zram_writeback_slots(struct zram *zram, ret = err; } - if (!blk_idx) { - blk_idx = alloc_block_bdev(zram); - if (!blk_idx) { + if (blk_idx == INVALID_BDEV_BLOCK) { + blk_idx = zram_reserve_bdev_block(zram); + if (blk_idx == INVALID_BDEV_BLOCK) { ret = -ENOSPC; break; } @@ -1059,7 +1058,7 @@ static int zram_writeback_slots(struct zram *zram, __bio_add_page(&req->bio, req->page, PAGE_SIZE, 0); zram_submit_wb_request(zram, wb_ctl, req); - blk_idx = 0; + blk_idx = INVALID_BDEV_BLOCK; req = NULL; cond_resched(); continue; @@ -1366,7 +1365,7 @@ static int read_from_bdev(struct zram *zram, struct page *page, return -EIO; } -static void free_block_bdev(struct zram *zram, unsigned long blk_idx) +static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx) { } #endif @@ -1890,7 +1889,7 @@ static void zram_free_page(struct zram *zram, size_t index) if (zram_test_flag(zram, index, ZRAM_WB)) { zram_clear_flag(zram, index, ZRAM_WB); - free_block_bdev(zram, zram_get_handle(zram, index)); + zram_release_bdev_block(zram, zram_get_handle(zram, index)); goto out; } From 1b1a4e4d6797a57fefa40569fc920ce573bbf75b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 Nov 2025 16:40:29 +0900 Subject: [PATCH 275/321] zram: read slot block idx under slot lock Read slot's block id under slot-lock. We release the slot-lock for bdev read so, technically, slot still can get freed in the meantime, but at least we will read bdev block (page) that holds previous know slot data, not from slot->handle bdev block, which can be anything at that point. Link: https://lkml.kernel.org/r/20251122074029.3948921-7-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Brian Geffon Cc: Minchan Kim Cc: Richard Chang Cc: Yuwen Chen Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1f7e9e914d34..3428f647d0a7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1995,14 +1995,14 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, ret = zram_read_from_zspool(zram, page, index); zram_slot_unlock(zram, index); } else { + unsigned long blk_idx = zram_get_handle(zram, index); + /* * The slot should be unlocked before reading from the backing * device. */ zram_slot_unlock(zram, index); - - ret = read_from_bdev(zram, page, zram_get_handle(zram, index), - parent); + ret = read_from_bdev(zram, page, blk_idx, parent); } /* Should NEVER happen. Return bio error if it does. 
*/ From 8826f09616b475361774588c3e07260bba548f84 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Nov 2025 08:46:32 +0800 Subject: [PATCH 276/321] mm: shmem: allow fallback to smaller large orders for tmpfs mmap() access After commit 69e0a3b49003 ("mm: shmem: fix the strategy for the tmpfs 'huge=' options"), we have fixed the large order allocation strategy for tmpfs, which always tries PMD-sized large folios first, and if that fails, falls back to smaller large folios. For tmpfs large folio allocation via mmap(), we should maintain the same strategy as well. Let's unify the large order allocation strategy for tmpfs. There is no functional change for large folio allocation of anonymous shmem. Link: https://lkml.kernel.org/r/283a0bdfd6ac7aa334a491422bcae70919c572bd.1763008453.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 395ca58ac4a5..fc835b3e4914 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -645,34 +645,23 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index * the mTHP interface, so we still use PMD-sized huge order to * check whether global control is enabled. * - * For tmpfs mmap()'s huge order, we still use PMD-sized order to - * allocate huge pages due to lack of a write size hint. - * * For tmpfs with 'huge=always' or 'huge=within_size' mount option, * we will always try PMD-sized order first. If that failed, it will * fall back to small large folios. */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: - if (vma) - return maybe_pmd_order; - return THP_ORDERS_ALL_FILE_DEFAULT; case SHMEM_HUGE_WITHIN_SIZE: - if (vma) - within_size_orders = maybe_pmd_order; - else - within_size_orders = THP_ORDERS_ALL_FILE_DEFAULT; - - within_size_orders = shmem_get_orders_within_size(inode, within_size_orders, - index, write_end); + within_size_orders = shmem_get_orders_within_size(inode, + THP_ORDERS_ALL_FILE_DEFAULT, index, write_end); if (within_size_orders > 0) return within_size_orders; fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) - return maybe_pmd_order; + return THP_ORDERS_ALL_FILE_DEFAULT; fallthrough; default: return 0; From cab812d9c9642ec11b8961b7ea994f4bd0826159 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 14 Nov 2025 12:22:28 +1100 Subject: [PATCH 277/321] mm/huge_memory.c: introduce folio_split_unmapped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unmapped was added as a parameter to __folio_split() and related call sites to support splitting of folios already in the midst of a migration. This special case arose for device private folio migration since during migration there could be a disconnect between source and destination on the folio size. Introduce folio_split_unmapped() to handle this special case. Also refactor code and add __folio_freeze_and_split_unmapped() helper that is common to both __folio_split() and folio_split_unmapped(). This in turn removes the special casing introduced by the unmapped parameter in __folio_split(). 
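For illustration only, a minimal sketch of the intended calling convention for the new helper, assuming a migration-side caller that already holds the folio lock and has unmapped and isolated the folio as required (the wrapper name below is hypothetical; folio_split_unmapped() itself is what this patch introduces):

	/*
	 * Sketch: split an already-unmapped THP down to order-0 in a
	 * migration path.  The folio must be locked, fully unmapped and
	 * (if applicable) isolated from the LRU by the caller.
	 */
	static int split_unmapped_thp_for_migration(struct folio *folio)
	{
		int ret;

		VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);

		/* -EAGAIN means extra pins; the caller retries or falls back. */
		ret = folio_split_unmapped(folio, 0);
		if (ret)
			return ret;

		/*
		 * After-split folios remain locked and are not added to the
		 * LRU; the caller finishes the migration and remaps them.
		 */
		return 0;
	}
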
[balbirs@nvidia.com: v2] Link: https://lkml.kernel.org/r/20251115084041.3914728-1-balbirs@nvidia.com [balbirs@nvidia.com: fix clang-20 build] Link: https://lkml.kernel.org/r/20251120134232.3588203-1-balbirs@nvidia.com [akpm@linux-foundation.org: add `inline' to shmem_uncharge() stub, per Balbir] Link: https://lkml.kernel.org/r/20251114012228.2634882-1-balbirs@nvidia.com Signed-off-by: Balbir Singh Suggested-by: Zi Yan Acked-by: Zi Yan Cc: David Hildenbrand Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +- include/linux/shmem_fs.h | 6 +- mm/huge_memory.c | 348 +++++++++++++++++++++++---------------- mm/migrate_device.c | 3 +- 4 files changed, 211 insertions(+), 151 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e2e91aa1a042..1d439de1ca2c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -371,7 +371,8 @@ enum split_type { bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - unsigned int new_order, bool unmapped); + unsigned int new_order); +int folio_split_unmapped(struct folio *folio, unsigned int new_order); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); bool folio_split_supported(struct folio *folio, unsigned int new_order, @@ -382,7 +383,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *page, static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { - return __split_huge_page_to_list_to_order(page, list, new_order, false); + return __split_huge_page_to_list_to_order(page, list, new_order); } static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) { diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 5b368f9549d6..d02270072a34 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -136,11 +136,16 @@ static inline bool shmem_hpage_pmd_enabled(void) #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); +extern void shmem_uncharge(struct inode *inode, long pages); #else static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) { return 0; } + +static inline void shmem_uncharge(struct inode *inode, long pages) +{ +} #endif extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -194,7 +199,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) } extern bool shmem_charge(struct inode *inode, long pages); -extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d17d3810a882..53a8d380eab2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3739,6 +3739,152 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order, return true; } +static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order, + struct page *split_at, struct xa_state *xas, + struct 
address_space *mapping, bool do_lru, + struct list_head *list, enum split_type split_type, + pgoff_t end, int *nr_shmem_dropped, int extra_pins) +{ + struct folio *end_folio = folio_next(folio); + struct folio *new_folio, *next; + int old_order = folio_order(folio); + int ret = 0; + struct deferred_split *ds_queue; + + VM_WARN_ON_ONCE(!mapping && end); + /* Prevent deferred_split_scan() touching ->_refcount */ + ds_queue = folio_split_queue_lock(folio); + if (folio_ref_freeze(folio, 1 + extra_pins)) { + struct swap_cluster_info *ci = NULL; + struct lruvec *lruvec; + int expected_refs; + + if (old_order > 1) { + if (!list_empty(&folio->_deferred_list)) { + ds_queue->split_queue_len--; + /* + * Reinitialize page_deferred_list after removing the + * page from the split_queue, otherwise a subsequent + * split will see list corruption when checking the + * page_deferred_list. + */ + list_del_init(&folio->_deferred_list); + } + if (folio_test_partially_mapped(folio)) { + folio_clear_partially_mapped(folio); + mod_mthp_stat(old_order, + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } + } + split_queue_unlock(ds_queue); + if (mapping) { + int nr = folio_nr_pages(folio); + + if (folio_test_pmd_mappable(folio) && + new_order < HPAGE_PMD_ORDER) { + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, + NR_SHMEM_THPS, -nr); + } else { + __lruvec_stat_mod_folio(folio, + NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + } + } + } + + if (folio_test_swapcache(folio)) { + if (mapping) { + VM_WARN_ON_ONCE_FOLIO(mapping, folio); + return -EINVAL; + } + + ci = swap_cluster_get_and_lock(folio); + } + + /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ + if (do_lru) + lruvec = folio_lruvec_lock(folio); + + ret = __split_unmapped_folio(folio, new_order, split_at, xas, + mapping, split_type); + + /* + * Unfreeze after-split folios and put them back to the right + * list. @folio should be kept frozon until page cache + * entries are updated with all the other after-split folios + * to prevent others seeing stale page cache entries. + * As a result, new_folio starts from the next folio of + * @folio. + */ + for (new_folio = folio_next(folio); new_folio != end_folio; + new_folio = next) { + unsigned long nr_pages = folio_nr_pages(new_folio); + + next = folio_next(new_folio); + + zone_device_private_split_cb(folio, new_folio); + + expected_refs = folio_expected_ref_count(new_folio) + 1; + folio_ref_unfreeze(new_folio, expected_refs); + + if (do_lru) + lru_add_split_folio(folio, new_folio, lruvec, list); + + /* + * Anonymous folio with swap cache. + * NOTE: shmem in swap cache is not supported yet. + */ + if (ci) { + __swap_cache_replace_folio(ci, folio, new_folio); + continue; + } + + /* Anonymous folio without swap cache */ + if (!mapping) + continue; + + /* Add the new folio to the page cache. */ + if (new_folio->index < end) { + __xa_store(&mapping->i_pages, new_folio->index, + new_folio, 0); + continue; + } + + VM_WARN_ON_ONCE(!nr_shmem_dropped); + /* Drop folio beyond EOF: ->index >= end */ + if (shmem_mapping(mapping) && nr_shmem_dropped) + *nr_shmem_dropped += nr_pages; + else if (folio_test_clear_dirty(new_folio)) + folio_account_cleaned( + new_folio, inode_to_wb(mapping->host)); + __filemap_remove_folio(new_folio, NULL); + folio_put_refs(new_folio, nr_pages); + } + + zone_device_private_split_cb(folio, NULL); + /* + * Unfreeze @folio only after all page cache entries, which + * used to point to it, have been updated with new folios. 
+ * Otherwise, a parallel folio_try_get() can grab @folio + * and its caller can see stale page cache entries. + */ + expected_refs = folio_expected_ref_count(folio) + 1; + folio_ref_unfreeze(folio, expected_refs); + + if (do_lru) + unlock_page_lruvec(lruvec); + + if (ci) + swap_cluster_unlock(ci); + } else { + split_queue_unlock(ds_queue); + return -EAGAIN; + } + + return ret; +} + /** * __folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split @@ -3747,7 +3893,6 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order, * @lock_at: a page within @folio to be left locked to caller * @list: after-split folios will be put on it if non NULL * @split_type: perform uniform split or not (non-uniform split) - * @unmapped: The pages are already unmapped, they are migration entries. * * It calls __split_unmapped_folio() to perform uniform and non-uniform split. * It is in charge of checking whether the split is supported or not and @@ -3763,9 +3908,8 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order, */ static int __folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct page *lock_at, - struct list_head *list, enum split_type split_type, bool unmapped) + struct list_head *list, enum split_type split_type) { - struct deferred_split *ds_queue; XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct folio *end_folio = folio_next(folio); bool is_anon = folio_test_anon(folio); @@ -3776,7 +3920,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, int nr_shmem_dropped = 0; int remap_flags = 0; int extra_pins, ret; - pgoff_t end; + pgoff_t end = 0; bool is_hzp; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); @@ -3819,14 +3963,12 @@ static int __folio_split(struct folio *folio, unsigned int new_order, * is taken to serialise against parallel split or collapse * operations. */ - if (!unmapped) { - anon_vma = folio_get_anon_vma(folio); - if (!anon_vma) { - ret = -EBUSY; - goto out; - } - anon_vma_lock_write(anon_vma); + anon_vma = folio_get_anon_vma(folio); + if (!anon_vma) { + ret = -EBUSY; + goto out; } + anon_vma_lock_write(anon_vma); mapping = NULL; } else { unsigned int min_order; @@ -3880,8 +4022,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto out_unlock; } - if (!unmapped) - unmap_folio(folio); + unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); @@ -3898,142 +4039,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order, } } - /* Prevent deferred_split_scan() touching ->_refcount */ - ds_queue = folio_split_queue_lock(folio); - if (folio_ref_freeze(folio, 1 + extra_pins)) { - struct swap_cluster_info *ci = NULL; - struct lruvec *lruvec; - int expected_refs; - - if (old_order > 1) { - if (!list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; - /* - * Reinitialize page_deferred_list after removing the - * page from the split_queue, otherwise a subsequent - * split will see list corruption when checking the - * page_deferred_list. 
- */ - list_del_init(&folio->_deferred_list); - } - if (folio_test_partially_mapped(folio)) { - folio_clear_partially_mapped(folio); - mod_mthp_stat(old_order, - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); - } - } - split_queue_unlock(ds_queue); - if (mapping) { - int nr = folio_nr_pages(folio); - - if (folio_test_pmd_mappable(folio) && - new_order < HPAGE_PMD_ORDER) { - if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, - NR_SHMEM_THPS, -nr); - } else { - __lruvec_stat_mod_folio(folio, - NR_FILE_THPS, -nr); - filemap_nr_thps_dec(mapping); - } - } - } - - if (folio_test_swapcache(folio)) { - if (mapping) { - VM_WARN_ON_ONCE_FOLIO(mapping, folio); - ret = -EINVAL; - goto fail; - } - - ci = swap_cluster_get_and_lock(folio); - } - - /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ - lruvec = folio_lruvec_lock(folio); - - ret = __split_unmapped_folio(folio, new_order, split_at, &xas, - mapping, split_type); - - /* - * Unfreeze after-split folios and put them back to the right - * list. @folio should be kept frozon until page cache - * entries are updated with all the other after-split folios - * to prevent others seeing stale page cache entries. - * As a result, new_folio starts from the next folio of - * @folio. - */ - for (new_folio = folio_next(folio); new_folio != end_folio; - new_folio = next) { - unsigned long nr_pages = folio_nr_pages(new_folio); - - next = folio_next(new_folio); - - zone_device_private_split_cb(folio, new_folio); - - expected_refs = folio_expected_ref_count(new_folio) + 1; - folio_ref_unfreeze(new_folio, expected_refs); - - if (!unmapped) - lru_add_split_folio(folio, new_folio, lruvec, list); - - /* - * Anonymous folio with swap cache. - * NOTE: shmem in swap cache is not supported yet. - */ - if (ci) { - __swap_cache_replace_folio(ci, folio, new_folio); - continue; - } - - /* Anonymous folio without swap cache */ - if (!mapping) - continue; - - /* Add the new folio to the page cache. */ - if (new_folio->index < end) { - __xa_store(&mapping->i_pages, new_folio->index, - new_folio, 0); - continue; - } - - /* Drop folio beyond EOF: ->index >= end */ - if (shmem_mapping(mapping)) - nr_shmem_dropped += nr_pages; - else if (folio_test_clear_dirty(new_folio)) - folio_account_cleaned( - new_folio, inode_to_wb(mapping->host)); - __filemap_remove_folio(new_folio, NULL); - folio_put_refs(new_folio, nr_pages); - } - - zone_device_private_split_cb(folio, NULL); - /* - * Unfreeze @folio only after all page cache entries, which - * used to point to it, have been updated with new folios. - * Otherwise, a parallel folio_try_get() can grab @folio - * and its caller can see stale page cache entries. - */ - expected_refs = folio_expected_ref_count(folio) + 1; - folio_ref_unfreeze(folio, expected_refs); - - unlock_page_lruvec(lruvec); - - if (ci) - swap_cluster_unlock(ci); - } else { - split_queue_unlock(ds_queue); - ret = -EAGAIN; - } + ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping, + true, list, split_type, end, &nr_shmem_dropped, + extra_pins); fail: if (mapping) xas_unlock(&xas); local_irq_enable(); - if (unmapped) - return ret; - if (nr_shmem_dropped) shmem_uncharge(mapping->host, nr_shmem_dropped); @@ -4077,6 +4091,48 @@ out: return ret; } +/** + * folio_split_unmapped() - split a large anon folio that is already unmapped + * @folio: folio to split + * @new_order: the order of folios after split + * + * This function is a helper for splitting folios that have already been + * unmapped. 
The use case is that the device or the CPU can refuse to migrate + * THP pages in the middle of migration, due to allocation issues on either + * side. + * + * anon_vma_lock is not required to be held, mmap_read_lock() or + * mmap_write_lock() should be held. @folio is expected to be locked by the + * caller. device-private and non device-private folios are supported along + * with folios that are in the swapcache. @folio should also be unmapped and + * isolated from LRU (if applicable) + * + * Upon return, the folio is not remapped, split folios are not added to LRU, + * free_folio_and_swap_cache() is not called, and new folios remain locked. + * + * Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to + * insufficient reference count or extra pins). + */ +int folio_split_unmapped(struct folio *folio, unsigned int new_order) +{ + int extra_pins, ret = 0; + + VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio); + + if (!can_split_folio(folio, 1, &extra_pins)) + return -EAGAIN; + + local_irq_disable(); + ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL, + NULL, false, NULL, SPLIT_TYPE_UNIFORM, + 0, NULL, extra_pins); + local_irq_enable(); + return ret; +} + /* * This function splits a large folio into smaller folios of order @new_order. * @page can point to any page of the large folio to split. The split operation @@ -4125,12 +4181,12 @@ out: * with the folio. Splitting to order 0 is compatible with all folios. */ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - unsigned int new_order, bool unmapped) + unsigned int new_order) { struct folio *folio = page_folio(page); return __folio_split(folio, new_order, &folio->page, page, list, - SPLIT_TYPE_UNIFORM, unmapped); + SPLIT_TYPE_UNIFORM); } /** @@ -4161,7 +4217,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct list_head *list) { return __folio_split(folio, new_order, split_at, &folio->page, list, - SPLIT_TYPE_NON_UNIFORM, false); + SPLIT_TYPE_NON_UNIFORM); } int min_order_for_split(struct folio *folio) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index b1ce6e3478d6..23379663b1e1 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -916,8 +916,7 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, folio_get(folio); split_huge_pmd_address(migrate->vma, addr, true); - ret = __split_huge_page_to_list_to_order(folio_page(folio, 0), NULL, - 0, true); + ret = folio_split_unmapped(folio, 0); if (ret) return ret; migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND; From 218fbfad16341ae60b7d540ca65af016b9f83500 Mon Sep 17 00:00:00 2001 From: Peng Li Date: Mon, 17 Nov 2025 23:40:11 +0800 Subject: [PATCH 278/321] selftests/mm: gup_test: stop testing FOLL_TOUCH commit 0f20bba1688b ("mm/gup: explicitly define and check internal GUP flags, disallow FOLL_TOUCH") marked FOLL_TOUCH as a GUP-internal flag. This causes a warning to fire when running gup_test, for example: $ ./gup_test -L -r 100 -z dmesg: WARNING: CPU: 1 PID: 117 at mm/gup.c:2512 is_valid_gup_args+0x66/0x8c Therefore, remove the "FOLL_TOUCH" test code from gup_test.c. 
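For reference, the selftest now always pre-faults the mapping from user space instead of asking the kernel to do it via FOLL_TOUCH; roughly (illustrative sketch reusing the test's existing p, size and psize()):

	/* Fault every page in from user space; FOLL_TOUCH is no longer passed. */
	for (; (unsigned long)p < gup.addr + size; p += psize())
		p[0] = 0;
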
Link: https://lkml.kernel.org/r/20251117154012.197499-1-peng8420.li@gmail.com Signed-off-by: Peng Li Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand (Red Hat) Cc: Dan Williams Cc: Jason Gunthorpe Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_test.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index 8900b840c17a..75f7134d529d 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -19,7 +19,6 @@ /* Just the flags we need, copied from mm.h: */ #define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ #define GUP_TEST_FILE "/sys/kernel/debug/gup_test" @@ -93,7 +92,7 @@ int main(int argc, char **argv) { struct gup_test gup = { 0 }; int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; - int flags = MAP_PRIVATE, touch = 0; + int flags = MAP_PRIVATE; char *file = "/dev/zero"; pthread_t *tid; char *p; @@ -170,10 +169,6 @@ int main(int argc, char **argv) case 'H': flags |= (MAP_HUGETLB | MAP_ANONYMOUS); break; - case 'z': - /* fault pages in gup, do not fault in userland */ - touch = 1; - break; default: ksft_exit_fail_msg("Wrong argument\n"); } @@ -244,18 +239,9 @@ int main(int argc, char **argv) else if (thp == 0) madvise(p, size, MADV_NOHUGEPAGE); - /* - * FOLL_TOUCH, in gup_test, is used as an either/or case: either - * fault pages in from the kernel via FOLL_TOUCH, or fault them - * in here, from user space. This allows comparison of performance - * between those two cases. - */ - if (touch) { - gup.gup_flags |= FOLL_TOUCH; - } else { - for (; (unsigned long)p < gup.addr + size; p += psize()) - p[0] = 0; - } + /* Fault them in here, from user space. */ + for (; (unsigned long)p < gup.addr + size; p += psize()) + p[0] = 0; tid = malloc(sizeof(pthread_t) * nthreads); assert(tid); From 3e700b715e1cef66371027cefd3761eb8fa6e0d8 Mon Sep 17 00:00:00 2001 From: Peng Li Date: Mon, 17 Nov 2025 23:40:12 +0800 Subject: [PATCH 279/321] selftests/mm: gup_test: fix comment regarding origin of FOLL_WRITE The 'FOLL_WRITE' of the copied source is located in mm_types.h of mm, not mm.h, so fix it. Link: https://lkml.kernel.org/r/20251117154012.197499-2-peng8420.li@gmail.com Signed-off-by: Peng Li Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand (Red Hat) Cc: Dan Williams Cc: Jason Gunthorpe Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index 75f7134d529d..40c1538a17b4 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -17,7 +17,7 @@ #define MB (1UL << 20) -/* Just the flags we need, copied from mm.h: */ +/* Just the flags we need, copied from the kernel internals. */ #define FOLL_WRITE 0x01 /* check pte is writable */ #define GUP_TEST_FILE "/sys/kernel/debug/gup_test" From 7e44d00a13ca5691caf4f7c46541ee60bf75b208 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 10 Nov 2025 15:20:05 -0800 Subject: [PATCH 280/321] memcg: use mod_node_page_state to update stats Patch series "memcg: cleanup the memcg stats interfaces". The memcg stats are safe against irq (and nmi) context and thus does not require disabling irqs. 
However for some stats which are also maintained at node level, it is using irq unsafe interface and thus requiring the users to still disables irqs or use interfaces which explicitly disables irqs. Let's move memcg code to use irq safe node level stats function which is already optimized for architectures with HAVE_CMPXCHG_LOCAL (all major ones), so there will not be any performance penalty for its usage. This patch (of 4): The memcg stats are safe against irq (and nmi) context and thus does not require disabling irqs. However some code paths for memcg stats also update the node level stats and use irq unsafe interface and thus require the users to disable irqs. However node level stats, on architectures with HAVE_CMPXCHG_LOCAL (all major ones), has interface which does not require irq disabling. Let's move memcg stats code to start using that interface for node level stats. Link: https://lkml.kernel.org/r/20251110232008.1352063-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20251110232008.1352063-2-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- include/linux/vmstat.h | 4 ++-- mm/memcontrol.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cc6db20d7dca..1085d0460e66 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1408,7 +1408,7 @@ static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, { struct page *page = virt_to_head_page(p); - __mod_node_page_state(page_pgdat(page), idx, val); + mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c287998908bf..11a37aaa4dd9 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -557,7 +557,7 @@ static inline void mod_lruvec_page_state(struct page *page, static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void mod_lruvec_state(struct lruvec *lruvec, @@ -569,7 +569,7 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, static inline void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { - __mod_node_page_state(folio_pgdat(folio), idx, val); + mod_node_page_state(folio_pgdat(folio), idx, val); } static inline void lruvec_stat_mod_folio(struct folio *folio, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 623446821b00..7e6407b8bfb7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -770,7 +770,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { /* Update node */ - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); /* Update memcg and lruvec */ if (!mem_cgroup_disabled()) @@ -789,7 +789,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, /* Untracked pages have no memcg, no lruvec. 
Update only the node */ if (!memcg) { rcu_read_unlock(); - __mod_node_page_state(pgdat, idx, val); + mod_node_page_state(pgdat, idx, val); return; } @@ -815,7 +815,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) * vmstats to keep it correct for the root memcg. */ if (!memcg) { - __mod_node_page_state(pgdat, idx, val); + mod_node_page_state(pgdat, idx, val); } else { lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); From 469241fe7657dbec9e2948287ab7412955d8b73a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 10 Nov 2025 15:20:06 -0800 Subject: [PATCH 281/321] memcg: remove __mod_lruvec_kmem_state __mod_lruvec_kmem_state() is already safe against irqs, so there is no need to have a separate interface (i.e. mod_lruvec_kmem_state) which wraps calls to it with irq disabling and reenabling. Let's rename __mod_lruvec_kmem_state() to mod_lruvec_kmem_state(). Link: https://lkml.kernel.org/r/20251110232008.1352063-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Reviewed-by: Qi Zheng Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 28 +++++----------------------- mm/memcontrol.c | 2 +- mm/workingset.c | 2 +- 3 files changed, 7 insertions(+), 25 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1085d0460e66..d35390f9892a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -957,17 +957,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, void mem_cgroup_flush_stats(struct mem_cgroup *memcg); void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); -void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); - -static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, - int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_kmem_state(p, idx, val); - local_irq_restore(flags); -} +void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); @@ -1403,14 +1393,6 @@ static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } -static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, - int val) -{ - struct page *page = virt_to_head_page(p); - - mod_node_page_state(page_pgdat(page), idx, val); -} - static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { @@ -1470,14 +1452,14 @@ struct slabobj_ext { #endif } __aligned(8); -static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) +static inline void inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_kmem_state(p, idx, 1); + mod_lruvec_kmem_state(p, idx, 1); } -static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) +static inline void dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_kmem_state(p, idx, -1); + mod_lruvec_kmem_state(p, idx, -1); } static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e6407b8bfb7..ae154f51931e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -799,7 +799,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, } EXPORT_SYMBOL(__lruvec_stat_mod_folio); -void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) +void 
mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { pg_data_t *pgdat = page_pgdat(virt_to_page(p)); struct mem_cgroup *memcg; diff --git a/mm/workingset.c b/mm/workingset.c index 68a76a91111f..6ff30369b758 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -749,7 +749,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); - __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); + inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); From 5b3eb779a20cf30d74bb346d2a1e525bc9072685 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 10 Nov 2025 15:20:07 -0800 Subject: [PATCH 282/321] memcg: remove __mod_lruvec_state __mod_lruvec_state() is already safe against irqs, so there is no need to have a separate interface (i.e. mod_lruvec_state) which wraps calls to it with irq disabling and reenabling. Let's rename __mod_lruvec_state() to mod_lruvec_state(). Link: https://lkml.kernel.org/r/20251110232008.1352063-4-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 2 +- include/linux/vmstat.h | 18 +----------------- mm/memcontrol.c | 8 ++++---- mm/migrate.c | 20 ++++++++++---------- mm/vmscan.c | 4 ++-- 5 files changed, 18 insertions(+), 34 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ca7a18351797..b58f34c4fe92 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -44,7 +44,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); - __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); + mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 11a37aaa4dd9..4eb7753e6e5c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -520,19 +520,9 @@ static inline const char *vm_event_name(enum vm_event_item item) #ifdef CONFIG_MEMCG -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -static inline void mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_state(lruvec, idx, val); - local_irq_restore(flags); -} - void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); @@ -554,12 +544,6 @@ static inline void mod_lruvec_page_state(struct page *page, #else -static inline void __mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - mod_node_page_state(lruvec_pgdat(lruvec), idx, val); -} - static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae154f51931e..9a659f16af77 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -757,7 +757,7 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec, } /** - * __mod_lruvec_state - update lruvec memory statistics + * mod_lruvec_state - update lruvec memory statistics * @lruvec: the lruvec * @idx: the stat item * @val: delta to 
add to the counter, can be negative @@ -766,7 +766,7 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec, * function updates the all three counters that are affected by a * change of state at this level: per-node, per-cgroup, per-lruvec. */ -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { /* Update node */ @@ -794,7 +794,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, } lruvec = mem_cgroup_lruvec(memcg, pgdat); - __mod_lruvec_state(lruvec, idx, val); + mod_lruvec_state(lruvec, idx, val); rcu_read_unlock(); } EXPORT_SYMBOL(__lruvec_stat_mod_folio); @@ -818,7 +818,7 @@ void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) mod_node_page_state(pgdat, idx, val); } else { lruvec = mem_cgroup_lruvec(memcg, pgdat); - __mod_lruvec_state(lruvec, idx, val); + mod_lruvec_state(lruvec, idx, val); } rcu_read_unlock(); } diff --git a/mm/migrate.c b/mm/migrate.c index b2ad78bf85d5..5169f9717f60 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -675,27 +675,27 @@ static int __folio_migrate_mapping(struct address_space *mapping, old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); - __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); - __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); + mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); + mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) { - __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); - __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); + mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); + mod_lruvec_state(new_lruvec, NR_SHMEM, nr); if (folio_test_pmd_mappable(folio)) { - __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr); - __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr); + mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr); + mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr); } } #ifdef CONFIG_SWAP if (folio_test_swapcache(folio)) { - __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); - __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); + mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); + mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); } #endif if (dirty && mapping_can_writeback(mapping)) { - __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); + mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); - __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); + mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 51ffd32e6e01..720772baf2a7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2018,7 +2018,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); - __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), + mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); @@ -4744,7 +4744,7 @@ retry: reset_batch_size(walk); } - __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), + mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); From c1bd09994c4d5b897571671bed16581335e93242 Mon Sep 17 00:00:00 2001 From: 
Shakeel Butt Date: Mon, 10 Nov 2025 15:20:08 -0800 Subject: [PATCH 283/321] memcg: remove __lruvec_stat_mod_folio __lruvec_stat_mod_folio() is already safe against irqs, so there is no need to have a separate interface (i.e. lruvec_stat_mod_folio) which wraps calls to it with irq disabling and reenabling. Let's rename __lruvec_stat_mod_folio() to lruvec_stat_mod_folio(). Link: https://lkml.kernel.org/r/20251110232008.1352063-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 30 +----------------------------- mm/filemap.c | 20 ++++++++++---------- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 8 ++++---- mm/memcontrol.c | 4 ++-- mm/page-writeback.c | 2 +- mm/rmap.c | 4 ++-- mm/shmem.c | 6 +++--- 8 files changed, 25 insertions(+), 53 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 4eb7753e6e5c..3398a345bda8 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -523,19 +523,9 @@ static inline const char *vm_event_name(enum vm_event_item item) void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -void __lruvec_stat_mod_folio(struct folio *folio, +void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); -static inline void lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __lruvec_stat_mod_folio(folio, idx, val); - local_irq_restore(flags); -} - static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { @@ -550,12 +540,6 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } -static inline void __lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - mod_node_page_state(folio_pgdat(folio), idx, val); -} - static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { @@ -570,18 +554,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __lruvec_stat_add_folio(struct folio *folio, - enum node_stat_item idx) -{ - __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); -} - -static inline void __lruvec_stat_sub_folio(struct folio *folio, - enum node_stat_item idx) -{ - __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); -} - static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { diff --git a/mm/filemap.c b/mm/filemap.c index 07634b7d9934..7d15a9c216ef 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -182,13 +182,13 @@ static void filemap_unaccount_folio(struct address_space *mapping, nr = folio_nr_pages(folio); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else if (folio_test_pmd_mappable(folio)) { - __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); + lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags)) @@ -844,13 +844,13 @@ void 
replace_page_cache_folio(struct folio *old, struct folio *new) old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ if (!folio_test_hugetlb(old)) - __lruvec_stat_sub_folio(old, NR_FILE_PAGES); + lruvec_stat_sub_folio(old, NR_FILE_PAGES); if (!folio_test_hugetlb(new)) - __lruvec_stat_add_folio(new, NR_FILE_PAGES); + lruvec_stat_add_folio(new, NR_FILE_PAGES); if (folio_test_swapbacked(old)) - __lruvec_stat_sub_folio(old, NR_SHMEM); + lruvec_stat_sub_folio(old, NR_SHMEM); if (folio_test_swapbacked(new)) - __lruvec_stat_add_folio(new, NR_SHMEM); + lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) free_folio(old); @@ -933,9 +933,9 @@ noinline int __filemap_add_folio(struct address_space *mapping, /* hugetlb pages do not participate in page cache accounting */ if (!huge) { - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, + lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 53a8d380eab2..7af3e037d891 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3783,10 +3783,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n if (folio_test_pmd_mappable(folio) && new_order < HPAGE_PMD_ORDER) { if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else { - __lruvec_stat_mod_folio(folio, + lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 40f9d5939aa5..89c33ef7aac3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2195,14 +2195,14 @@ immap_locked: } if (is_shmem) - __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); + lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); else - __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); + lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); if (nr_none) { - __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); + lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. 
*/ - __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); + lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); } /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9a659f16af77..9b07db2cb232 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -777,7 +777,7 @@ void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, mod_memcg_lruvec_state(lruvec, idx, val); } -void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, +void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { struct mem_cgroup *memcg; @@ -797,7 +797,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, mod_lruvec_state(lruvec, idx, val); rcu_read_unlock(); } -EXPORT_SYMBOL(__lruvec_stat_mod_folio); +EXPORT_SYMBOL(lruvec_stat_mod_folio); void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 757bc4d3b5b5..d6b339cc876d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2658,7 +2658,7 @@ static void folio_account_dirtied(struct folio *folio, inode_attach_wb(inode, folio); wb = inode_to_wb(inode); - __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); __node_stat_mod_folio(folio, NR_DIRTIED, nr); wb_stat_mod(wb, WB_RECLAIMABLE, nr); diff --git a/mm/rmap.c b/mm/rmap.c index d871f2eb821c..f955f02d570e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1212,12 +1212,12 @@ static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) if (nr) { idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; - __lruvec_stat_mod_folio(folio, idx, nr); + lruvec_stat_mod_folio(folio, idx, nr); } if (nr_pmdmapped) { if (folio_test_anon(folio)) { idx = NR_ANON_THPS; - __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); + lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); } else { /* NR_*_PMDMAPPED are not maintained per-memcg */ idx = folio_test_swapbacked(folio) ? diff --git a/mm/shmem.c b/mm/shmem.c index fc835b3e4914..ad18172ff831 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -871,9 +871,9 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index static void shmem_update_stats(struct folio *folio, int nr_pages) { if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); } /* From 07003531e03c864b0711757c8009c7f14c95d1d1 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 21 Nov 2025 01:44:02 -0800 Subject: [PATCH 284/321] mm/vmalloc: warn on invalid vmalloc gfp flags Patch series "make vmalloc gfp flags usage more apparent", v4. We should do a better job at enforcing gfp flags for vmalloc. Right now, we have a kernel-doc for __vmalloc_node_range(), and hope callers pass in supported flags. If a caller were to pass in an unsupported flag, we may BUG, silently clear it, or completely ignore it. If we are more proactive about enforcing gfp flags, we can making sure callers know when they may be asking for unsupported behavior. This patchset lets vmalloc control the incoming gfp flags, and cleans up some hard to read gfp code. 
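As a rough illustration of the intended effect (hypothetical caller, not taken from this series): an unsupported flag is now masked off with a one-time warning instead of being silently honored, dropped, or tripping a BUG:

	/* Hypothetical caller: __GFP_DMA is not a supported vmalloc flag. */
	void *p = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_DMA);
	/*
	 * vmalloc_fix_flags() clears __GFP_DMA, prints a single
	 * "Unexpected gfp" warning, and the allocation proceeds with
	 * GFP_KERNEL alone.
	 */
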
This patch (of 4): Vmalloc explicitly supports a list of flags, but we never enforce them. vmalloc has been trying to handle unsupported flags by clearing and setting flags wherever necessary. This is messy and makes the code harder to understand, when we could simply check for a supported input immediately instead. Define a helper mask and function telling callers they have passed in invalid flags, and clear those unsupported vmalloc flags. Link: https://lkml.kernel.org/r/20251121094405.40628-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20251121094405.40628-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: "Uladzislau Rezki (Sony)" Acked-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/vmalloc.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0832f944544c..c97c874b6666 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3911,6 +3911,28 @@ fail: return NULL; } +/* + * See __vmalloc_node_range() for a clear list of supported vmalloc flags. + * This gfp lists all flags currently passed through vmalloc. Currently, + * __GFP_ZERO is used by BPF and __GFP_NORETRY is used by percpu. Both drm + * and BPF also use GFP_USER. Additionally, various users pass + * GFP_KERNEL_ACCOUNT. Xfs uses __GFP_NOLOCKDEP. + */ +#define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\ + __GFP_NOFAIL | __GFP_ZERO | __GFP_NORETRY |\ + GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\ + GFP_USER | __GFP_NOLOCKDEP) + +static gfp_t vmalloc_fix_flags(gfp_t flags) +{ + gfp_t invalid_mask = flags & ~GFP_VMALLOC_SUPPORTED; + + flags &= GFP_VMALLOC_SUPPORTED; + WARN_ONCE(1, "Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", + invalid_mask, &invalid_mask, flags, &flags); + return flags; +} + /** * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size @@ -4092,6 +4114,8 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { + if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED)) + gfp_mask = vmalloc_fix_flags(gfp_mask); return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } @@ -4131,6 +4155,8 @@ EXPORT_SYMBOL(vmalloc_noprof); */ void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) { + if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED)) + gfp_mask = vmalloc_fix_flags(gfp_mask); return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); From bb4d3c76860d555aa4cd4b2da44111057066402d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 21 Nov 2025 01:44:03 -0800 Subject: [PATCH 285/321] mm/vmalloc: add a helper to optimize vmalloc allocation gfps vm_area_alloc_pages() attempts to use different gfp flags as a way to optimize allocations. This has been done inline which makes things harder to read. Add a helper function to make the code more readable. 
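The helper's effect can be spelled out with a couple of illustrative runtime checks (sketch only, not part of the patch): __GFP_NOWARN is always added, and __GFP_NOFAIL is dropped only for the high-order attempt:

	/* Illustrative expectations for vmalloc_gfp_adjust(): */
	WARN_ON(vmalloc_gfp_adjust(GFP_KERNEL, false) !=
		(GFP_KERNEL | __GFP_NOWARN));
	WARN_ON(vmalloc_gfp_adjust(GFP_KERNEL | __GFP_NOFAIL, true) !=
		(GFP_KERNEL | __GFP_NOWARN));
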
Link: https://lkml.kernel.org/r/20251121094405.40628-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: "Uladzislau Rezki (Sony)" Acked-by: SeongJae Park Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmalloc.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c97c874b6666..185add3f5606 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3614,6 +3614,17 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ +/* + * Helper for vmalloc to adjust the gfp flags for certain allocations. + */ +static inline gfp_t vmalloc_gfp_adjust(gfp_t flags, const bool large) +{ + flags |= __GFP_NOWARN; + if (large) + flags &= ~__GFP_NOFAIL; + return flags; +} + static inline unsigned int vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) @@ -3852,9 +3863,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * Please note, the __vmalloc_node_range_noprof() falls-back * to order-0 pages if high-order attempt is unsuccessful. */ - area->nr_pages = vm_area_alloc_pages((page_order ? - gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN, - node, page_order, nr_small_pages, area->pages); + area->nr_pages = vm_area_alloc_pages( + vmalloc_gfp_adjust(gfp_mask, page_order), node, + page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); /* All pages of vm should be charged to same memcg, so use first one. */ From 75f20b17440fdaee33a6de2efe3c162e6b19e18e Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 21 Nov 2025 01:44:04 -0800 Subject: [PATCH 286/321] mm/vmalloc: cleanup large_gfp in vm_area_alloc_pages() Now that we have already checked for unsupported flags, we can use the helper function to set the necessary gfp flags for the large order allocation optimization. Link: https://lkml.kernel.org/r/20251121094405.40628-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: "Uladzislau Rezki (Sony)" Acked-by: SeongJae Park Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmalloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 185add3f5606..134f615fd3bd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3634,10 +3634,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int max_attempt_order = MAX_PAGE_ORDER; struct page *page; int i; - gfp_t large_gfp = (gfp & - ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL | __GFP_COMP)) - | __GFP_NOWARN; unsigned int large_order = ilog2(nr_remaining); + gfp_t large_gfp = vmalloc_gfp_adjust(gfp, large_order) & ~__GFP_DIRECT_RECLAIM; large_order = min(max_attempt_order, large_order); From d85b653f2c469285b760558bccbee4a45e47f3e2 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 21 Nov 2025 01:44:05 -0800 Subject: [PATCH 287/321] mm/vmalloc: cleanup gfp flag use in new_vmap_block() The only caller, vb_alloc(), passes GFP_KERNEL into new_vmap_block() which is a subset of GFP_RECLAIM_MASK. Since there's no reason to use this mask here, remove it. 
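The claim is easy to check: GFP_KERNEL is __GFP_RECLAIM | __GFP_IO | __GFP_FS, all of which are contained in GFP_RECLAIM_MASK, so the removed masking was a no-op for this caller. Illustrative check only, not part of the patch:

	/* Holds by the definitions of the two masks. */
	BUILD_BUG_ON((GFP_KERNEL & GFP_RECLAIM_MASK) != GFP_KERNEL);
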
Link: https://lkml.kernel.org/r/20251121094405.40628-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: "Uladzislau Rezki (Sony)" Acked-by: SeongJae Park Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmalloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 134f615fd3bd..ecbac900c35f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2699,8 +2699,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) node = numa_node_id(); - vb = kmalloc_node(sizeof(struct vmap_block), - gfp_mask & GFP_RECLAIM_MASK, node); + vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask, node); if (unlikely(!vb)) return ERR_PTR(-ENOMEM); From 277a1ae3879a82a15a2e2d6741e38e31ea6487ee Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 13 Nov 2025 15:28:01 +0800 Subject: [PATCH 288/321] mm: softdirty: add pgtable_supports_soft_dirty() Patch series "mm: Add soft-dirty and uffd-wp support for RISC-V", v15. This patchset adds support for Svrsw60t59b [1] extension which is ratified now, also add soft dirty and userfaultfd write protect tracking for RISC-V. The patches 1 and 2 add macros to allow architectures to define their own checks if the soft-dirty / uffd_wp PTE bits are available, in other words for RISC-V, the Svrsw60t59b extension is supported on which device the kernel is running. Also patch1-2 are removing "ifdef CONFIG_MEM_SOFT_DIRTY" "ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP" and "ifdef CONFIG_PTE_MARKER_UFFD_WP" in favor of checks which if not overridden by the architecture, no change in behavior is expected. This patchset has been tested with kselftest mm suite in which soft-dirty, madv_populate, test_unmerge_uffd_wp, and uffd-unit-tests run and pass, and no regressions are observed in any of the other tests. This patch (of 6): Some platforms can customize the PTE PMD entry soft-dirty bit making it unavailable even if the architecture provides the resource. Add an API which architectures can define their specific implementations to detect if soft-dirty bit is available on which device the kernel is running. This patch is removing "ifdef CONFIG_MEM_SOFT_DIRTY" in favor of pgtable_supports_soft_dirty() checks that defaults to IS_ENABLED(CONFIG_MEM_SOFT_DIRTY), if not overridden by the architecture, no change in behavior is expected. We make sure to never set VM_SOFTDIRTY if !pgtable_supports_soft_dirty(), so we will never run into VM_SOFTDIRTY checks. 
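As a rough sketch of the kind of per-architecture override this enables (hypothetical names, not taken from this series; the RISC-V patches later in the series carry the real implementation):

	/* arch/<arch>/include/asm/pgtable.h -- hypothetical override */
	#define pgtable_supports_soft_dirty()	arch_cpu_has_soft_dirty_bit()

	/* Generic code then simply bails out when the bit is unavailable: */
	static inline void clear_soft_dirty(struct vm_area_struct *vma,
					    unsigned long addr, pte_t *pte)
	{
		if (!pgtable_supports_soft_dirty())
			return;
		/* ... existing soft-dirty clearing ... */
	}
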
[lorenzo.stoakes@oracle.com: fix VMA selftests] Link: https://lkml.kernel.org/r/dac6ddfe-773a-43d5-8f69-021b9ca4d24b@lucifer.local Link: https://lkml.kernel.org/r/20251113072806.795029-1-zhangchunyan@iscas.ac.cn Link: https://lkml.kernel.org/r/20251113072806.795029-2-zhangchunyan@iscas.ac.cn Link: https://github.com/riscv-non-isa/riscv-iommu/pull/543 [1] Signed-off-by: Chunyan Zhang Acked-by: David Hildenbrand Cc: Albert Ou Cc: Alexandre Ghiti Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Christian Brauner Cc: Conor Dooley Cc: Deepak Gupta Cc: Jan Kara Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Rob Herring Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yuanchu Xie Cc: Alexandre Ghiti Cc: Andrew Jones Cc: Conor Dooley Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 15 ++++++--------- include/linux/mm.h | 3 +++ include/linux/pgtable.h | 12 ++++++++++++ mm/debug_vm_pgtable.c | 10 +++++----- mm/huge_memory.c | 13 +++++++------ mm/internal.h | 2 +- mm/mmap.c | 6 ++++-- mm/mremap.c | 13 +++++++------ mm/userfaultfd.c | 10 ++++------ mm/vma.c | 6 ++++-- mm/vma_exec.c | 5 ++++- tools/testing/vma/vma_internal.h | 2 ++ 12 files changed, 59 insertions(+), 38 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 41b062ce6ad8..2b4ab5718ab5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1584,8 +1584,6 @@ struct clear_refs_private { enum clear_refs_types type; }; -#ifdef CONFIG_MEM_SOFT_DIRTY - static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct folio *folio; @@ -1605,6 +1603,8 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { + if (!pgtable_supports_soft_dirty()) + return; /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. 
See the @@ -1630,19 +1630,16 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, set_pte_at(vma->vm_mm, addr, pte, ptent); } } -#else -static inline void clear_soft_dirty(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte) -{ -} -#endif -#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; + if (!pgtable_supports_soft_dirty()) + return; + if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); diff --git a/include/linux/mm.h b/include/linux/mm.h index bf660d5b6e97..75f894c3f521 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -859,6 +859,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); ACCESS_PRIVATE(vma, __vm_flags) = flags; } @@ -870,6 +871,7 @@ static inline void vm_flags_init(struct vm_area_struct *vma, static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_assert_write_locked(vma); vm_flags_init(vma, flags); } @@ -891,6 +893,7 @@ static inline void vm_flags_set(struct vm_area_struct *vma, static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 32e8457ad535..b13b6f42be3c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1553,6 +1553,18 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif +/* + * Some platforms can customize the PTE soft-dirty bit making it unavailable + * even if the architecture provides the resource. + * Adding this API allows architectures to add their own checks for the + * devices on which the kernel is running. + * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY + * is part of this macro. 
+ */ +#ifndef pgtable_supports_soft_dirty +#define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) +#endif + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1eae87dbef73..ae9b9310d96f 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -704,7 +704,7 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) { pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return; pr_debug("Validating PTE soft dirty\n"); @@ -717,7 +717,7 @@ static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) pte_t pte; softleaf_t entry; - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return; pr_debug("Validating PTE swap soft dirty\n"); @@ -734,7 +734,7 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return; if (!has_transparent_hugepage()) @@ -750,8 +750,8 @@ static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) || - !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION)) + if (!pgtable_supports_soft_dirty() || + !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION)) return; if (!has_transparent_hugepage()) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7af3e037d891..041b554c7115 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2427,12 +2427,13 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, static pmd_t move_soft_dirty_pmd(pmd_t pmd) { -#ifdef CONFIG_MEM_SOFT_DIRTY - if (unlikely(pmd_is_migration_entry(pmd))) - pmd = pmd_swp_mksoft_dirty(pmd); - else if (pmd_present(pmd)) - pmd = pmd_mksoft_dirty(pmd); -#endif + if (pgtable_supports_soft_dirty()) { + if (unlikely(pmd_is_migration_entry(pmd))) + pmd = pmd_swp_mksoft_dirty(pmd); + else if (pmd_present(pmd)) + pmd = pmd_mksoft_dirty(pmd); + } + return pmd; } diff --git a/mm/internal.h b/mm/internal.h index 929bc4a5dd98..04c307ee33ae 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1554,7 +1554,7 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) * will be constantly true. */ - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return false; /* diff --git a/mm/mmap.c b/mm/mmap.c index dc51680824ec..4bdb9ffa9e25 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1448,8 +1448,10 @@ static struct vm_area_struct *__install_special_mapping( return ERR_PTR(-ENOMEM); vma_set_range(vma, addr, addr + len, 0); - vm_flags_init(vma, (vm_flags | mm->def_flags | - VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); + vm_flags |= mm->def_flags | VM_DONTEXPAND; + if (pgtable_supports_soft_dirty()) + vm_flags |= VM_SOFTDIRTY; + vm_flags_init(vma, vm_flags & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; diff --git a/mm/mremap.c b/mm/mremap.c index fdb0485ede74..672264807db6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -165,12 +165,13 @@ static pte_t move_soft_dirty_pte(pte_t pte) * Set soft dirty bit so we can notice * in userspace the ptes were moved. 
*/ -#ifdef CONFIG_MEM_SOFT_DIRTY - if (pte_present(pte)) - pte = pte_mksoft_dirty(pte); - else - pte = pte_swp_mksoft_dirty(pte); -#endif + if (pgtable_supports_soft_dirty()) { + if (pte_present(pte)) + pte = pte_mksoft_dirty(pte); + else + pte = pte_swp_mksoft_dirty(pte); + } + return pte; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index bd1f74a7a5ac..e6dfd5f28acd 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1119,9 +1119,8 @@ static long move_present_ptes(struct mm_struct *mm, orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); /* Set soft dirty bit so userspace can notice the pte was moved */ -#ifdef CONFIG_MEM_SOFT_DIRTY - orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); -#endif + if (pgtable_supports_soft_dirty()) + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); if (pte_dirty(orig_src_pte)) orig_dst_pte = pte_mkdirty(orig_dst_pte); orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); @@ -1208,9 +1207,8 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); -#ifdef CONFIG_MEM_SOFT_DIRTY - orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); -#endif + if (pgtable_supports_soft_dirty()) + orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); diff --git a/mm/vma.c b/mm/vma.c index 4e21c988054d..fc90befd162f 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2559,7 +2559,8 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) * then new mapped in-place (which must be aimed as * a completely new data area). */ - vm_flags_set(vma, VM_SOFTDIRTY); + if (pgtable_supports_soft_dirty()) + vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } @@ -2864,7 +2865,8 @@ out: mm->data_vm += len >> PAGE_SHIFT; if (vm_flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); - vm_flags_set(vma, VM_SOFTDIRTY); + if (pgtable_supports_soft_dirty()) + vm_flags_set(vma, VM_SOFTDIRTY); return 0; mas_store_fail: diff --git a/mm/vma_exec.c b/mm/vma_exec.c index 922ee51747a6..8134e1afca68 100644 --- a/mm/vma_exec.c +++ b/mm/vma_exec.c @@ -107,6 +107,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, unsigned long *top_mem_p) { + unsigned long flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; int err; struct vm_area_struct *vma = vm_area_alloc(mm); @@ -137,7 +138,9 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); + if (pgtable_supports_soft_dirty()) + flags |= VM_SOFTDIRTY; + vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 81b501f51948..be99056c5d56 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -212,6 +212,8 @@ typedef __bitwise unsigned int vm_fault_t; #define ASSERT_EXCLUSIVE_WRITER(x) +#define pgtable_supports_soft_dirty() 1 + /** * swap - swap values of @a and @b * @a: first value From f59c0924d61aa2a2bb85936a593140f327112787 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 13 Nov 2025 15:28:02 +0800 Subject: [PATCH 289/321] mm: userfaultfd: add 
pgtable_supports_uffd_wp() Some platforms can customize the PTE/PMD entry uffd-wp bit making it unavailable even if the architecture provides the resource. This patch adds a macro API pgtable_supports_uffd_wp() that allows architectures to define their specific implementations to check if the uffd-wp bit is available on which device the kernel is running. Also this patch is removing "ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP" and "ifdef CONFIG_PTE_MARKER_UFFD_WP" in favor of pgtable_supports_uffd_wp() and uffd_supports_wp_marker() checks respectively that default to IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) and "IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) && IS_ENABLED(CONFIG_PTE_MARKER_UFFD_WP)" if not overridden by the architecture, no change in behavior is expected. Link: https://lkml.kernel.org/r/20251113072806.795029-3-zhangchunyan@iscas.ac.cn Signed-off-by: Chunyan Zhang Acked-by: David Hildenbrand Cc: Albert Ou Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Al Viro Cc: Andrew Jones Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Christian Brauner Cc: Conor Dooley Cc: Conor Dooley Cc: Deepak Gupta Cc: Jan Kara Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Rob Herring Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 22 +++++----- include/asm-generic/pgtable_uffd.h | 17 ++++++++ include/linux/mm_inline.h | 8 ++-- include/linux/userfaultfd_k.h | 69 ++++++++++++++++++------------ mm/memory.c | 6 ++- 5 files changed, 78 insertions(+), 44 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 3f539aabc3b3..5a0d19dec7ba 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1289,9 +1289,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP - goto out; -#endif + if (!pgtable_supports_uffd_wp()) + goto out; + vm_flags |= VM_UFFD_WP; } if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { @@ -1999,14 +1999,14 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, uffdio_api.features &= ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); #endif -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP - uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; -#endif -#ifndef CONFIG_PTE_MARKER_UFFD_WP - uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; - uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; - uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; -#endif + if (!pgtable_supports_uffd_wp()) + uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; + + if (!uffd_supports_wp_marker()) { + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; + } ret = -EINVAL; if (features & ~uffdio_api.features) diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h index 828966d4c281..0d85791efdf7 100644 --- a/include/asm-generic/pgtable_uffd.h +++ b/include/asm-generic/pgtable_uffd.h @@ -1,6 +1,23 @@ #ifndef _ASM_GENERIC_PGTABLE_UFFD_H #define _ASM_GENERIC_PGTABLE_UFFD_H +/* + * Some platforms can customize the uffd-wp bit, making it unavailable + * even if the architecture provides the resource. + * Adding this API allows architectures to add their own checks for the + * devices on which the kernel is running. 
+ * Note: When overriding it, please make sure the + * CONFIG_HAVE_ARCH_USERFAULTFD_WP is part of this macro. + */ +#ifndef pgtable_supports_uffd_wp +#define pgtable_supports_uffd_wp() IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) +#endif + +static inline bool uffd_supports_wp_marker(void) +{ + return pgtable_supports_uffd_wp() && IS_ENABLED(CONFIG_PTE_MARKER_UFFD_WP); +} + #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP static __always_inline int pte_uffd_wp(pte_t pte) { diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b58f34c4fe92..fa2d6ba811b5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -553,7 +553,6 @@ static inline pte_marker copy_pte_marker( return dstm; } -#endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to @@ -571,9 +570,11 @@ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; + if (!uffd_supports_wp_marker()) + return false; + /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); @@ -602,7 +603,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } -#endif + return false; } @@ -616,6 +617,7 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma) return true; } +#endif /** * num_pages_contiguous() - determine the number of contiguous pages diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 96b089dff4ef..fd5f42765497 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -228,15 +228,14 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, if (wp_async && (vm_flags == VM_UFFD_WP)) return true; -#ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for * uffd-wp, then shmem & hugetlbfs are not supported but only * anonymous. */ - if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) + if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && + !vma_is_anonymous(vma)) return false; -#endif /* By default, allow any of anon|shmem|hugetlb */ return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || @@ -291,6 +290,43 @@ void userfaultfd_release_new(struct userfaultfd_ctx *ctx); void userfaultfd_release_all(struct mm_struct *mm, struct userfaultfd_ctx *ctx); +static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) +{ + /* Only wr-protect mode uses pte markers */ + if (!userfaultfd_wp(vma)) + return false; + + /* File-based uffd-wp always need markers */ + if (!vma_is_anonymous(vma)) + return true; + + /* + * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED + * enabled (to apply markers on zero pages). + */ + return userfaultfd_wp_unpopulated(vma); +} + +/* + * Returns true if this is a swap pte and was uffd-wp wr-protected in either + * forms (pte marker or a normal swap pte), false otherwise. 
+ */ +static inline bool pte_swp_uffd_wp_any(pte_t pte) +{ + if (!uffd_supports_wp_marker()) + return false; + + if (pte_present(pte)) + return false; + + if (pte_swp_uffd_wp(pte)) + return true; + + if (pte_is_uffd_wp_marker(pte)) + return true; + + return false; +} #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -415,23 +451,9 @@ static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) return false; } -#endif /* CONFIG_USERFAULTFD */ - static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { - /* Only wr-protect mode uses pte markers */ - if (!userfaultfd_wp(vma)) - return false; - - /* File-based uffd-wp always need markers */ - if (!vma_is_anonymous(vma)) - return true; - - /* - * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED - * enabled (to apply markers on zero pages). - */ - return userfaultfd_wp_unpopulated(vma); + return false; } /* @@ -440,16 +462,7 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) */ static inline bool pte_swp_uffd_wp_any(pte_t pte) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP - if (pte_present(pte)) - return false; - if (pte_swp_uffd_wp(pte)) - return true; - - if (pte_is_uffd_wp_marker(pte)) - return true; -#endif return false; } - +#endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/mm/memory.c b/mm/memory.c index 50b93b45b174..6675e87eb7dd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1590,7 +1590,9 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, { bool was_installed = false; -#ifdef CONFIG_PTE_MARKER_UFFD_WP + if (!uffd_supports_wp_marker()) + return false; + /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) return false; @@ -1607,7 +1609,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, pte++; addr += PAGE_SIZE; } -#endif + return was_installed; } From 59f6acb4be0209b1451ac186c6d7d4175889c949 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 13 Nov 2025 15:28:03 +0800 Subject: [PATCH 290/321] riscv: add RISC-V Svrsw60t59b extension support The Svrsw60t59b extension allows to free the PTE reserved bits 60 and 59 for software to use. Link: https://lkml.kernel.org/r/20251113072806.795029-4-zhangchunyan@iscas.ac.cn Signed-off-by: Chunyan Zhang Reviewed-by: Alexandre Ghiti Reviewed-by: Andrew Jones Reviewed-by: Deepak Gupta Cc: Albert Ou Cc: Alexandre Ghiti Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Christian Brauner Cc: Conor Dooley Cc: Conor Dooley Cc: David Hildenbrand Cc: Jan Kara Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Rob Herring Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/riscv/Kconfig | 14 ++++++++++++++ arch/riscv/include/asm/hwcap.h | 1 + arch/riscv/kernel/cpufeature.c | 1 + 3 files changed, 16 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 22cda9c452d2..829d95d173cf 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -849,6 +849,20 @@ config RISCV_ISA_ZICBOP If you don't know what to do here, say Y. +config RISCV_ISA_SVRSW60T59B + bool "Svrsw60t59b extension support for using PTE bits 60 and 59" + depends on MMU && 64BIT + depends on RISCV_ALTERNATIVE + default y + help + Adds support to dynamically detect the presence of the Svrsw60t59b + extension and enable its usage. + + The Svrsw60t59b extension allows to free the PTE reserved bits 60 + and 59 for software to use. 
+ + If you don't know what to do here, say Y. + config TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI def_bool y # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=aed44286efa8ae8717a77d94b51ac3614e2ca6dc diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index affd63e11b0a..f98fcb5c17d5 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -106,6 +106,7 @@ #define RISCV_ISA_EXT_ZAAMO 97 #define RISCV_ISA_EXT_ZALRSC 98 #define RISCV_ISA_EXT_ZICBOP 99 +#define RISCV_ISA_EXT_SVRSW60T59B 100 #define RISCV_ISA_EXT_XLINUXENVCFG 127 diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 72ca768f4e91..5441282656a7 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -539,6 +539,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL), __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT), __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT), + __RISCV_ISA_EXT_DATA(svrsw60t59b, RISCV_ISA_EXT_SVRSW60T59B), __RISCV_ISA_EXT_DATA(svvptc, RISCV_ISA_EXT_SVVPTC), }; From 2a3ebad4db63e86a9443d6bff4a5977320dc09f6 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 13 Nov 2025 15:28:04 +0800 Subject: [PATCH 291/321] riscv: mm: add soft-dirty page tracking support The Svrsw60t59b extension allows to free the PTE reserved bits 60 and 59 for software, this patch uses bit 59 for soft-dirty. To add swap PTE soft-dirty tracking, we borrow bit 3 which is available for swap PTEs on RISC-V systems. Link: https://lkml.kernel.org/r/20251113072806.795029-5-zhangchunyan@iscas.ac.cn Signed-off-by: Chunyan Zhang Reviewed-by: Deepak Gupta Cc: Albert Ou Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Al Viro Cc: Andrew Jones Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Christian Brauner Cc: Conor Dooley Cc: Conor Dooley Cc: David Hildenbrand Cc: Jan Kara Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Rob Herring Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgtable-bits.h | 19 +++++++ arch/riscv/include/asm/pgtable.h | 75 ++++++++++++++++++++++++++- 3 files changed, 93 insertions(+), 2 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 829d95d173cf..e5f070485bbe 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -142,6 +142,7 @@ config RISCV select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_SOFT_DIRTY if 64BIT && MMU && RISCV_ISA_SVRSW60T59B select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index 179bd4afece4..f3bac2bbc157 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -19,6 +19,25 @@ #define _PAGE_SOFT (3 << 8) /* Reserved for software */ #define _PAGE_SPECIAL (1 << 8) /* RSW: 0x1 */ + +#ifdef CONFIG_MEM_SOFT_DIRTY + +/* ext_svrsw60t59b: bit 59 for soft-dirty tracking */ +#define _PAGE_SOFT_DIRTY \ + ((riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B)) ? \ + (1UL << 59) : 0) +/* + * Bit 3 is always zero for swap entry computation, so we + * can borrow it for swap page soft-dirty tracking. 
+ */ +#define _PAGE_SWP_SOFT_DIRTY \ + ((riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B)) ? \ + _PAGE_EXEC : 0) +#else +#define _PAGE_SOFT_DIRTY 0 +#define _PAGE_SWP_SOFT_DIRTY 0 +#endif /* CONFIG_MEM_SOFT_DIRTY */ + #define _PAGE_TABLE _PAGE_PRESENT /* diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 5a08eb5fe99f..049ba0e64f94 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -428,7 +428,7 @@ static inline pte_t pte_mkwrite_novma(pte_t pte) static inline pte_t pte_mkdirty(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_DIRTY); + return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pte_t pte_mkclean(pte_t pte) @@ -456,6 +456,42 @@ static inline pte_t pte_mkhuge(pte_t pte) return pte; } +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +#define pgtable_supports_soft_dirty() \ + (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && \ + riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B)) + +static inline bool pte_soft_dirty(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_clear_soft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) & ~(_PAGE_SOFT_DIRTY)); +} + +static inline bool pte_swp_soft_dirty(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY); +} + +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) & ~(_PAGE_SWP_SOFT_DIRTY)); +} +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + #ifdef CONFIG_RISCV_ISA_SVNAPOT #define pte_leaf_size(pte) (pte_napot(pte) ? \ napot_cont_size(napot_cont_order(pte)) :\ @@ -805,6 +841,40 @@ static inline pud_t pud_mkspecial(pud_t pud) } #endif +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +static inline bool pmd_soft_dirty(pmd_t pmd) +{ + return pte_soft_dirty(pmd_pte(pmd)); +} + +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) +{ + return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd))); +} + +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) +{ + return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd))); +} + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +static inline bool pmd_swp_soft_dirty(pmd_t pmd) +{ + return pte_swp_soft_dirty(pmd_pte(pmd)); +} + +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) +{ + return pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd))); +} + +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) +{ + return pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd))); +} +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { @@ -1003,7 +1073,8 @@ static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) * * Format of swap PTE: * bit 0: _PAGE_PRESENT (zero) - * bit 1 to 3: _PAGE_LEAF (zero) + * bit 1 to 2: (zero) + * bit 3: _PAGE_SWP_SOFT_DIRTY * bit 5: _PAGE_PROT_NONE (zero) * bit 6: exclusive marker * bits 7 to 11: swap type From c64da3950cf45d5fd87d7754ab4698b8cec01cae Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 13 Nov 2025 15:28:05 +0800 Subject: [PATCH 292/321] riscv: mm: add userfaultfd write-protect support The Svrsw60t59b extension allows to free the PTE reserved bits 60 and 59 for software, this patch uses bit 60 for uffd-wp tracking Additionally for tracking the uffd-wp state as a PTE swap bit, we borrow bit 4 which is not involved into 
swap entry computation. Link: https://lkml.kernel.org/r/20251113072806.795029-6-zhangchunyan@iscas.ac.cn Signed-off-by: Chunyan Zhang Cc: Albert Ou Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Al Viro Cc: Andrew Jones Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Christian Brauner Cc: Conor Dooley Cc: Conor Dooley Cc: David Hildenbrand Cc: Deepak Gupta Cc: Jan Kara Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Rob Herring Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgtable-bits.h | 18 +++++++ arch/riscv/include/asm/pgtable.h | 68 +++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e5f070485bbe..b2eff4789fe2 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -148,6 +148,7 @@ config RISCV select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if 64BIT && MMU select HAVE_ARCH_USERFAULTFD_MINOR if 64BIT && USERFAULTFD + select HAVE_ARCH_USERFAULTFD_WP if 64BIT && MMU && USERFAULTFD && RISCV_ISA_SVRSW60T59B select HAVE_ARCH_VMAP_STACK if MMU && 64BIT select HAVE_ASM_MODVERSIONS select HAVE_CONTEXT_TRACKING_USER diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index f3bac2bbc157..b422d9691e60 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -38,6 +38,24 @@ #define _PAGE_SWP_SOFT_DIRTY 0 #endif /* CONFIG_MEM_SOFT_DIRTY */ +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP + +/* ext_svrsw60t59b: Bit(60) for uffd-wp tracking */ +#define _PAGE_UFFD_WP \ + ((riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B)) ? \ + (1UL << 60) : 0) +/* + * Bit 4 is not involved into swap entry computation, so we + * can borrow it for swap page uffd-wp tracking. + */ +#define _PAGE_SWP_UFFD_WP \ + ((riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B)) ? 
\ + _PAGE_USER : 0) +#else +#define _PAGE_UFFD_WP 0 +#define _PAGE_SWP_UFFD_WP 0 +#endif + #define _PAGE_TABLE _PAGE_PRESENT /* diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 049ba0e64f94..1c311193e7da 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -417,6 +417,41 @@ static inline pte_t pte_wrprotect(pte_t pte) return __pte(pte_val(pte) & ~(_PAGE_WRITE)); } +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +#define pgtable_supports_uffd_wp() \ + riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B) + +static inline bool pte_uffd_wp(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_UFFD_WP); +} + +static inline pte_t pte_mkuffd_wp(pte_t pte) +{ + return pte_wrprotect(__pte(pte_val(pte) | _PAGE_UFFD_WP)); +} + +static inline pte_t pte_clear_uffd_wp(pte_t pte) +{ + return __pte(pte_val(pte) & ~(_PAGE_UFFD_WP)); +} + +static inline bool pte_swp_uffd_wp(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_SWP_UFFD_WP); +} + +static inline pte_t pte_swp_mkuffd_wp(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_UFFD_WP); +} + +static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) +{ + return __pte(pte_val(pte) & ~(_PAGE_SWP_UFFD_WP)); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + /* static inline pte_t pte_mkread(pte_t pte) */ static inline pte_t pte_mkwrite_novma(pte_t pte) @@ -841,6 +876,38 @@ static inline pud_t pud_mkspecial(pud_t pud) } #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline bool pmd_uffd_wp(pmd_t pmd) +{ + return pte_uffd_wp(pmd_pte(pmd)); +} + +static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) +{ + return pte_pmd(pte_mkuffd_wp(pmd_pte(pmd))); +} + +static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) +{ + return pte_pmd(pte_clear_uffd_wp(pmd_pte(pmd))); +} + +static inline bool pmd_swp_uffd_wp(pmd_t pmd) +{ + return pte_swp_uffd_wp(pmd_pte(pmd)); +} + +static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd) +{ + return pte_pmd(pte_swp_mkuffd_wp(pmd_pte(pmd))); +} + +static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd) +{ + return pte_pmd(pte_swp_clear_uffd_wp(pmd_pte(pmd))); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline bool pmd_soft_dirty(pmd_t pmd) { @@ -1075,6 +1142,7 @@ static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) * bit 0: _PAGE_PRESENT (zero) * bit 1 to 2: (zero) * bit 3: _PAGE_SWP_SOFT_DIRTY + * bit 4: _PAGE_SWP_UFFD_WP * bit 5: _PAGE_PROT_NONE (zero) * bit 6: exclusive marker * bits 7 to 11: swap type From 519912bdaee8ae8529241d4763326e6120489459 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 13 Nov 2025 15:28:06 +0800 Subject: [PATCH 293/321] dt-bindings: riscv: Add Svrsw60t59b extension description Add description for the Svrsw60t59b extension (PTE Reserved for SW bits 60:59) extension which was ratified recently in riscv-non-isa/riscv-iommu. 
Link: https://lkml.kernel.org/r/20251113072806.795029-7-zhangchunyan@iscas.ac.cn Acked-by: Conor Dooley Signed-off-by: Chunyan Zhang Cc: Albert Ou Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Al Viro Cc: Andrew Jones Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Christian Brauner Cc: Conor Dooley Cc: David Hildenbrand Cc: Deepak Gupta Cc: Jan Kara Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Rob Herring Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- Documentation/devicetree/bindings/riscv/extensions.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/riscv/extensions.yaml b/Documentation/devicetree/bindings/riscv/extensions.yaml index 543ac94718e8..194ef4754452 100644 --- a/Documentation/devicetree/bindings/riscv/extensions.yaml +++ b/Documentation/devicetree/bindings/riscv/extensions.yaml @@ -217,6 +217,12 @@ properties: memory types as ratified in the 20191213 version of the privileged ISA specification. + - const: svrsw60t59b + description: + The Svrsw60t59b extension for providing two more bits[60:59] to + PTE/PMD entry as ratified at commit 28bde925e7a7 ("PTE Reserved + for SW bits 60:59") of riscv-non-isa/riscv-iommu. + - const: svvptc description: The standard Svvptc supervisor-level extension for From 31807483d3952059d395c2a73b1fa9625db9b366 Mon Sep 17 00:00:00 2001 From: Xie Yuanbin Date: Wed, 19 Nov 2025 17:59:43 +0800 Subject: [PATCH 294/321] mm/memory-failure: remove the selection of RAS commit 97f0b13452198290799f ("tracing: add trace event for memory-failure") introduced the selection of RAS in memory-failure. That select was only needed for a tracing feature; in reality, there is no dependency between memory-failure and RAS. RAS increases the size of the bzImage by 8k, which matters for embedded devices. Move the memory-failure tracing code from ras_event.h to memory-failure.h and remove the selection of RAS.
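For reference, the consumer side of the relocated header follows the standard tracepoint pattern; this sketch only spells out the include path from the diffstat and the existing event call in mm/memory-failure.c, with pfn, type and result as in the event's TP_PROTO:

        /* mm/memory-failure.c: instantiate the event from its new location */
        #define CREATE_TRACE_POINTS
        #include <trace/events/memory-failure.h>

        /* emitted from the recovery path, unchanged by this move */
        trace_memory_failure_event(pfn, type, result);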
Link: https://lkml.kernel.org/r/20251119095943.67125-1-xieyuanbin1@huawei.com Signed-off-by: Xie Yuanbin Acked-by: David Hildenbrand (Red Hat) Acked-by: Miaohe Lin Cc: Borislav Petkov Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + include/ras/ras_event.h | 87 ------------------------ include/trace/events/memory-failure.h | 98 +++++++++++++++++++++++++++ mm/Kconfig | 1 - mm/memory-failure.c | 5 +- 5 files changed, 103 insertions(+), 89 deletions(-) create mode 100644 include/trace/events/memory-failure.h diff --git a/MAINTAINERS b/MAINTAINERS index 5ca4caf73021..302c57deffac 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11560,6 +11560,7 @@ R: Naoya Horiguchi L: linux-mm@kvack.org S: Maintained F: include/linux/memory-failure.h +F: include/trace/events/memory-failure.h F: mm/hwpoison-inject.c F: mm/memory-failure.c diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index fecfeb7c8be7..1e5e87020eef 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -12,7 +12,6 @@ #include #include #include -#include /* * MCE Extended Error Log trace event @@ -339,92 +338,6 @@ TRACE_EVENT(aer_event, "Not available") ); #endif /* CONFIG_PCIEAER */ - -/* - * memory-failure recovery action result event - * - * unsigned long pfn - Page Frame Number of the corrupted page - * int type - Page types of the corrupted page - * int result - Result of recovery action - */ - -#ifdef CONFIG_MEMORY_FAILURE -#define MF_ACTION_RESULT \ - EM ( MF_IGNORED, "Ignored" ) \ - EM ( MF_FAILED, "Failed" ) \ - EM ( MF_DELAYED, "Delayed" ) \ - EMe ( MF_RECOVERED, "Recovered" ) - -#define MF_PAGE_TYPE \ - EM ( MF_MSG_KERNEL, "reserved kernel page" ) \ - EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ - EM ( MF_MSG_HUGE, "huge page" ) \ - EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ - EM ( MF_MSG_GET_HWPOISON, "get hwpoison page" ) \ - EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ - EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ - EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ - EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \ - EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \ - EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \ - EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \ - EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) \ - EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \ - EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ - EM ( MF_MSG_BUDDY, "free buddy page" ) \ - EM ( MF_MSG_DAX, "dax page" ) \ - EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ - EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ - EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ - EMe ( MF_MSG_UNKNOWN, "unknown page" ) - -/* - * First define the enums in MM_ACTION_RESULT to be exported to userspace - * via TRACE_DEFINE_ENUM(). - */ -#undef EM -#undef EMe -#define EM(a, b) TRACE_DEFINE_ENUM(a); -#define EMe(a, b) TRACE_DEFINE_ENUM(a); - -MF_ACTION_RESULT -MF_PAGE_TYPE - -/* - * Now redefine the EM() and EMe() macros to map the enums to the strings - * that will be printed in the output. 
- */ -#undef EM -#undef EMe -#define EM(a, b) { a, b }, -#define EMe(a, b) { a, b } - -TRACE_EVENT(memory_failure_event, - TP_PROTO(unsigned long pfn, - int type, - int result), - - TP_ARGS(pfn, type, result), - - TP_STRUCT__entry( - __field(unsigned long, pfn) - __field(int, type) - __field(int, result) - ), - - TP_fast_assign( - __entry->pfn = pfn; - __entry->type = type; - __entry->result = result; - ), - - TP_printk("pfn %#lx: recovery action for %s: %s", - __entry->pfn, - __print_symbolic(__entry->type, MF_PAGE_TYPE), - __print_symbolic(__entry->result, MF_ACTION_RESULT) - ) -); -#endif /* CONFIG_MEMORY_FAILURE */ #endif /* _TRACE_HW_EVENT_MC_H */ /* This part must be outside protection */ diff --git a/include/trace/events/memory-failure.h b/include/trace/events/memory-failure.h new file mode 100644 index 000000000000..aa57cc8f896b --- /dev/null +++ b/include/trace/events/memory-failure.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM memory_failure +#define TRACE_INCLUDE_FILE memory-failure + +#if !defined(_TRACE_MEMORY_FAILURE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MEMORY_FAILURE_H + +#include +#include + +/* + * memory-failure recovery action result event + * + * unsigned long pfn - Page Frame Number of the corrupted page + * int type - Page types of the corrupted page + * int result - Result of recovery action + */ + +#define MF_ACTION_RESULT \ + EM ( MF_IGNORED, "Ignored" ) \ + EM ( MF_FAILED, "Failed" ) \ + EM ( MF_DELAYED, "Delayed" ) \ + EMe ( MF_RECOVERED, "Recovered" ) + +#define MF_PAGE_TYPE \ + EM ( MF_MSG_KERNEL, "reserved kernel page" ) \ + EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ + EM ( MF_MSG_HUGE, "huge page" ) \ + EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ + EM ( MF_MSG_GET_HWPOISON, "get hwpoison page" ) \ + EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ + EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ + EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ + EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \ + EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \ + EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \ + EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \ + EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) \ + EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \ + EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ + EM ( MF_MSG_BUDDY, "free buddy page" ) \ + EM ( MF_MSG_DAX, "dax page" ) \ + EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ + EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ + EMe ( MF_MSG_UNKNOWN, "unknown page" ) + +/* + * First define the enums in MM_ACTION_RESULT to be exported to userspace + * via TRACE_DEFINE_ENUM(). + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +MF_ACTION_RESULT +MF_PAGE_TYPE + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings + * that will be printed in the output. 
+ */ +#undef EM +#undef EMe +#define EM(a, b) { a, b }, +#define EMe(a, b) { a, b } + +TRACE_EVENT(memory_failure_event, + TP_PROTO(unsigned long pfn, + int type, + int result), + + TP_ARGS(pfn, type, result), + + TP_STRUCT__entry( + __field(unsigned long, pfn) + __field(int, type) + __field(int, result) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->type = type; + __entry->result = result; + ), + + TP_printk("pfn %#lx: recovery action for %s: %s", + __entry->pfn, + __print_symbolic(__entry->type, MF_PAGE_TYPE), + __print_symbolic(__entry->result, MF_ACTION_RESULT) + ) +); +#endif /* _TRACE_MEMORY_FAILURE_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/Kconfig b/mm/Kconfig index d548976d0e0a..bd0ea5454af8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -740,7 +740,6 @@ config MEMORY_FAILURE depends on MMU depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" - select RAS select INTERVAL_TREE help Enables code to recover from some memory failures on systems diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7f908ad795ad..fbc5a01260c8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -61,9 +61,12 @@ #include #include #include + +#define CREATE_TRACE_POINTS +#include + #include "swap.h" #include "internal.h" -#include "ras/ras_event.h" static int sysctl_memory_failure_early_kill __read_mostly; From ecf371b2cabee6a2bfb61f87d71c9f02a9ff34d7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 19 Nov 2025 04:26:35 +0000 Subject: [PATCH 295/321] mm: tweak __vma_enter_locked() Move the commentary on how __vma_enter_locked() behaves from the body of __vma_start_write() to the head of __vma_enter_locked() and merge it with the existing documentation. Also add a call to mmap_assert_write_locked(). Link: https://lkml.kernel.org/r/20251119042639.3937024-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Lorenzo Stoakes Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap_lock.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index f2532af6208c..e6e5570d1ec7 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -46,9 +46,9 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released); #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK /* - * Return value: 0 if vma detached, - * 1 if vma attached with no readers, - * -EINTR if signal received, + * __vma_enter_locked() returns 0 immediately if the vma is not + * attached, otherwise it waits for any current readers to finish and + * returns 1. Returns -EINTR if a signal is received while waiting. */ static inline int __vma_enter_locked(struct vm_area_struct *vma, bool detaching, int state) @@ -56,6 +56,8 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, int err; unsigned int tgt_refcnt = VMA_LOCK_OFFSET; + mmap_assert_write_locked(vma->vm_mm); + /* Additional refcnt if the vma is attached. */ if (!detaching) tgt_refcnt++; @@ -91,11 +93,6 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, { int locked; - /* - * __vma_enter_locked() returns false immediately if the vma is not - * attached, otherwise it waits until refcnt is indicating that vma - * is attached with no readers. 
- */ locked = __vma_enter_locked(vma, false, state); if (locked < 0) return locked; From 04d31610a7221cca624646241b1f6b3edd6c99fd Mon Sep 17 00:00:00 2001 From: Yuwen Chen Date: Wed, 19 Nov 2025 12:13:45 +0800 Subject: [PATCH 296/321] zram: fix the issue that the write - back limits might overflow When the page size exceeds 4KB, if bd_wb_limit is set to a value that is not aligned with the page size, it will cause a numerical wrap-around issue for bd_wb_limit. For example, when the page size is set to 16KB and bd_wb_limit is set to 3, after one write-back operation, the value of bd_wb_limit will become -1. More seriously, since bd_wb_limit is an unsigned number, its value may become as large as 2^64 - 1. The core reason for this problem is that the unit of bd_wb_limit is 4KB. For example, when a write-back occurs on a system with a page size of 16KB, 4 needs to be subtracted from bd_wb_limit. This operation takes place in the zram_account_writeback_submit function. This patch fixes the issue by limiting bd_wb_limit to be an integer multiple of PAGE_SIZE / 4096. Link: https://lkml.kernel.org/r/tencent_5936CFE72BAB2BA76887BB69DCC1B5E67C05@qq.com Fixes: 1d69a3f8ae77 ("zram: idle writeback fixes and cleanup") Signed-off-by: Yuwen Chen Acked-by: Sergey Senozhatsky Cc: Brian Geffon Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3428f647d0a7..1a1159a70fb4 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -566,6 +566,16 @@ static ssize_t writeback_limit_store(struct device *dev, if (kstrtoull(buf, 10, &val)) return ret; + /* + * When the page size is greater than 4KB, if bd_wb_limit is set to + * a value that is not page - size aligned, it will cause value + * wrapping. For example, when the page size is set to 16KB and + * bd_wb_limit is set to 3, a single write - back operation will + * cause bd_wb_limit to become -1. Even more terrifying is that + * bd_wb_limit is an unsigned number. + */ + val = rounddown(val, PAGE_SIZE / 4096); + down_write(&zram->init_lock); zram->bd_wb_limit = val; up_write(&zram->init_lock); From ccf9eb326b4aabcd61e71d87469cafd6c5f01308 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 21 Nov 2025 17:25:18 +0000 Subject: [PATCH 297/321] tools/testing/vma: add missing stub vm_flags_reset() is not available in the userland VMA tests, so add a stub which const-casts vma->vm_flags and avoids the upcoming removal of the vma->__vm_flags field. 
Link: https://lkml.kernel.org/r/4aff8bf7-d367-4ba3-90ad-13eef7a063fa@lucifer.local Fixes: c5c67c1de357 ("tools/testing/vma: eliminate dependency on vma->__vm_flags") Signed-off-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index be99056c5d56..8c2ac301a00e 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1549,4 +1549,11 @@ static inline int do_munmap(struct mm_struct *, unsigned long, size_t, return 0; } +static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) +{ + vm_flags_t *dst = (vm_flags_t *)(&vma->vm_flags); + + *dst = flags; +} + #endif /* __MM_VMA_INTERNAL_H */ From 348ced3da52b3161f5ceec8868e81973ce48e11d Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 21 Nov 2025 14:48:59 -0500 Subject: [PATCH 298/321] hugetlb: add __read_mostly to sysctl_hugetlb_shm_group sysctl bits are mostly-read values. Link: https://lkml.kernel.org/r/20251121194859.265259-2-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Andrew Morton Acked-by: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 457d48ac7bcd..019a1c5281e4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h, struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); -extern int sysctl_hugetlb_shm_group; +extern int sysctl_hugetlb_shm_group __read_mostly; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); From a9ce09b15761e0f3a413a4a79097d17e19bd3ec1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 21 Nov 2025 16:36:38 +0300 Subject: [PATCH 299/321] mm/damon/tests/sysfs-kunit: fix use after free on error path Re-order these frees to avoid dereferencing "sysfs_target" after it has been freed. Link: https://lkml.kernel.org/r/aSBq5uSPIqsqH8zO@stanley.mountain Fixes: ee131696794c ("mm/damon/tests/sysfs-kunit: handle alloc failures on damon_sysfs_test_add_targets()") Signed-off-by: Dan Carpenter Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/tests/sysfs-kunit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/tests/sysfs-kunit.h b/mm/damon/tests/sysfs-kunit.h index ce7218469f20..0c665ed255a3 100644 --- a/mm/damon/tests/sysfs-kunit.h +++ b/mm/damon/tests/sysfs-kunit.h @@ -76,8 +76,8 @@ static void damon_sysfs_test_add_targets(struct kunit *test) if (!ctx) { kfree(sysfs_targets->targets_arr); kfree(sysfs_targets); - kfree(sysfs_target); kfree(sysfs_target->regions); + kfree(sysfs_target); kunit_skip(test, "ctx alloc fail"); } From c23071952394eaef4f06d1b71a90392d2650af0d Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Wed, 19 Nov 2025 20:41:36 +0900 Subject: [PATCH 300/321] mm/swap: fix wrong plist empty check in swap_alloc_slow() swap_alloc_slow() was checking `si->avail_list` instead of `next->avail_list` when verifying if the next swap device is still in the list, which could cause unnecessary restarts during allocation. 
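The check is meant to answer "is the device I am about to try still on swap_avail_head?", and that question can only be asked of that device's own list node. For reference, plist_node_empty() is the generic helper from <linux/plist.h> (quoted here as an illustration; it simply reports whether the node is currently linked on a list):

        static inline int plist_node_empty(const struct plist_node *node)
        {
                return list_empty(&node->node_list);
        }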
Link: https://lkml.kernel.org/r/20251119114136.594108-1-youngjun.park@lge.com Fixes: 8e689f8ea45f ("mm/swap: do not choose swap device according to numa node") Signed-off-by: Youngjun Park Acked-by: Kairui Song Reviewed-by: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 94e0f0c54168..cf780fefaf7d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1374,7 +1374,7 @@ start_over: * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots. */ - if (plist_node_empty(&si->avail_list)) + if (plist_node_empty(&next->avail_list)) goto start_over; } spin_unlock(&swap_avail_lock); From cb65082a0ab44ccdd574d61feb59a857f2b1c914 Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Fri, 31 Oct 2025 15:50:07 +0900 Subject: [PATCH 301/321] mm, swap: fix memory leak in setup_clusters() error path Patch series "mm: swap: small fixes and comment cleanups", v2. This series provides a few small fixes and cleanups for the swap code. The first patch fixes a memory leak in an error path that was recently introduced. The subsequent patches include minor logic adjustments and the removal of redundant comments. This patch (of 5): setup_clusters() could leak 'cluster_info' memory if an error occurred on a path that did not jump to the 'err_free' label. This patch simplifies the error handling by removing the goto label and instead calling free_cluster_info() on all error exit paths. The new logic is safe, as free_cluster_info() already handles NULL pointer inputs. Link: https://lkml.kernel.org/r/20251031065011.40863-1-youngjun.park@lge.com Link: https://lkml.kernel.org/r/20251031065011.40863-2-youngjun.park@lge.com Fixes: 07adc4cf1ecd ("mm, swap: implement dynamic allocation of swap table") Signed-off-by: Youngjun Park Reviewed-by: Kairui Song Reviewed-by: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index cf780fefaf7d..0a822e0d9bf9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3330,7 +3330,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL); if (!si->global_cluster) - goto err_free; + goto err; for (i = 0; i < SWAP_NR_ORDERS; i++) si->global_cluster->next[i] = SWAP_ENTRY_INVALID; spin_lock_init(&si->global_cluster_lock); @@ -3383,9 +3383,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, } return cluster_info; -err_free: - free_cluster_info(cluster_info, maxpages); err: + free_cluster_info(cluster_info, maxpages); return ERR_PTR(err); } From 68f78bf55b2407947be2be7f854d1310cff607d9 Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Fri, 31 Oct 2025 15:50:08 +0900 Subject: [PATCH 302/321] mm, swap: use SWP_SOLIDSTATE to determine if swap is rotational The current non rotational check is unreliable as the device's rotational status can be changed by a user via sysfs. Use the more reliable SWP_SOLIDSTATE flag which is set at swapon time, to ensure the nr_rotate_swap count remains consistent. Plus, it is easy to read and simple. 
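For context, the swapoff-side test now mirrors what swapon establishes. A simplified sketch of that pairing follows; the swapon half is existing code condensed from memory, so details may differ slightly, while the swapoff half is the hunk in this patch:

        /* swapon: decided once when the device is enabled */
        if (p->bdev && bdev_nonrot(p->bdev))
                p->flags |= SWP_SOLIDSTATE;
        else
                atomic_inc(&nr_rotate_swap);

        /* swapoff (this patch): undo based on the same persistent flag */
        if (!(p->flags & SWP_SOLIDSTATE))
                atomic_dec(&nr_rotate_swap);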
Link: https://lkml.kernel.org/r/20251031065011.40863-3-youngjun.park@lge.com Fixes: 81a0298bdfab ("mm, swap: don't use VMA based swap readahead if HDD is used as swap") Signed-off-by: Youngjun Park Reviewed-by: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kairui Song Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 0a822e0d9bf9..c8ac0fd51c4c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2910,7 +2910,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); - if (!p->bdev || !bdev_nonrot(p->bdev)) + if (!(p->flags & SWP_SOLIDSTATE)) atomic_dec(&nr_rotate_swap); mutex_lock(&swapon_mutex); From f1bae15c6ad59870eca7824c66e40ee7469ee0fd Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Fri, 31 Oct 2025 15:50:09 +0900 Subject: [PATCH 303/321] mm, swap: remove redundant comment for read_swap_cache_async The function now manages get/put_swap_device() internally, making the comment explaining this behavior to callers unnecessary. Link: https://lkml.kernel.org/r/20251031065011.40863-4-youngjun.park@lge.com Signed-off-by: Youngjun Park Reviewed-by: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kairui Song Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swap_state.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index b2230f8a48fc..5f97c6ae70a2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -509,10 +509,6 @@ put_and_return: * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. - * - * get/put_swap_device() aren't needed to call this function, because - * __read_swap_cache_async() call them and swap_read_folio() holds the - * swap cache folio lock. */ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, From 4c239d5f594e81f05a78d7a6c8b80a039e393970 Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Fri, 31 Oct 2025 15:50:10 +0900 Subject: [PATCH 304/321] mm: swap: change swap_alloc_slow() to void swap_alloc_slow() does not need to return a bool, as all callers handle allocation results via the entry parameter. Update the function signature and remove return statements accordingly. 
Link: https://lkml.kernel.org/r/20251031065011.40863-5-youngjun.park@lge.com Signed-off-by: Youngjun Park Reviewed-by: Kairui Song Reviewed-by: Baoquan He Acked-by: Chris Li Cc: Barry Song Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index c8ac0fd51c4c..9df2880c284f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1339,7 +1339,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, } /* Rotate the device and switch to a new cluster */ -static bool swap_alloc_slow(swp_entry_t *entry, +static void swap_alloc_slow(swp_entry_t *entry, int order) { unsigned long offset; @@ -1356,10 +1356,10 @@ start_over: put_swap_device(si); if (offset) { *entry = swp_entry(si->type, offset); - return true; + return; } if (order) - return false; + return; } spin_lock(&swap_avail_lock); @@ -1378,7 +1378,6 @@ start_over: goto start_over; } spin_unlock(&swap_avail_lock); - return false; } /* From b7dd80f8f92848fa26518119f2c378dad8b7c0da Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Fri, 31 Oct 2025 15:50:11 +0900 Subject: [PATCH 305/321] mm: swap: remove scan_swap_map_slots() references from comments The scan_swap_map_slots() helper has been removed, but several comments still referred to it in swap allocation and reclaim paths. This patch cleans up those outdated references and reflows the affected comment blocks to match kernel coding style. Link: https://lkml.kernel.org/r/20251031065011.40863-6-youngjun.park@lge.com Signed-off-by: Youngjun Park Reviewed-by: Baoquan He Acked-by: Chris Li Cc: Barry Song Cc: Kairui Song Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 9df2880c284f..d12332423a06 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -236,11 +236,10 @@ again: ret = -nr_pages; /* - * When this function is called from scan_swap_map_slots() and it's - * called by vmscan.c at reclaiming folios. So we hold a folio lock - * here. We have to use trylock for avoiding deadlock. This is a special - * case and you should use folio_free_swap() with explicit folio_lock() - * in usual operations. + * We hold a folio lock here. We have to use trylock for + * avoiding deadlock. This is a special case and you should + * use folio_free_swap() with explicit folio_lock() in usual + * operations. */ if (!folio_trylock(folio)) goto out; @@ -1365,14 +1364,13 @@ start_over: spin_lock(&swap_avail_lock); /* * if we got here, it's likely that si was almost full before, - * and since scan_swap_map_slots() can drop the si->lock, * multiple callers probably all tried to get a page from the * same si and it filled up before we could get one; or, the si - * filled up between us dropping swap_avail_lock and taking - * si->lock. Since we dropped the swap_avail_lock, the - * swap_avail_head list may have been modified; so if next is - * still in the swap_avail_head list then try it, otherwise - * start over if we have not gotten any slots. + * filled up between us dropping swap_avail_lock. + * Since we dropped the swap_avail_lock, the swap_avail_list + * may have been modified; so if next is still in the + * swap_avail_head list then try it, otherwise start over if we + * have not gotten any slots. 
*/ if (plist_node_empty(&next->avail_list)) goto start_over; From 84a8d467cc426eb3c9eb34092423dcc54493dd7e Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sat, 22 Nov 2025 22:19:20 +0100 Subject: [PATCH 306/321] pagemap: update BUDDY flag documentation Since v4.6 the BUDDY flag is set for _all_ pages in the block and no longer just for the first one. This change was introduced by: commit 832fc1de01ae ("/proc/kpageflags: return KPF_BUDDY for "tail" buddy pages") Strictly speaking, this was an ABI change, but as nobody has noticed since 2016, let's just update the documentation. Link: https://lkml.kernel.org/r/20251122211920.3410371-1-richard@nod.at Signed-off-by: Richard Weinberger Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Mike Rapoport (Microsoft) Cc: Vladimir Davydov > Cc: Konstantin Khlebnikov Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/pagemap.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index e60e9211fd9b..c57e61b5d8aa 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -115,7 +115,8 @@ Short descriptions to the page flags A free memory block managed by the buddy system allocator. The buddy system organizes free memory in blocks of various orders. An order N block has 2^N physically contiguous pages, with the BUDDY flag - set for and _only_ for the first page. + set for all pages. + Before 4.6 only the first page of the block had the flag set. 15 - COMPOUND_HEAD A compound page with order N consists of 2^N physically contiguous pages. A compound page with order 2 takes the form of "HTTT", where H donates its From 3cf41edc2067de9265f9f58b905317723c59a0c7 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Fri, 24 Oct 2025 10:27:11 +0800 Subject: [PATCH 307/321] mm/vmscan: skip increasing kswapd_failures when reclaim was boosted We have a colocation cluster used for deploying both offline and online services simultaneously. In this environment, we encountered a scenario where direct memory reclamation was triggered due to kswapd not running. 1. When applications start up, rapidly consume memory, or experience network traffic bursts, the kernel reaches steal_suitable_fallback(), which sets watermark_boost and subsequently wakes kswapd. 2. In the core logic of kswapd thread (balance_pgdat()), when reclaim is triggered by watermark_boost, the maximum priority is 10. Higher priority values mean less aggressive LRU scanning, which can result in no pages being reclaimed during a single scan cycle: if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) raise_priority = false; 3. Additionally, many of our pods are configured with memory.low, which prevents memory reclamation in certain cgroups, further increasing the chance of failing to reclaim memory. 4. This eventually causes pgdat->kswapd_failures to continuously accumulate, exceeding MAX_RECLAIM_RETRIES, and consequently kswapd stops working. At this point, the system's available memory is still significantly above the high watermark -- it's inappropriate for kswapd to stop under these conditions. The final observable issue is that a brief period of rapid memory allocation causes kswapd to stop running, ultimately triggering direct reclaim and making the applications unresponsive. This problem leading to direct memory reclamation has been a long-standing issue in our production environment. 
We initially held the simple assumption that it was caused by applications allocating memory too rapidly for kswapd to keep up with reclamation. However, after we began monitoring kswapd's runtime behavior, we discovered a different pattern: kswapd initially exhibits very aggressive activity even when there is still considerable free memory, but it subsequently stops running entirely, even as memory levels approach the low watermark. In summary, both boosted watermarks and memory.low increase the probability of kswapd operation failures. This patch specifically addresses the scenario involving boosted watermarks by not incrementing kswapd_failures when reclamation fails. A more general solution, potentially addressing memory.low or other cases, requires further discussion. Link: https://lkml.kernel.org/r/53de0b3ee0b822418e909db29bfa6513faff9d36@linux.dev Link: https://lkml.kernel.org/r/20251024022711.382238-1-jiayuan.chen@linux.dev Signed-off-by: Jiayuan Chen Reviewed-by: Shakeel Butt Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Qi Zheng Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/vmscan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 720772baf2a7..92980b072121 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7127,7 +7127,12 @@ restart: goto restart; } - if (!sc.nr_reclaimed) + /* + * If the reclaim was boosted, we might still be far from the + * watermark_high at this point. We need to avoid increasing the + * failure count to prevent the kswapd thread from stopping. + */ + if (!sc.nr_reclaimed && !boosted) atomic_inc(&pgdat->kswapd_failures); out: From a493c7a65074e182e52306fc05b10896f9634855 Mon Sep 17 00:00:00 2001 From: fujunjie Date: Sat, 15 Nov 2025 03:02:55 +0000 Subject: [PATCH 308/321] mm/page_alloc: optimize lowmem_reserve max lookup using its semantic monotonicity calculate_totalreserve_pages() currently finds the maximum lowmem_reserve[j] for a zone by scanning the full forward range [j = zone_idx .. MAX_NR_ZONES). However, for a given zone i, the lowmem_reserve[j] array (for j > i) is naturally expected to form a monotonically non-decreasing sequence in j, not as an implementation detail, but as a consequence that naturally arises from the semantics of lowmem_reserve[]. For zone "i", lowmem_reserve[j] expresses how many pages in zone i must effectively be kept in reserve when deciding whether an allocation class that may allocate from zones up to j is allowed to fall back into i. It protects less flexible allocation classes (which cannot use higher zones) from being starved by more flexible ones. Viewed from this semantics, it is natural to expect a partial ordering in j: as j increases, the allocation class gains access to a strictly larger set of fallback zones. Therefore lowmem_reserve[j] is expected to be monotonically non-decreasing in j: more flexible allocation classes must not be allowed to deplete low zones more aggressively than less flexible ones. In other words, if lowmem_reserve[j] were ever observed to *decrease* as j grows, that would be unexpected from the reserve semantics' point of view and would likely indicate a semantic change or a misconfiguration. The current implementation in setup_per_zone_lowmem_reserve() reflects this policy by accumulating managed pages from higher zones and applying the configured ratio, which results in a non-decreasing sequence. 
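To make the monotonicity argument concrete, here is a minimal sketch (illustrative only; the function name is invented and the real setup_per_zone_lowmem_reserve() additionally handles ratio sanitisation and per-node iteration) of the accumulation described above. Because the running page count only grows with j and the divisor is fixed for a given zone i, the resulting reserve values cannot decrease.

	/*
	 * Sketch of the lowmem_reserve[] computation for one zone i:
	 * reserve[j] ~= (pages managed by zones i+1..j) / ratio.
	 * The running sum is non-decreasing in j, so reserve[j] is too.
	 */
	static void lowmem_reserve_sketch(unsigned long *reserve,
					  const unsigned long *managed,
					  int i, int nr_zones, unsigned long ratio)
	{
		unsigned long upper_pages = 0;
		int j;

		for (j = i + 1; j < nr_zones; j++) {
			upper_pages += managed[j];	/* grows (or stays equal) with j */
			reserve[j] = upper_pages / ratio;
		}
	}
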
This patch makes calculate_totalreserve_pages() rely on that monotonicity explicitly and finds the maximum reserve value by scanning backward and stopping at the first non-zero entry. This avoids unnecessary iteration and reflects the conceptual model more directly. No functional behavior changes. To maintain this assumption explicitly, a comment is added next to setup_per_zone_lowmem_reserve() documenting the monotonicity expectation and noting that calculate_totalreserve_pages() relies on it. Link: https://lkml.kernel.org/r/tencent_EB0FED91B01B1F8B6DAEE96719C5F5797F07@qq.com Signed-off-by: fujunjie Acked-by: Zi Yan Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 26be5734253f..ec011a9e5cbb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6311,10 +6311,21 @@ static void calculate_totalreserve_pages(void) long max = 0; unsigned long managed_pages = zone_managed_pages(zone); - /* Find valid and maximum lowmem_reserve in the zone */ - for (j = i; j < MAX_NR_ZONES; j++) - max = max(max, zone->lowmem_reserve[j]); + /* + * lowmem_reserve[j] is monotonically non-decreasing + * in j for a given zone (see + * setup_per_zone_lowmem_reserve()). The maximum + * valid reserve lives at the highest index with a + * non-zero value, so scan backwards and stop at the + * first hit. + */ + for (j = MAX_NR_ZONES - 1; j > i; j--) { + if (!zone->lowmem_reserve[j]) + continue; + max = zone->lowmem_reserve[j]; + break; + } /* we treat the high watermark as reserved pages. */ max += high_wmark_pages(zone); @@ -6339,7 +6350,21 @@ static void setup_per_zone_lowmem_reserve(void) { struct pglist_data *pgdat; enum zone_type i, j; - + /* + * For a given zone node_zones[i], lowmem_reserve[j] (j > i) + * represents how many pages in zone i must effectively be kept + * in reserve when deciding whether an allocation class that is + * allowed to allocate from zones up to j may fall back into + * zone i. + * + * As j increases, the allocation class can use a strictly larger + * set of fallback zones and therefore must not be allowed to + * deplete low zones more aggressively than a less flexible one. + * As a result, lowmem_reserve[j] is required to be monotonically + * non-decreasing in j for each zone i. Callers such as + * calculate_totalreserve_pages() rely on this monotonicity when + * selecting the maximum reserve entry. + */ for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES - 1; i++) { struct zone *zone = &pgdat->node_zones[i]; From 8f4338b1141e1e44a405ce245d18a29a834d32b6 Mon Sep 17 00:00:00 2001 From: Chu Guangqing Date: Tue, 25 Nov 2025 10:05:22 +0800 Subject: [PATCH 309/321] zram: fix a spelling mistake The spelling of the word "relases" is incorrect; it should be "releases". 
Link: https://lkml.kernel.org/r/20251125020522.1913-1-chuguangqing@inspur.com Signed-off-by: Chu Guangqing Reviewed-by: Sergey Senozhatsky Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1a1159a70fb4..5759823d6314 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1043,7 +1043,7 @@ static int zram_writeback_slots(struct zram *zram, index = pps->index; zram_slot_lock(zram, index); /* - * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so + * scan_slots() sets ZRAM_PP_SLOT and releases slot lock, so * slots can change in the meantime. If slots are accessed or * freed they lose ZRAM_PP_SLOT flag and hence we don't * post-process them. From 2b6a3f061f11372af79b862d6184d43193ae927f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:00:59 +0000 Subject: [PATCH 310/321] mm: declare VMA flags by bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "initial work on making VMA flags a bitmap", v3. We are in the rather silly situation that we are running out of VMA flags as they are currently limited to a system word in size. This leads to absurd situations where we limit features to 64-bit architectures only because we simply do not have the ability to add a flag for 32-bit ones. This is very constraining and leads to hacks or, in the worst case, simply an inability to implement features we want for entirely arbitrary reasons. This also of course gives us something of a Y2K type situation in mm where we might eventually exhaust all of the VMA flags even on 64-bit systems. This series lays the groundwork for getting away from this limitation by establishing VMA flags as a bitmap whose size we can increase in future beyond 64 bits if required. This is necessarily a highly iterative process given the extensive use of VMA flags throughout the kernel, so we start by performing basic steps. Firstly, we declare VMA flags by bit number rather than by value, retaining the VM_xxx fields but in terms of these newly introduced VMA_xxx_BIT fields. While we are here, we use sparse annotations to ensure that, when dealing with VMA bit number parameters, we cannot be passed values which are not declared as such - providing some useful type safety. We then introduce an opaque VMA flag type, much like the opaque mm_struct flag type introduced in commit bb6525f2f8c4 ("mm: add bitmap mm->flags field"), which we establish in union with vma->vm_flags (but still set at system word size meaning there is no functional or data type size change). We update the vm_flags_xxx() helpers to use this new bitmap, introducing sensible helpers to do so. This series lays the foundation for further work to expand the use of bitmap VMA flags and eventually eliminate these arbitrary restrictions. This patch (of 4): In order to lay the groundwork for VMA flags being a bitmap rather than a system word in size, we need to be able to consistently refer to VMA flags by bit number rather than value. Take this opportunity to do so in an enum which we which is additionally useful for tooling to extract metadata from. This additionally makes it very clear which bits are being used for what at a glance. We use the VMA_ prefix for the bit values as it is logical to do so since these reference VMAs. 
We consistently suffix with _BIT to make it clear what the values refer to. We declare bit values even when the flags that use them would not be enabled by config options as this is simply clearer and clearly defines what bit numbers are used for what, at no additional cost. We declare a sparse-bitwise type vma_flag_t which ensures that users can't pass around invalid VMA flags by accident and prepares for future work towards VMA flags being a bitmap where we want to ensure bit values are type safe. To make life easier, we declare some macro helpers - DECLARE_VMA_BIT() allows us to avoid duplication in the enum bit number declarations (and maintaining the sparse __bitwise attribute), and INIT_VM_FLAG() is used to assist with declaration of flags. Unfortunately we can't declare both in the enum, as we run into issue with logic in the kernel requiring that flags are preprocessor definitions, and additionally we cannot have a macro which declares another macro so we must define each flag macro directly. Additionally, update the VMA userland testing vma_internal.h header to include these changes. We also have to fix the parameters to the vma_flag_*_atomic() functions since VMA_MAYBE_GUARD_BIT is now of type vma_flag_t and sparse will complain otherwise. We have to update some rather silly if-deffery found in mm/task_mmu.c which would otherwise break. Finally, we update the rust binding helper as now it cannot auto-detect the flags at all. Link: https://lkml.kernel.org/r/cover.1764064556.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/3a35e5a0bcfa00e84af24cbafc0653e74deda64a.1764064556.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 +- include/linux/mm.h | 397 +++++++++++++++++-------------- mm/khugepaged.c | 2 +- mm/madvise.c | 2 +- rust/bindgen_parameters | 25 ++ rust/bindings/bindings_helper.h | 25 ++ tools/testing/vma/vma_internal.h | 306 ++++++++++++++++++++---- 7 files changed, 534 insertions(+), 227 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2b4ab5718ab5..d00ac179d973 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1183,10 +1183,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", -#if VM_PKEY_BIT3 +#if CONFIG_ARCH_PKEY_BITS > 3 [ilog2(VM_PKEY_BIT3)] = "", #endif -#if 
VM_PKEY_BIT4 +#if CONFIG_ARCH_PKEY_BITS > 4 [ilog2(VM_PKEY_BIT4)] = "", #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 75f894c3f521..a2f38fb68840 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -271,187 +271,241 @@ extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif -#define VM_MAYBE_GUARD_BIT 11 - /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ + #define VM_NONE 0x00000000 -#define VM_READ 0x00000001 /* currently active flags */ -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 - -/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ -#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_MAYSHARE 0x00000080 - -#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ -#ifdef CONFIG_MMU -#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ -#else /* CONFIG_MMU */ -#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ -#define VM_UFFD_MISSING 0 -#endif /* CONFIG_MMU */ -#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_MAYBE_GUARD BIT(VM_MAYBE_GUARD_BIT) /* The VMA maybe contains guard regions. */ -#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ - -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ - - /* Used by sys_madvise() */ -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ - -#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ -#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ -#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ -#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ -#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_SYNC 0x00800000 /* Synchronous page faults */ -#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. 
*/ -#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ - -#ifdef CONFIG_MEM_SOFT_DIRTY -# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ -#else -# define VM_SOFTDIRTY 0 -#endif - -#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ -#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ -#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ -#define VM_MERGEABLE BIT(31) /* KSM may merge identical pages */ - -#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS -#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) -#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) -#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) -#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) -#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) -#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) -#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) -#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ - -#ifdef CONFIG_ARCH_HAS_PKEYS -# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 -# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 -# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 -# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 -#if CONFIG_ARCH_PKEY_BITS > 3 -# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 -#else -# define VM_PKEY_BIT3 0 -#endif -#if CONFIG_ARCH_PKEY_BITS > 4 -# define VM_PKEY_BIT4 VM_HIGH_ARCH_4 -#else -# define VM_PKEY_BIT4 0 -#endif -#endif /* CONFIG_ARCH_HAS_PKEYS */ - -#ifdef CONFIG_X86_USER_SHADOW_STACK -/* - * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of - * support core mm. +/** + * typedef vma_flag_t - specifies an individual VMA flag by bit number. * - * These VMAs will get a single end guard page. This helps userspace protect - * itself from attacks. A single page is enough for current shadow stack archs - * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c - * for more details on the guard size. + * This value is made type safe by sparse to avoid passing invalid flag values + * around. */ -# define VM_SHADOW_STACK VM_HIGH_ARCH_5 -#endif +typedef int __bitwise vma_flag_t; -#if defined(CONFIG_ARM64_GCS) -/* - * arm64's Guarded Control Stack implements similar functionality and - * has similar constraints to shadow stacks. - */ -# define VM_SHADOW_STACK VM_HIGH_ARCH_6 +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT) +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. 
*/ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ +#ifdef CONFIG_MMU + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), +#endif /* CONFIG_MMU */ + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), #endif - -#ifndef VM_SHADOW_STACK -# define VM_SHADOW_STACK VM_NONE + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. 
+ */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), #endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ +#ifdef CONFIG_STACK_GROWSUP + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), +#else + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), +#endif +}; +#undef DECLARE_VMA_BIT +#undef DECLARE_VMA_BIT_ALIAS +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) +#else +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE +#endif +#ifdef CONFIG_ARCH_HAS_PKEYS +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. 
*/ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) +#if CONFIG_ARCH_PKEY_BITS > 3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) +#else +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ +#if CONFIG_ARCH_PKEY_BITS > 4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) +#else +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ +#endif /* CONFIG_ARCH_HAS_PKEYS */ +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE +#endif #if defined(CONFIG_PPC64) -# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ +#define VM_SAO INIT_VM_FLAG(SAO) #elif defined(CONFIG_PARISC) -# define VM_GROWSUP VM_ARCH_1 +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) #elif defined(CONFIG_SPARC64) -# define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ -# define VM_ARCH_CLEAR VM_SPARC_ADI +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif defined(CONFIG_ARM64) -# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ -# define VM_ARCH_CLEAR VM_ARM64_BTI +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif !defined(CONFIG_MMU) -# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) #endif - -#if defined(CONFIG_ARM64_MTE) -# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ -# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ -#else -# define VM_MTE VM_NONE -# define VM_MTE_ALLOWED VM_NONE -#endif - #ifndef VM_GROWSUP -# define VM_GROWSUP VM_NONE +#define VM_GROWSUP VM_NONE #endif - -#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR -# define VM_UFFD_MINOR_BIT 41 -# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ -#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ -# define VM_UFFD_MINOR VM_NONE -#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ - -/* - * This flag is used to connect VFIO to arch specific KVM code. It - * indicates that the memory under this VMA is safe for use with any - * non-cachable memory type inside KVM. Some VFIO devices, on some - * platforms, are thought to be unsafe and can cause machine crashes - * if KVM does not lock down the memory type. 
- */ -#ifdef CONFIG_64BIT -#define VM_ALLOW_ANY_UNCACHED_BIT 39 -#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) #else -#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE +#endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) +#else +#define VM_UFFD_MINOR VM_NONE #endif - #ifdef CONFIG_64BIT -#define VM_DROPPABLE_BIT 40 -#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) -#elif defined(CONFIG_PPC32) -#define VM_DROPPABLE VM_ARCH_1 +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE +#endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) #else #define VM_DROPPABLE VM_NONE #endif -#ifdef CONFIG_64BIT -#define VM_SEALED_BIT 42 -#define VM_SEALED BIT(VM_SEALED_BIT) -#else -#define VM_SEALED VM_NONE -#endif - /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) @@ -475,12 +529,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) -#ifdef CONFIG_STACK_GROWSUP -#define VM_STACK VM_GROWSUP -#define VM_STACK_EARLY VM_GROWSDOWN +#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS +#define VM_SEALED_SYSMAP VM_SEALED #else -#define VM_STACK VM_GROWSDOWN -#define VM_STACK_EARLY 0 +#define VM_SEALED_SYSMAP VM_NONE #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) @@ -488,7 +540,6 @@ extern unsigned int kobjsize(const void *objp); /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) - /* * Special vmas that are non-mergable, non-mlock()able. */ @@ -523,7 +574,7 @@ extern unsigned int kobjsize(const void *objp); /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR -# define VM_ARCH_CLEAR VM_NONE +#define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) @@ -920,9 +971,9 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, } static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, - int bit) + vma_flag_t bit) { - const vm_flags_t mask = BIT(bit); + const vm_flags_t mask = BIT((__force int)bit); /* Only specific flags are permitted */ if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED))) @@ -935,14 +986,15 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific * valid flags are allowed to do this. */ -static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit) +static inline void vma_flag_set_atomic(struct vm_area_struct *vma, + vma_flag_t bit) { /* mmap read lock/VMA read lock must be held. */ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) vma_assert_locked(vma); if (__vma_flag_atomic_valid(vma, bit)) - set_bit(bit, &ACCESS_PRIVATE(vma, __vm_flags)); + set_bit((__force int)bit, &ACCESS_PRIVATE(vma, __vm_flags)); } /* @@ -952,10 +1004,11 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit) * This is necessarily racey, so callers must ensure that serialisation is * achieved through some other means, or that races are permissible. 
*/ -static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, int bit) +static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, + vma_flag_t bit) { if (__vma_flag_atomic_valid(vma, bit)) - return test_bit(bit, &vma->vm_flags); + return test_bit((__force int)bit, &vma->vm_flags); return false; } @@ -4517,16 +4570,6 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); - -/* - * mseal of userspace process's system mappings. - */ -#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS -#define VM_SEALED_SYSMAP VM_SEALED -#else -#define VM_SEALED_SYSMAP VM_NONE -#endif - /* * DMA mapping IDs for page_pool * diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 89c33ef7aac3..97d1b2824386 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1740,7 +1740,7 @@ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) * obtained on guard region installation after the flag is set, so this * check being performed under this lock excludes races. */ - if (vma_flag_test_atomic(vma, VM_MAYBE_GUARD_BIT)) + if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT)) return false; return true; diff --git a/mm/madvise.c b/mm/madvise.c index d8bc51e1bea7..b617b1be0f53 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1142,7 +1142,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) * acquire an mmap/VMA write lock to read it. All remaining readers may * or may not see the flag set, but we don't care. */ - vma_flag_set_atomic(vma, VM_MAYBE_GUARD_BIT); + vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT); /* * If anonymous and we are establishing page tables the VMA ought to diff --git a/rust/bindgen_parameters b/rust/bindgen_parameters index e13c6f9dd17b..fd2fd1c3cb9a 100644 --- a/rust/bindgen_parameters +++ b/rust/bindgen_parameters @@ -35,6 +35,31 @@ # recognized, block generation of the non-helper constants. --blocklist-item ARCH_SLAB_MINALIGN --blocklist-item ARCH_KMALLOC_MINALIGN +--blocklist-item VM_MERGEABLE +--blocklist-item VM_READ +--blocklist-item VM_WRITE +--blocklist-item VM_EXEC +--blocklist-item VM_SHARED +--blocklist-item VM_MAYREAD +--blocklist-item VM_MAYWRITE +--blocklist-item VM_MAYEXEC +--blocklist-item VM_MAYEXEC +--blocklist-item VM_PFNMAP +--blocklist-item VM_IO +--blocklist-item VM_DONTCOPY +--blocklist-item VM_DONTEXPAND +--blocklist-item VM_LOCKONFAULT +--blocklist-item VM_ACCOUNT +--blocklist-item VM_NORESERVE +--blocklist-item VM_HUGETLB +--blocklist-item VM_SYNC +--blocklist-item VM_ARCH_1 +--blocklist-item VM_WIPEONFORK +--blocklist-item VM_DONTDUMP +--blocklist-item VM_SOFTDIRTY +--blocklist-item VM_MIXEDMAP +--blocklist-item VM_HUGEPAGE +--blocklist-item VM_NOHUGEPAGE # Structs should implement `Zeroable` when all of their fields do. 
--with-derive-custom-struct .*=MaybeZeroable diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 2e43c66635a2..4c327db01ca0 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -108,7 +108,32 @@ const xa_mark_t RUST_CONST_HELPER_XA_PRESENT = XA_PRESENT; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC = XA_FLAGS_ALLOC; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC1 = XA_FLAGS_ALLOC1; + const vm_flags_t RUST_CONST_HELPER_VM_MERGEABLE = VM_MERGEABLE; +const vm_flags_t RUST_CONST_HELPER_VM_READ = VM_READ; +const vm_flags_t RUST_CONST_HELPER_VM_WRITE = VM_WRITE; +const vm_flags_t RUST_CONST_HELPER_VM_EXEC = VM_EXEC; +const vm_flags_t RUST_CONST_HELPER_VM_SHARED = VM_SHARED; +const vm_flags_t RUST_CONST_HELPER_VM_MAYREAD = VM_MAYREAD; +const vm_flags_t RUST_CONST_HELPER_VM_MAYWRITE = VM_MAYWRITE; +const vm_flags_t RUST_CONST_HELPER_VM_MAYEXEC = VM_MAYEXEC; +const vm_flags_t RUST_CONST_HELPER_VM_MAYSHARE = VM_MAYEXEC; +const vm_flags_t RUST_CONST_HELPER_VM_PFNMAP = VM_PFNMAP; +const vm_flags_t RUST_CONST_HELPER_VM_IO = VM_IO; +const vm_flags_t RUST_CONST_HELPER_VM_DONTCOPY = VM_DONTCOPY; +const vm_flags_t RUST_CONST_HELPER_VM_DONTEXPAND = VM_DONTEXPAND; +const vm_flags_t RUST_CONST_HELPER_VM_LOCKONFAULT = VM_LOCKONFAULT; +const vm_flags_t RUST_CONST_HELPER_VM_ACCOUNT = VM_ACCOUNT; +const vm_flags_t RUST_CONST_HELPER_VM_NORESERVE = VM_NORESERVE; +const vm_flags_t RUST_CONST_HELPER_VM_HUGETLB = VM_HUGETLB; +const vm_flags_t RUST_CONST_HELPER_VM_SYNC = VM_SYNC; +const vm_flags_t RUST_CONST_HELPER_VM_ARCH_1 = VM_ARCH_1; +const vm_flags_t RUST_CONST_HELPER_VM_WIPEONFORK = VM_WIPEONFORK; +const vm_flags_t RUST_CONST_HELPER_VM_DONTDUMP = VM_DONTDUMP; +const vm_flags_t RUST_CONST_HELPER_VM_SOFTDIRTY = VM_SOFTDIRTY; +const vm_flags_t RUST_CONST_HELPER_VM_MIXEDMAP = VM_MIXEDMAP; +const vm_flags_t RUST_CONST_HELPER_VM_HUGEPAGE = VM_HUGEPAGE; +const vm_flags_t RUST_CONST_HELPER_VM_NOHUGEPAGE = VM_NOHUGEPAGE; #if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_RUST) #include "../../drivers/android/binder/rust_binder.h" diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 8c2ac301a00e..b7e8fc9ccdf4 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -46,42 +46,271 @@ extern unsigned long dac_mmap_min_addr; #define MMF_HAS_MDWE 28 +/* + * vm_flags in vm_area_struct, see mm_types.h. 
+ * When changing, update also include/trace/events/mmflags.h + */ + #define VM_NONE 0x00000000 -#define VM_READ 0x00000001 -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 -#define VM_MAYREAD 0x00000010 -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_GROWSDOWN 0x00000100 -#define VM_PFNMAP 0x00000400 -#define VM_MAYBE_GUARD 0x00000800 -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ -#define VM_DONTEXPAND 0x00040000 -#define VM_LOCKONFAULT 0x00080000 -#define VM_ACCOUNT 0x00100000 -#define VM_NORESERVE 0x00200000 -#define VM_MIXEDMAP 0x10000000 -#define VM_STACK VM_GROWSDOWN -#define VM_SHADOW_STACK VM_NONE -#define VM_SOFTDIRTY 0 -#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_GROWSUP VM_NONE -#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/** + * typedef vma_flag_t - specifies an individual VMA flag by bit number. + * + * This value is made type safe by sparse to avoid passing invalid flag values + * around. + */ +typedef int __bitwise vma_flag_t; -#ifdef CONFIG_STACK_GROWSUP -#define VM_STACK VM_GROWSUP -#define VM_STACK_EARLY VM_GROWSDOWN +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ +#ifdef CONFIG_MMU + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ #else -#define VM_STACK VM_GROWSDOWN -#define VM_STACK_EARLY 0 + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), +#endif /* CONFIG_MMU */ + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. 
*/ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), #endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. 
+ */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ +#ifdef CONFIG_STACK_GROWSUP + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), +#else + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), +#endif +}; + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) +#else +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE +#endif +#ifdef CONFIG_ARCH_HAS_PKEYS +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. 
*/ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) +#if CONFIG_ARCH_PKEY_BITS > 3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) +#else +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ +#if CONFIG_ARCH_PKEY_BITS > 4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) +#else +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ +#endif /* CONFIG_ARCH_HAS_PKEYS */ +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE +#endif +#if defined(CONFIG_PPC64) +#define VM_SAO INIT_VM_FLAG(SAO) +#elif defined(CONFIG_PARISC) +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) +#elif defined(CONFIG_SPARC64) +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif defined(CONFIG_ARM64) +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif !defined(CONFIG_MMU) +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) +#endif +#ifndef VM_GROWSUP +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE +#endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) +#else +#define VM_UFFD_MINOR VM_NONE +#endif +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE +#endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#else +#define VM_DROPPABLE VM_NONE +#endif + +/* Bits set in the VMA until the stack is in its final location */ +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) + +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) + +/* Common data flag combinations */ +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ + VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#endif + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#endif + +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) + +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) + +/* VMA basic access permission flags */ +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) + +/* + * Special vmas that are non-mergable, non-mlock()able. 
+ */ +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) #define TASK_SIZE_LOW DEFAULT_MAP_WINDOW @@ -97,26 +326,11 @@ extern unsigned long dac_mmap_min_addr; #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC - -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) - -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) -#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) - #define RLIMIT_STACK 3 /* max stack size */ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ #define CAP_IPC_LOCK 14 -#ifdef CONFIG_64BIT -#define VM_SEALED_BIT 42 -#define VM_SEALED BIT(VM_SEALED_BIT) -#else -#define VM_SEALED VM_NONE -#endif - /* * Flags which should be 'sticky' on merge - that is, flags which, when one VMA * possesses it but the other does not, the merged VMA should nonetheless have From 58eac97a8ba0bcfc5dffb347e40ea3006347ff38 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:01:00 +0000 Subject: [PATCH 311/321] mm: simplify and rename mm flags function for clarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The __mm_flags_set_word() function is slightly ambiguous - we use 'set' to refer to setting individual bits (such as in mm_flags_set()) but here we use it to refer to overwriting the value altogether. Rename it to __mm_flags_overwrite_word() to eliminate this ambiguity. We additionally simplify the functions, eliminating unnecessary bitmap_xxx() operations (the compiler would have optimised these out but it's worth being as clear as we can be here). Link: https://lkml.kernel.org/r/8f0bc556e1b90eca8ea5eba41f8d5d3f9cd7c98a.1764064557.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 14 +++++--------- kernel/fork.c | 4 ++-- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4f66a3206a63..3550672e0f9e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1314,15 +1314,13 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; -/* Set the first system word of mm flags, non-atomically. 
*/ -static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) +/* Copy value to the first system word of mm flags, non-atomically. */ +static inline void __mm_flags_overwrite_word(struct mm_struct *mm, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); - - bitmap_copy(bitmap, &value, BITS_PER_LONG); + *ACCESS_PRIVATE(&mm->flags, __mm_flags) = value; } -/* Obtain a read-only view of the bitmap. */ +/* Obtain a read-only view of the mm flags bitmap. */ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) { return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags); @@ -1331,9 +1329,7 @@ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct /* Read the first system word of mm flags, non-atomically. */ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) { - const unsigned long *bitmap = __mm_flags_get_bitmap(mm); - - return bitmap_read(bitmap, 0, BITS_PER_LONG); + return *__mm_flags_get_bitmap(mm); } /* diff --git a/kernel/fork.c b/kernel/fork.c index dd0bb5fe4305..5e3309a2332c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1061,10 +1061,10 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (current->mm) { unsigned long flags = __mm_flags_get_word(current->mm); - __mm_flags_set_word(mm, mmf_init_legacy_flags(flags)); + __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - __mm_flags_set_word(mm, default_dump_filter); + __mm_flags_overwrite_word(mm, default_dump_filter); mm->def_flags = 0; } From 4c613f518f786fb0ca4850e4ca5f1933d6a4a304 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:01:01 +0000 Subject: [PATCH 312/321] tools/testing/vma: eliminate dependency on vma->__vm_flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The userland VMA test code relied on an internal implementation detail - the existence of vma->__vm_flags to directly access VMA flags. There is no need to do so when we have the vm_flags_*() helper functions available. This is ugly, but also a subsequent commit will eliminate this field altogether so this will shortly become broken. This patch has us utilise the helper functions instead. 
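As a quick reference, a hedged sketch of the helper family the test code is being moved onto (signatures paraphrased from include/linux/mm.h and may not be exact; locking requirements are elided here):

	#include <linux/mm.h>

	/* Approximate usage of the vm_flags_*() helpers; illustrative only. */
	static void vm_flags_helpers_sketch(struct vm_area_struct *vma,
					    vm_flags_t flags)
	{
		vm_flags_reset(vma, flags);		/* overwrite all flags */
		vm_flags_set(vma, VM_LOCKED);		/* set bits */
		vm_flags_clear(vma, VM_LOCKED);		/* clear bits */
		vm_flags_mod(vma, VM_READ, VM_EXEC);	/* set some, clear others */
	}
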
Link: https://lkml.kernel.org/r/6275c53a6bb20743edcbe92d3e130183b47d18d0.1764064557.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Acked-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/vma/vma.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index be79ab2ea44b..93d21bc7e112 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -69,18 +69,18 @@ static struct vm_area_struct *alloc_vma(struct mm_struct *mm, pgoff_t pgoff, vm_flags_t vm_flags) { - struct vm_area_struct *ret = vm_area_alloc(mm); + struct vm_area_struct *vma = vm_area_alloc(mm); - if (ret == NULL) + if (vma == NULL) return NULL; - ret->vm_start = start; - ret->vm_end = end; - ret->vm_pgoff = pgoff; - ret->__vm_flags = vm_flags; - vma_assert_detached(ret); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + vm_flags_reset(vma, vm_flags); + vma_assert_detached(vma); - return ret; + return vma; } /* Helper function to allocate a VMA and link it to the tree. */ @@ -714,7 +714,7 @@ static bool test_vma_merge_special_flags(void) for (i = 0; i < ARRAY_SIZE(special_flags); i++) { vm_flags_t special_flag = special_flags[i]; - vma_left->__vm_flags = vm_flags | special_flag; + vm_flags_reset(vma_left, vm_flags | special_flag); vmg.vm_flags = vm_flags | special_flag; vma = merge_new(&vmg); ASSERT_EQ(vma, NULL); @@ -736,7 +736,7 @@ static bool test_vma_merge_special_flags(void) for (i = 0; i < ARRAY_SIZE(special_flags); i++) { vm_flags_t special_flag = special_flags[i]; - vma_left->__vm_flags = vm_flags | special_flag; + vm_flags_reset(vma_left, vm_flags | special_flag); vmg.vm_flags = vm_flags | special_flag; vma = merge_existing(&vmg); ASSERT_EQ(vma, NULL); From 9ea35a25d51b13013b724943a177a7aaf4bfed71 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:01:02 +0000 Subject: [PATCH 313/321] mm: introduce VMA flags bitmap type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is useful to transition to using a bitmap for VMA flags so we can avoid running out of flags, especially for 32-bit kernels which are constrained to 32 flags, necessitating some features to be limited to 64-bit kernels only. By doing so, we remove any constraint on the number of VMA flags moving forwards no matter the platform and can decide in future to extend beyond 64 if required. 
We start by declaring an opaque type, vma_flags_t (which resembles the
mm_struct flags of type mm_flags_t), setting it to precisely the same size as
vm_flags_t, and placing it in union with vm_flags in the VMA declaration.

We additionally update struct vm_area_desc equivalently, placing the new
opaque type in union with vm_flags. This change therefore does not impact the
size of struct vm_area_struct or struct vm_area_desc.

In order for the change to be iterative and to avoid impacting performance, we
designate the VM_xxx declared bitmap flag values as those which must exist in
the first system word of the VMA flags bitmap.

We therefore declare vma_flags_clear_all(), vma_flags_overwrite_word(),
vma_flags_overwrite_word_once(), vma_flags_set_word() and
vma_flags_clear_word() in order to allow us to update the existing
vm_flags_*() functions to utilise these helpers. This is a stepping stone
towards converting users to the VMA flags bitmap and behaves precisely as
before.

By doing this, we can eliminate the existing private vma->__vm_flags field in
the vma->vm_flags union and replace it with a field of the newly introduced
opaque type vma_flags_t, which we call 'flags', so the new bitmap field is
referred to as vma->flags. We also update vma_flag_[test, set]_atomic() to
account for the change.

We adapt vm_flags_reset_once() to clear only those bits above the first system
word, providing write-once semantics for the first system word (which it is
presumed the caller requires - and in all current use cases this is so). As we
currently only specify that the VMA flags bitmap size is equal to
BITS_PER_LONG bits, this is a no-op, but it is defensive in preparation for a
future change that increases this.

We additionally update the VMA userland test declarations to implement the
same changes there.

Finally, we update the rust code to reference vma->vm_flags on update rather
than vma->__vm_flags, which has been removed. This is safe for now, albeit it
implicitly performs a const cast. Once we introduce flag helpers we can
improve this further.

No functional change intended.
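To make the aliasing concrete, the following sketch (illustrative only, not
part of the patch; vma_is_shared_writable() and vma_make_readonly() are
made-up example names) shows why read-side users are unaffected and how
writers route through the new word helpers:

    /* Readers keep working unchanged because vm_flags aliases the first
     * system word of vma->flags:
     */
    static inline bool vma_is_shared_writable(const struct vm_area_struct *vma)
    {
            return (vma->vm_flags & (VM_SHARED | VM_WRITE)) ==
                   (VM_SHARED | VM_WRITE);
    }

    /* Writers go through the existing helpers, which now operate on the
     * first system word of the bitmap:
     */
    static inline void vma_make_readonly(struct vm_area_struct *vma)
    {
            vm_flags_clear(vma, VM_WRITE | VM_MAYWRITE); /* write-locks the VMA */
    }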
Link: https://lkml.kernel.org/r/bab179d7b153ac12f221b7d65caac2759282cfe9.1764064557.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 24 +++-- include/linux/mm_types.h | 64 ++++++++++++- rust/kernel/mm/virt.rs | 2 +- tools/testing/vma/vma_internal.h | 150 ++++++++++++++++++++++++------- 4 files changed, 202 insertions(+), 38 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a2f38fb68840..2887d3b34d3e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -911,7 +911,8 @@ static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); - ACCESS_PRIVATE(vma, __vm_flags) = flags; + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); } /* @@ -931,14 +932,25 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); - WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); + /* + * If VMA flags exist beyond the first system word, also clear these. It + * is assumed the write once behaviour is required only for the first + * system word. + */ + if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + + bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); + } + + vma_flags_overwrite_word_once(&vma->flags, flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); - ACCESS_PRIVATE(vma, __vm_flags) |= flags; + vma_flags_set_word(&vma->flags, flags); } static inline void vm_flags_clear(struct vm_area_struct *vma, @@ -946,7 +958,7 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, { VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_start_write(vma); - ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; + vma_flags_clear_word(&vma->flags, flags); } /* @@ -989,12 +1001,14 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, static inline void vma_flag_set_atomic(struct vm_area_struct *vma, vma_flag_t bit) { + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + /* mmap read lock/VMA read lock must be held. 
*/ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) vma_assert_locked(vma); if (__vma_flag_atomic_valid(vma, bit)) - set_bit((__force int)bit, &ACCESS_PRIVATE(vma, __vm_flags)); + set_bit((__force int)bit, bitmap); } /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3550672e0f9e..b71625378ce3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -848,6 +848,15 @@ struct mmap_action { bool hide_from_rmap_until_complete :1; }; +/* + * Opaque type representing current VMA (vm_area_struct) flag state. Must be + * accessed via vma_flags_xxx() helper functions. + */ +#define NUM_VMA_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); +} __private vma_flags_t; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -865,7 +874,10 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *vm_file; - vm_flags_t vm_flags; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; pgprot_t page_prot; /* Write-only fields. */ @@ -910,10 +922,12 @@ struct vm_area_struct { /* * Flags, see mm.h. * To modify use vm_flags_{init|reset|set|clear|mod} functions. + * Preferably, use vma_flags_xxx() functions. */ union { + /* Temporary while VMA flags are being converted. */ const vm_flags_t vm_flags; - vm_flags_t __private __vm_flags; + vma_flags_t flags; }; #ifdef CONFIG_PER_VMA_LOCK @@ -994,6 +1008,52 @@ struct vm_area_struct { #endif } __randomize_layout; +/* Clears all bits in the VMA flags bitmap, non-atomically. */ +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); +} + +/* + * Copy value to the first system word of VMA flags, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +{ + *ACCESS_PRIVATE(flags, __vma_flags) = value; +} + +/* + * Copy value to the first system word of VMA flags ONCE, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + WRITE_ONCE(*bitmap, value); +} + +/* Update the first system word of VMA flags setting bits, non-atomically. */ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. */ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + #ifdef CONFIG_NUMA #define vma_policy(vma) ((vma)->vm_policy) #else diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index a1bfa4e19293..da21d65ccd20 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -250,7 +250,7 @@ impl VmaNew { // SAFETY: This is not a data race: the vma is undergoing initial setup, so it's not yet // shared. Additionally, `VmaNew` is `!Sync`, so it cannot be used to write in parallel. // The caller promises that this does not set the flags to an invalid value. 
- unsafe { (*self.as_ptr()).__bindgen_anon_2.__vm_flags = flags }; + unsafe { (*self.as_ptr()).__bindgen_anon_2.vm_flags = flags }; } /// Set the `VM_MIXEDMAP` flag on this vma. diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index b7e8fc9ccdf4..9f0a9f5ed0fe 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -524,6 +524,15 @@ typedef struct { __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } mm_flags_t; +/* + * Opaque type representing current VMA (vm_area_struct) flag state. Must be + * accessed via vma_flags_xxx() helper functions. + */ +#define NUM_VMA_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); +} __private vma_flags_t; + struct mm_struct { struct maple_tree mm_mt; int map_count; /* number of VMAs */ @@ -608,7 +617,10 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *vm_file; - vm_flags_t vm_flags; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; pgprot_t page_prot; /* Write-only fields. */ @@ -654,7 +666,7 @@ struct vm_area_struct { */ union { const vm_flags_t vm_flags; - vm_flags_t __private __vm_flags; + vma_flags_t flags; }; #ifdef CONFIG_PER_VMA_LOCK @@ -1368,26 +1380,6 @@ static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, return true; } -static inline void vm_flags_init(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma->__vm_flags = flags; -} - -static inline void vm_flags_set(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma->__vm_flags |= flags; -} - -static inline void vm_flags_clear(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma->__vm_flags &= ~flags; -} - static inline int shmem_zero_setup(struct vm_area_struct *vma) { return 0; @@ -1544,13 +1536,118 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } -# define ACCESS_PRIVATE(p, member) ((p)->member) +#define ACCESS_PRIVATE(p, member) ((p)->member) + +#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) + +static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) +{ + unsigned int len = bitmap_size(nbits); + + if (small_const_nbits(nbits)) + *dst = 0; + else + memset(dst, 0, len); +} static inline bool mm_flags_test(int flag, const struct mm_struct *mm) { return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } +/* Clears all bits in the VMA flags bitmap, non-atomically. */ +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); +} + +/* + * Copy value to the first system word of VMA flags, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +{ + *ACCESS_PRIVATE(flags, __vma_flags) = value; +} + +/* + * Copy value to the first system word of VMA flags ONCE, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + WRITE_ONCE(*bitmap, value); +} + +/* Update the first system word of VMA flags setting bits, non-atomically. 
*/ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. */ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + + +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); +} + +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. + */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + /* + * The user should only be interested in avoiding reordering of + * assignment to the first word. + */ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word_once(&vma->flags, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_set_word(&vma->flags, flags); +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_clear_word(&vma->flags, flags); +} + /* * Denies creating a writable executable mapping or gaining executable permissions. * @@ -1763,11 +1860,4 @@ static inline int do_munmap(struct mm_struct *, unsigned long, size_t, return 0; } -static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) -{ - vm_flags_t *dst = (vm_flags_t *)(&vma->vm_flags); - - *dst = flags; -} - #endif /* __MM_VMA_INTERNAL_H */ From f65372cd7acbe3c4980d404e99a7017afed607b4 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 25 Nov 2025 09:19:30 +0100 Subject: [PATCH 314/321] mm: fix DEBUG_RODATA_TEST indentation in Kconfig Most of the DEBUG_RODATA_TEST section is indented by four spaces instead of the customary single TAB. Link: https://lkml.kernel.org/r/74f39b1bffc6ed802088cb3e7d17b4c82330e8b3.1764058676.git.geert@linux-m68k.org Fixes: 2959a5f726f6 ("mm: add arch-independent testcases for RODATA") Signed-off-by: Geert Uytterhoeven Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Anshuman Khandual Cc: Jinbum Park Cc: Kees Cook Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/Kconfig.debug | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 32b65073d0cc..7638d75b27db 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -175,10 +175,10 @@ config DEBUG_PAGE_REF nil until the tracepoints are actually enabled. config DEBUG_RODATA_TEST - bool "Testcase for the marking rodata read-only" - depends on STRICT_KERNEL_RWX + bool "Testcase for the marking rodata read-only" + depends on STRICT_KERNEL_RWX help - This option enables a testcase for the setting rodata read-only. + This option enables a testcase for the setting rodata read-only. 
 config ARCH_HAS_DEBUG_WX
 	bool

From 0384c8ea96bfe49e82e624e53bfd5f80c3230ea9 Mon Sep 17 00:00:00 2001
From: Ankit Khushwaha
Date: Wed, 26 Nov 2025 21:38:30 +0530
Subject: [PATCH 315/321] selftests/mm/uffd: initialize char variable to NUL

In "uffd-stress.c" and "uffd-unit-tests.c", the address of an uninitialized
char variable (holding a garbage value) is passed to the 'write' syscall,
which triggers the following warnings:

uffd-stress.c:246:39: warning: variable 'c' is uninitialized when passed as a
const pointer argument here [-Wuninitialized-const-pointer]

uffd-unit-tests.c:581:31: warning: variable 'c' is uninitialized when passed
as a const pointer argument here [-Wuninitialized-const-pointer]

The fix is to initialize the char variable to '\0' so that no garbage value
is written.

Link: https://lkml.kernel.org/r/20251126160830.52124-1-ankitkhushwaha.linux@gmail.com
Signed-off-by: Ankit Khushwaha
Reviewed-by: Mike Rapoport (Microsoft)
Cc: Bill Wendling
Cc: Justin Stitt
Cc: Liam Howlett
Cc: Lorenzo Stoakes
Cc: Michal Hocko
Cc: Nathan Chancellor
Cc: Peter Xu
Cc: Shuah Khan
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/mm/uffd-stress.c     | 2 +-
 tools/testing/selftests/mm/uffd-unit-tests.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c
index b51c89e1cd1a..700fbaa18d44 100644
--- a/tools/testing/selftests/mm/uffd-stress.c
+++ b/tools/testing/selftests/mm/uffd-stress.c
@@ -241,7 +241,7 @@ static int stress(struct uffd_args *args)
 		return 1;
 
 	for (cpu = 0; cpu < gopts->nr_parallel; cpu++) {
-		char c;
+		char c = '\0';
 		if (bounces & BOUNCE_POLL) {
 			if (write(gopts->pipefd[cpu*2+1], &c, 1) != 1)
 				err("pipefd write error");
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
index f917b4c4c943..f4807242c5b2 100644
--- a/tools/testing/selftests/mm/uffd-unit-tests.c
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -543,7 +543,7 @@ static void uffd_minor_test_common(uffd_global_test_opts_t *gopts, bool test_col
 {
 	unsigned long p;
 	pthread_t uffd_mon;
-	char c;
+	char c = '\0';
 	struct uffd_args args = { 0 };
 
 	args.gopts = gopts;
@@ -759,7 +759,7 @@ static void uffd_sigbus_test_common(uffd_global_test_opts_t *gopts, bool wp)
 	pthread_t uffd_mon;
 	pid_t pid;
 	int err;
-	char c;
+	char c = '\0';
 	struct uffd_args args = { 0 };
 
 	args.gopts = gopts;
@@ -819,7 +819,7 @@ static void uffd_events_test_common(uffd_global_test_opts_t *gopts, bool wp)
 	pthread_t uffd_mon;
 	pid_t pid;
 	int err;
-	char c;
+	char c = '\0';
 	struct uffd_args args = { 0 };
 
 	args.gopts = gopts;
@@ -1125,7 +1125,7 @@ uffd_move_test_common(uffd_global_test_opts_t *gopts,
 {
 	unsigned long nr;
 	pthread_t uffd_mon;
-	char c;
+	char c = '\0';
 	unsigned long long count;
 	struct uffd_args args = { 0 };
 	char *orig_area_src = NULL, *orig_area_dst = NULL;

From f3b566d726357df591602f195a9379494f005225 Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Wed, 26 Nov 2025 02:04:35 +0000
Subject: [PATCH 316/321] memcg: remove inc/dec_lruvec_kmem_state helpers

The dec_lruvec_kmem_state helper is unused by any caller and can be safely
removed. Meanwhile, the inc_lruvec_kmem_state helper is only referenced by
shadow_lru_isolate, so retaining these two helpers is unnecessary.

This patch removes both helper functions to eliminate redundant code.
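Any remaining caller (e.g. out of tree) can be converted mechanically; in this
sketch 'p' and 'idx' stand for whatever object and node_stat_item value the
caller already passes:

    mod_lruvec_kmem_state(p, idx, 1);       /* was inc_lruvec_kmem_state(p, idx) */
    mod_lruvec_kmem_state(p, idx, -1);      /* was dec_lruvec_kmem_state(p, idx) */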
Link: https://lkml.kernel.org/r/20251126020435.1511637-1-chenridong@huaweicloud.com
Signed-off-by: Chen Ridong
Acked-by: Qi Zheng
Acked-by: Shakeel Butt
Cc: Axel Rasmussen
Cc: Johannes Weiner
Cc: Lorenzo Stoakes
Cc: Lu Jialin
Cc: Michal Hocko
Cc: Muchun Song
Cc: Roman Gushchin
Cc: Wei Xu
Cc: Yuanchu Xie
Signed-off-by: Andrew Morton
---
 include/linux/memcontrol.h | 10 ----------
 mm/workingset.c            |  2 +-
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d35390f9892a..0651865a4564 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1452,16 +1452,6 @@ struct slabobj_ext {
 #endif
 } __aligned(8);
 
-static inline void inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
-{
-	mod_lruvec_kmem_state(p, idx, 1);
-}
-
-static inline void dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
-{
-	mod_lruvec_kmem_state(p, idx, -1);
-}
-
 static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
 {
 	struct mem_cgroup *memcg;
diff --git a/mm/workingset.c b/mm/workingset.c
index 6ff30369b758..1399d6da75a2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -749,7 +749,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	if (WARN_ON_ONCE(node->count != node->nr_values))
 		goto out_invalid;
 	xa_delete_node(node, workingset_update_node);
-	inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
+	mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
 
 out_invalid:
 	xa_unlock_irq(&mapping->i_pages);

From ce2bba89566bef9d4a0ff2122ee75739a72a92be Mon Sep 17 00:00:00 2001
From: Breno Leitao
Date: Wed, 26 Nov 2025 09:46:18 -0800
Subject: [PATCH 317/321] mm/kfence: add reboot notifier to disable KFENCE on shutdown

During system shutdown, KFENCE can cause IPI synchronization issues if it
remains active through the reboot process. To prevent this, register a reboot
notifier that disables KFENCE and cancels any pending timer work early in the
shutdown sequence.

This is only necessary when CONFIG_KFENCE_STATIC_KEYS is enabled, as this
configuration sends IPIs that can interfere with shutdown. Without static
keys, no IPIs are generated and KFENCE can safely remain active.

The notifier uses maximum priority (INT_MAX) to ensure KFENCE shuts down
before other subsystems that might still depend on stable memory allocation
behavior.

This fixes a late kexec CSD lockup[1] when kfence is trying to IPI a CPU that
is busy in an IRQ-disabled context printing characters to the console.
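The callback added below follows the standard reboot-notifier pattern; as a
generic orientation sketch (the names here are illustrative, the real KFENCE
callback is in the diff that follows):

    #include <linux/notifier.h>
    #include <linux/reboot.h>

    static int example_reboot_cb(struct notifier_block *nb, unsigned long action,
                                 void *data)
    {
            /* Quiesce the subsystem before reboot/kexec continues. */
            return NOTIFY_OK;
    }

    static struct notifier_block example_reboot_nb = {
            .notifier_call  = example_reboot_cb,
            .priority       = INT_MAX,      /* run ahead of lower-priority notifiers */
    };

    /* Registered once at init time: register_reboot_notifier(&example_reboot_nb); */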
Link: https://lkml.kernel.org/r/20251127-kfence-v2-1-daeccb5ef9aa@debian.org Link: https://lkml.kernel.org/r/20251126-kfence-v1-1-5a6e1d7c681c@debian.org Link: https://lore.kernel.org/all/sqwajvt7utnt463tzxgwu2yctyn5m6bjwrslsnupfexeml6hkd@v6sqmpbu3vvu/ [1] Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure") Signed-off-by: Breno Leitao Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitriy Vyukov Signed-off-by: Andrew Morton --- mm/kfence/core.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 727c20c94ac5..162a026871ab 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -820,6 +821,25 @@ static struct notifier_block kfence_check_canary_notifier = { static struct delayed_work kfence_timer; #ifdef CONFIG_KFENCE_STATIC_KEYS +static int kfence_reboot_callback(struct notifier_block *nb, + unsigned long action, void *data) +{ + /* + * Disable kfence to avoid static keys IPI synchronization during + * late shutdown/kexec + */ + WRITE_ONCE(kfence_enabled, false); + /* Cancel any pending timer work */ + cancel_delayed_work_sync(&kfence_timer); + + return NOTIFY_OK; +} + +static struct notifier_block kfence_reboot_notifier = { + .notifier_call = kfence_reboot_callback, + .priority = INT_MAX, /* Run early to stop timers ASAP */ +}; + /* Wait queue to wake up allocation-gate timer task. */ static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); @@ -901,6 +921,10 @@ static void kfence_init_enable(void) if (kfence_check_on_panic) atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier); +#ifdef CONFIG_KFENCE_STATIC_KEYS + register_reboot_notifier(&kfence_reboot_notifier); +#endif + WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); From 12f0cd393369d700c16b47bc33e4120dc8b2c608 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 27 Nov 2025 17:45:17 +0000 Subject: [PATCH 318/321] fs/proc/task_mmu.c: fix make_uffd_wp_huge_pte() huge pte handling make_uffd_wp_huge_pte() should return after handling a huge_pte_none() pte. Link: https://lkml.kernel.org/r/66178124-ebdf-4e23-b8ca-ed3eb8030c81@lucifer.local Fixes: 03bfbc3ad6e4 ("mm: remove is_hugetlb_entry_[migration, hwpoisoned]()") Signed-off-by: Lorenzo Stoakes Reported-by: Vlastimil Babka Closes: https://lkml.kernel.org/r/dc483db3-be4d-45f7-8b40-a28f5d8f5738@suse.cz Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d00ac179d973..81dfc26bfae8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2500,9 +2500,11 @@ static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, const unsigned long psize = huge_page_size(hstate_vma(vma)); softleaf_t entry; - if (huge_pte_none(ptent)) + if (huge_pte_none(ptent)) { set_huge_pte_at(vma->vm_mm, addr, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), psize); + return; + } entry = softleaf_from_pte(ptent); if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry)) From f9e82f99b3771eef396dbf97e0f3c76e20af60dd Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Thu, 27 Nov 2025 19:03:02 +0900 Subject: [PATCH 319/321] mm/swapfile: fix list iteration when next node is removed during discard Patch series "mm/swapfile: fix and cleanup swap list iterations", v2. 
This series fixes a potential list iteration issue in swap_sync_discard() when devices are removed, and includes a cleanup for __folio_throttle_swaprate(). This patch (of 2): When the next node is removed from the plist (e.g. by swapoff), plist_del() makes the node point to itself, causing the iteration to loop on the same entry indefinitely. Add a plist_node_empty() check to detect this case and restart iteration, allowing swap_sync_discard() to continue processing remaining swap devices that still have pending discard entries. Additionally, switch from swap_avail_lock/swap_avail_head to swap_lock/swap_active_head so that iteration is only affected by swapoff operations rather than frequent availability changes, reducing exceptional condition checks and lock contention. Link: https://lkml.kernel.org/r/20251127100303.783198-1-youngjun.park@lge.com Link: https://lkml.kernel.org/r/20251127100303.783198-2-youngjun.park@lge.com Fixes: 686ea517f471 ("mm, swap: do not perform synchronous discard during allocation") Signed-off-by: Youngjun Park Suggested-by: Kairui Song Acked-by: Kairui Song Reviewed-by: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index d12332423a06..8116f36e440b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1387,9 +1387,10 @@ static bool swap_sync_discard(void) bool ret = false; struct swap_info_struct *si, *next; - spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { - spin_unlock(&swap_avail_lock); + spin_lock(&swap_lock); +start_over: + plist_for_each_entry_safe(si, next, &swap_active_head, list) { + spin_unlock(&swap_lock); if (get_swap_device_info(si)) { if (si->flags & SWP_PAGE_DISCARD) ret = swap_do_scheduled_discard(si); @@ -1397,9 +1398,12 @@ static bool swap_sync_discard(void) } if (ret) return true; - spin_lock(&swap_avail_lock); + + spin_lock(&swap_lock); + if (plist_node_empty(&next->list)) + goto start_over; } - spin_unlock(&swap_avail_lock); + spin_unlock(&swap_lock); return false; } From b60a3ef7848dbcc9549137f8d42dcd82210c0b66 Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Thu, 27 Nov 2025 19:03:03 +0900 Subject: [PATCH 320/321] mm/swapfile: use plist_for_each_entry in __folio_throttle_swaprate The loop breaks immediately after finding the first swap device and never modifies the list. Replace plist_for_each_entry_safe() with plist_for_each_entry() and remove the unused next variable. 
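As a rule of thumb for these two patches: the _safe iterator only protects
against the loop body deleting the *current* entry while the list lock is
held; it does not help when the lock is dropped and the saved 'next' node
itself is removed, which is exactly the case the previous patch handles with
the plist_node_empty() restart. A minimal sketch of the distinction
(should_remove() is a made-up predicate; si, next and head mirror the swap
code):

    /* Deleting entries during the walk - the 'next' cursor keeps this safe
     * as long as the lock protecting the list is held throughout:
     */
    plist_for_each_entry_safe(si, next, &head, list) {
            if (should_remove(si))          /* hypothetical predicate */
                    plist_del(&si->list, &head);
    }

    /* Read-only walk that stops at the first match - the plain iterator
     * is sufficient, as in __folio_throttle_swaprate() after this patch:
     */
    plist_for_each_entry(si, &head, list) {
            if (si->bdev)
                    break;
    }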
Link: https://lkml.kernel.org/r/20251127100303.783198-3-youngjun.park@lge.com Signed-off-by: Youngjun Park Reviewed-by: Baoquan He Acked-by: Kairui Song Acked-by: Chris Li Cc: Barry Song Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 8116f36e440b..46d2008e4b99 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -4023,7 +4023,7 @@ static bool __has_usable_swap(void) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { - struct swap_info_struct *si, *next; + struct swap_info_struct *si; if (!(gfp & __GFP_IO)) return; @@ -4042,8 +4042,7 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_head, - avail_list) { + plist_for_each_entry(si, &swap_avail_head, avail_list) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; From faf3c923523e5c8fc3baaa413d62e913774ae52f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 28 Nov 2025 04:00:58 +0000 Subject: [PATCH 321/321] mm: fix vma_start_write_killable() signal handling If we get a signal, we need to restore the vm_refcnt. We don't think that the refcount can actually be decremented to zero here as it requires the VMA to be detached, and the vma_mark_detached() uses TASK_UNINTERRUPTIBLE. However, that's a bit subtle, so handle it as if the refcount was zero at the start of this function. Link: https://lkml.kernel.org/r/20251128040100.3022561-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reported-by: syzbot+5b19bad23ac7f44bf8b8@syzkaller.appspotmail.com Fixes: 2197bb60f890 ("mm: add vma_start_write_killable()") Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap_lock.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index e6e5570d1ec7..7421b7ea8001 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -74,6 +74,14 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, refcount_read(&vma->vm_refcnt) == tgt_refcnt, state); if (err) { + if (refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt)) { + /* + * The wait failed, but the last reader went away + * as well. Tell the caller the VMA is detached. + */ + WARN_ON_ONCE(!detaching); + err = 0; + } rwsem_release(&vma->vmlock_dep_map, _RET_IP_); return err; }
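For completeness, the caller-side pattern for the killable variant presumably
looks like the sketch below; the 0/-EINTR return convention is an assumption
mirroring other *_killable primitives, not something stated by this patch:

    int err = vma_start_write_killable(vma);        /* return convention assumed */

    if (err)
            return err;     /* a fatal signal arrived while waiting for readers */
    /* VMA is now write-locked; proceed with the modification. */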