linux/mm/vma_exec.c
Chunyan Zhang 277a1ae387 mm: softdirty: add pgtable_supports_soft_dirty()
Patch series "mm: Add soft-dirty and uffd-wp support for RISC-V", v15.

This patchset adds support for the Svrsw60t59b [1] extension, which is
now ratified, and adds soft-dirty and userfaultfd write-protect tracking
for RISC-V.

Patches 1 and 2 add macros that allow architectures to define their own
checks for whether the soft-dirty / uffd-wp PTE bits are available; for
RISC-V, this means checking whether the Svrsw60t59b extension is
supported on the device the kernel is running on.  Patches 1 and 2 also
remove "ifdef CONFIG_MEM_SOFT_DIRTY", "ifdef
CONFIG_HAVE_ARCH_USERFAULTFD_WP" and "ifdef CONFIG_PTE_MARKER_UFFD_WP"
in favor of checks which, if not overridden by the architecture, leave
the behavior unchanged.
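
As a rough sketch of the override mechanism (names here are
illustrative, not taken from the series; arch_has_hw_soft_dirty() is a
hypothetical stand-in for whatever runtime check an architecture such as
RISC-V actually uses for the Svrsw60t59b extension), an architecture
would provide its own definition of the check:

  /* Hypothetical per-arch override: require both the Kconfig option
   * and the hardware capability detected at runtime. */
  #define pgtable_supports_soft_dirty()	\
	(IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && arch_has_hw_soft_dirty())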

This patchset has been tested with the kselftest mm suite: soft-dirty,
madv_populate, test_unmerge_uffd_wp and uffd-unit-tests all run and
pass, and no regressions were observed in any of the other tests.


This patch (of 6):

Some platforms can customize the PTE/PMD entry soft-dirty bit, making it
unavailable even when the architecture provides the resource.

Add an API through which architectures can define their own
implementation to detect whether the soft-dirty bit is available on the
device the kernel is running on.

This patch removes "ifdef CONFIG_MEM_SOFT_DIRTY" in favor of
pgtable_supports_soft_dirty() checks, which default to
IS_ENABLED(CONFIG_MEM_SOFT_DIRTY); if not overridden by the
architecture, no change in behavior is expected.
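
The generic fallback can be pictured roughly as follows (a sketch of the
idea, not the exact hunk): if the architecture does not define the
macro, it reduces to the Kconfig check, so existing architectures see no
behavioral change:

  #ifndef pgtable_supports_soft_dirty
  #define pgtable_supports_soft_dirty()	IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)
  #endif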

We make sure to never set VM_SOFTDIRTY if !pgtable_supports_soft_dirty(),
so we will never run into VM_SOFTDIRTY checks.
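
Call sites that used to set the flag unconditionally under the old ifdef
now gate it on the runtime check; create_init_stack_vma() in the file
below, for example, does:

  if (pgtable_supports_soft_dirty())
	flags |= VM_SOFTDIRTY;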

[lorenzo.stoakes@oracle.com: fix VMA selftests]
  Link: https://lkml.kernel.org/r/dac6ddfe-773a-43d5-8f69-021b9ca4d24b@lucifer.local
Link: https://lkml.kernel.org/r/20251113072806.795029-1-zhangchunyan@iscas.ac.cn
Link: https://lkml.kernel.org/r/20251113072806.795029-2-zhangchunyan@iscas.ac.cn
Link: https://github.com/riscv-non-isa/riscv-iommu/pull/543 [1]
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Conor Dooley <conor@kernel.org>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:54 -08:00

// SPDX-License-Identifier: GPL-2.0-only

/*
 * Functions explicitly implemented for exec functionality which however are
 * explicitly VMA-only logic.
 */

#include "vma_internal.h"
#include "vma.h"

/*
 * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
 * this VMA and its relocated range, which will now reside at [vma->vm_start -
 * shift, vma->vm_end - shift).
 *
 * This function is almost certainly NOT what you want for anything other than
 * early executable temporary stack relocation.
 */
int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
{
	/*
	 * The process proceeds as follows:
	 *
	 *  1) Use shift to calculate the new vma endpoints.
	 *  2) Extend vma to cover both the old and new ranges. This ensures the
	 *     arguments passed to subsequent functions are consistent.
	 *  3) Move vma's page tables to the new range.
	 *  4) Free up any cleared pgd range.
	 *  5) Shrink the vma to cover only the new range.
	 */
	struct mm_struct *mm = vma->vm_mm;
	unsigned long old_start = vma->vm_start;
	unsigned long old_end = vma->vm_end;
	unsigned long length = old_end - old_start;
	unsigned long new_start = old_start - shift;
	unsigned long new_end = old_end - shift;
	VMA_ITERATOR(vmi, mm, new_start);
	VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
	struct vm_area_struct *next;
	struct mmu_gather tlb;
	PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);

	BUG_ON(new_start > new_end);

	/*
	 * ensure there are no vmas between where we want to go
	 * and where we are
	 */
	if (vma != vma_next(&vmi))
		return -EFAULT;

	vma_iter_prev_range(&vmi);

	/*
	 * cover the whole range: [new_start, old_end)
	 */
	vmg.target = vma;
	if (vma_expand(&vmg))
		return -ENOMEM;

	/*
	 * move the page tables downwards, on failure we rely on
	 * process cleanup to remove whatever mess we made.
	 */
	pmc.for_stack = true;
	if (length != move_page_tables(&pmc))
		return -ENOMEM;

	tlb_gather_mmu(&tlb, mm);
	next = vma_next(&vmi);
	if (new_end > old_start) {
		/*
		 * when the old and new regions overlap clear from new_end.
		 */
		free_pgd_range(&tlb, new_end, old_end, new_end,
			next ? next->vm_start : USER_PGTABLES_CEILING);
	} else {
		/*
		 * otherwise, clean from old_start; this is done to not touch
		 * the address space in [new_end, old_start) some architectures
		 * have constraints on va-space that make this illegal (IA64) -
		 * for the others its just a little faster.
		 */
		free_pgd_range(&tlb, old_start, old_end, new_end,
			next ? next->vm_start : USER_PGTABLES_CEILING);
	}
	tlb_finish_mmu(&tlb);

	vma_prev(&vmi);
	/* Shrink the vma to just the new range */
	return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
}

/*
 * Establish the stack VMA in an execve'd process, located temporarily at the
 * maximum stack address provided by the architecture.
 *
 * We later relocate this downwards in relocate_vma_down().
 *
 * This function is almost certainly NOT what you want for anything other than
 * early executable initialisation.
 *
 * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the
 * maximum addressable location in the stack (that is capable of storing a
 * system word of data).
 */
int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
			  unsigned long *top_mem_p)
{
	unsigned long flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
	int err;
	struct vm_area_struct *vma = vm_area_alloc(mm);

	if (!vma)
		return -ENOMEM;

	vma_set_anonymous(vma);

	if (mmap_write_lock_killable(mm)) {
		err = -EINTR;
		goto err_free;
	}

	/*
	 * Need to be called with mmap write lock
	 * held, to avoid race with ksmd.
	 */
	err = ksm_execve(mm);
	if (err)
		goto err_ksm;

	/*
	 * Place the stack at the largest stack address the architecture
	 * supports. Later, we'll move this to an appropriate place. We don't
	 * use STACK_TOP because that can depend on attributes which aren't
	 * configured yet.
	 */
	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
	vma->vm_end = STACK_TOP_MAX;
	vma->vm_start = vma->vm_end - PAGE_SIZE;
	if (pgtable_supports_soft_dirty())
		flags |= VM_SOFTDIRTY;
	vm_flags_init(vma, flags);
	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

	err = insert_vm_struct(mm, vma);
	if (err)
		goto err;

	mm->stack_vm = mm->total_vm = 1;

	mmap_write_unlock(mm);

	*vmap = vma;
	*top_mem_p = vma->vm_end - sizeof(void *);
	return 0;

err:
	ksm_exit(mm);
err_ksm:
	mmap_write_unlock(mm);
err_free:
	*vmap = NULL;
	vm_area_free(vma);
	return err;
}