From 969493d7d6f957e1670a025330561d5912cd194c Mon Sep 17 00:00:00 2001 From: Zhou Yuhang Date: Mon, 29 Sep 2025 14:24:34 +0800 Subject: [PATCH 001/139] samples: fix coding style issues in Kconfig Fix some coding style issues in Kconfig: use one tab to indent lines under a config definition, and use an additional two spaces to indent help text. Link: https://lkml.kernel.org/r/20250929062434.4114607-1-zhouyuhang1010@163.com Signed-off-by: Zhou Yuhang Signed-off-by: Andrew Morton --- samples/Kconfig | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 6e072a5f1ed8..5bc7c9e5a59e 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -23,11 +23,11 @@ config SAMPLE_TRACE_CUSTOM_EVENTS This builds the custom trace event example module. config SAMPLE_TRACE_PRINTK - tristate "Build trace_printk module - tests various trace_printk formats" + tristate "Build trace_printk module - tests various trace_printk formats" depends on EVENT_TRACING && m help - This builds a module that calls trace_printk() and can be used to - test various trace_printk() calls from a module. + This builds a module that calls trace_printk() and can be used to + test various trace_printk() calls from a module. config SAMPLE_FTRACE_DIRECT tristate "Build register_ftrace_direct() example" @@ -54,11 +54,11 @@ config SAMPLE_FTRACE_OPS measures the time taken to invoke one function a number of times. config SAMPLE_TRACE_ARRAY - tristate "Build sample module for kernel access to Ftrace instances" + tristate "Build sample module for kernel access to Ftrace instances" depends on EVENT_TRACING && m help - This builds a module that demonstrates the use of various APIs to - access Ftrace instances from within the kernel. + This builds a module that demonstrates the use of various APIs to + access Ftrace instances from within the kernel. config SAMPLE_KOBJECT tristate "Build kobject examples" @@ -290,11 +290,11 @@ config SAMPLE_CORESIGHT_SYSCFG configurations and easily load them into the system at runtime. config SAMPLE_KMEMLEAK - tristate "Simple test for the kernel memory leak detector" - depends on DEBUG_KMEMLEAK && m - help - Build a sample program which have explicitly leaks memory to test - kmemleak + tristate "Simple test for the kernel memory leak detector" + depends on DEBUG_KMEMLEAK && m + help + Build a sample program which have explicitly leaks memory to test + kmemleak. config SAMPLE_CGROUP bool "Build cgroup sample code" From fc387a0704cc86cf47de0e64236577af3e148ec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Onur=20=C3=96zkan?= Date: Wed, 17 Sep 2025 20:37:24 +0300 Subject: [PATCH 002/139] checkpatch: detect unhandled placeholders in cover letters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new check PLACEHOLDER_USE to detect unhandled placeholders. This prevents sending patch series with incomplete patches (mostly in cover letters) containing auto generated subject or blurb lines. These placeholders can be seen on mailing lists. With this change, checkpatch will emit an error when such text is found. Link: https://lkml.kernel.org/r/20250917173725.22547-2-work@onurozkan.dev Signed-off-by: Onur Özkan Acked-by: Joe Perches Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Jonathan Corbet Cc: Lukas Bulwahn Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 92669904eecc..6729f18e5654 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3345,6 +3345,13 @@ sub process { } } +# Check for auto-generated unhandled placeholder text (mostly for cover letters) + if (($in_commit_log || $in_header_lines) && + $rawline =~ /(?:SUBJECT|BLURB) HERE/) { + ERROR("PLACEHOLDER_USE", + "Placeholder text detected\n" . $herecurr); + } + # Check for git id commit length and improperly formed commit descriptions # A correctly formed commit description is: # commit ("Complete commit subject") From 55b453ed53d28da0b8d2316741e11eb22ceb39d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Onur=20=C3=96zkan?= Date: Wed, 17 Sep 2025 20:37:25 +0300 Subject: [PATCH 003/139] checkpatch: document new check PLACEHOLDER_USE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds documentation for the new check PLACEHOLDER_USE in checkpatch. Link: https://lkml.kernel.org/r/20250917173725.22547-3-work@onurozkan.dev Signed-off-by: Onur Özkan Acked-by: Joe Perches Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Jonathan Corbet Cc: Lukas Bulwahn Signed-off-by: Andrew Morton --- Documentation/dev-tools/checkpatch.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/dev-tools/checkpatch.rst b/Documentation/dev-tools/checkpatch.rst index d5c47e560324..4dccd1036870 100644 --- a/Documentation/dev-tools/checkpatch.rst +++ b/Documentation/dev-tools/checkpatch.rst @@ -1245,6 +1245,16 @@ Others The patch file does not appear to be in unified-diff format. Please regenerate the patch file before sending it to the maintainer. + **PLACEHOLDER_USE** + Detects unhandled placeholder text left in cover letters or commit headers/logs. + Common placeholders include lines like:: + + *** SUBJECT HERE *** + *** BLURB HERE *** + + These typically come from autogenerated templates. Replace them with a proper + subject and description before sending. + **PRINTF_0XDECIMAL** Prefixing 0x with decimal output is defective and should be corrected. From 41a5e8777093f7946fade3737363ef045303b8dc Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 9 Oct 2025 13:23:48 +0300 Subject: [PATCH 004/139] ocfs2: add extra flags check in ocfs2_ioctl_move_extents() In 'ocfs2_ioctl_move_extents()', add extra check whether only actually supported flags are passed via 'ioctl(..., OCFS2_IOC_MOVE_EXT, ...)', and reject anything beyond OCFS2_MOVE_EXT_FL_AUTO_DEFRAG and OCFS2_MOVE_EXT_FL_PART_DEFRAG with -EINVAL. In particular, OCFS2_MOVE_EXT_FL_COMPLETE may be set by the kernel only and should never be passed from userspace. Link: https://lkml.kernel.org/r/20251009102349.181126-1-dmantipov@yandex.ru Signed-off-by: Dmitry Antipov Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/move_extents.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 10923bf7c8b8..bc83dc5ab5bd 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -1036,6 +1036,12 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) if (range.me_threshold > i_size_read(inode)) range.me_threshold = i_size_read(inode); + if (range.me_flags & ~(OCFS2_MOVE_EXT_FL_AUTO_DEFRAG | + OCFS2_MOVE_EXT_FL_PART_DEFRAG)) { + status = -EINVAL; + goto out_free; + } + if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { context->auto_defrag = 1; From 8a7d58845fae061c62b50bc5eeb9bae4a1dedc3d Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 9 Oct 2025 13:23:49 +0300 Subject: [PATCH 005/139] ocfs2: relax BUG() to ocfs2_error() in __ocfs2_move_extent() In '__ocfs2_move_extent()', relax 'BUG()' to 'ocfs2_error()' just to avoid crashing the whole kernel due to a filesystem corruption. Fixes: 8f603e567aa7 ("Ocfs2/move_extents: move a range of extent.") Link: https://lkml.kernel.org/r/20251009102349.181126-2-dmantipov@yandex.ru Signed-off-by: Dmitry Antipov Closes: https://syzkaller.appspot.com/bug?extid=727d161855d11d81e411 Reported-by: syzbot+727d161855d11d81e411@syzkaller.appspotmail.com Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/move_extents.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index bc83dc5ab5bd..ce978a2497d9 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -98,7 +98,13 @@ static int __ocfs2_move_extent(handle_t *handle, rec = &el->l_recs[index]; - BUG_ON(ext_flags != rec->e_flags); + if (ext_flags != rec->e_flags) { + ret = ocfs2_error(inode->i_sb, + "Inode %llu has corrupted extent %d with flags 0x%x at cpos %u\n", + (unsigned long long)ino, index, rec->e_flags, cpos); + goto out; + } + /* * after moving/defraging to new location, the extent is not going * to be refcounted anymore. From 6a2e57ad227ac21cbe0ed941dbedd3b81b22ce7e Mon Sep 17 00:00:00 2001 From: Justinien Bouron Date: Mon, 29 Sep 2025 09:02:20 -0700 Subject: [PATCH 006/139] kexec_core: remove superfluous page offset handling in segment loading During kexec_segment loading, when copying the content of the segment (i.e. kexec_segment::kbuf or kexec_segment::buf) to its associated pages, kimage_load_{cma,normal,crash}_segment handle the case where the physical address of the segment is not page aligned, e.g. in kimage_load_normal_segment: page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); // ... ptr = kmap_local_page(page); // ... ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); // ^^^^ Non page-aligned segments handled here ^^^ // ... if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); (similar logic is present in kimage_load_{cma,crash}_segment). This is actually not needed because, prior to their loading, all kexec_segments first go through a vetting step in `sanity_check_segment_list`, which rejects any segment that is not page-aligned: for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; // ... if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) return -EADDRNOTAVAIL; // ... } In case `sanity_check_segment_list` finds a non-page aligned the whole kexec load is aborted and no segment is loaded. This means that `kimage_load_{cma,normal,crash}_segment` never actually have to handle non page-aligned segments and `(maddr & ~PAGE_MASK) == 0` is always true no matter if the segment is coming from a file (i.e. `kexec_file_load` syscall), from a user-space buffer (i.e. `kexec_load` syscall) or created by the kernel through `kexec_add_buffer`. In the latter case, `kexec_add_buffer` actually enforces the page alignment: /* Ensure minimum alignment needed for segments. */ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); [jbouron@amazon.com: v3] Link: https://lkml.kernel.org/r/20251024155009.39502-1-jbouron@amazon.com Link: https://lkml.kernel.org/r/20250929160220.47616-1-jbouron@amazon.com Signed-off-by: Justinien Bouron Reviewed-by: Gunnar Kudrjavets Reviewed-by: Andy Shevchenko Acked-by: Baoquan He Cc: Alexander Graf Cc: Marcos Paulo de Souza Cc: Mario Limonciello Cc: Petr Mladek Cc: Yan Zhao Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index fa00b239c5d9..5ed7a2383d5d 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -742,7 +742,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) struct kexec_segment *segment = &image->segment[idx]; struct page *cma = image->segment_cma[idx]; char *ptr = page_address(cma); - unsigned long maddr; size_t ubytes, mbytes; int result = 0; unsigned char __user *buf = NULL; @@ -754,15 +753,12 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; - maddr = segment->mem; /* Then copy from source buffer to the CMA one */ while (mbytes) { size_t uchunk, mchunk; - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { @@ -784,7 +780,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) } ptr += mchunk; - maddr += mchunk; mbytes -= mchunk; cond_resched(); @@ -839,9 +834,7 @@ static int kimage_load_normal_segment(struct kimage *image, int idx) ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { @@ -904,9 +897,7 @@ static int kimage_load_crash_segment(struct kimage *image, int idx) } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap_local_page(page); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ From 08bd4c46d5e63b78e77f2605283874bbe868ab19 Mon Sep 17 00:00:00 2001 From: Zhichi Lin Date: Sat, 11 Oct 2025 16:22:22 +0800 Subject: [PATCH 007/139] scs: fix a wrong parameter in __scs_magic __scs_magic() needs a 'void *' variable, but a 'struct task_struct *' is given. 'task_scs(tsk)' is the starting address of the task's shadow call stack, and '__scs_magic(task_scs(tsk))' is the end address of the task's shadow call stack. Here should be '__scs_magic(task_scs(tsk))'. The user-visible effect of this bug is that when CONFIG_DEBUG_STACK_USAGE is enabled, the shadow call stack usage checking function (scs_check_usage) would scan an incorrect memory range. This could lead to: 1. **Inaccurate stack usage reporting**: The function would calculate wrong usage statistics for the shadow call stack, potentially showing incorrect value in kmsg. 2. **Potential kernel crash**: If the value of __scs_magic(tsk)is greater than that of __scs_magic(task_scs(tsk)), the for loop may access unmapped memory, potentially causing a kernel panic. However, this scenario is unlikely because task_struct is allocated via the slab allocator (which typically returns lower addresses), while the shadow call stack returned by task_scs(tsk) is allocated via vmalloc(which typically returns higher addresses). However, since this is purely a debugging feature (CONFIG_DEBUG_STACK_USAGE), normal production systems should be not unaffected. The bug only impacts developers and testers who are actively debugging stack usage with this configuration enabled. Link: https://lkml.kernel.org/r/20251011082222.12965-1-zhichi.lin@vivo.com Fixes: 5bbaf9d1fcb9 ("scs: Add support for stack usage debugging") Signed-off-by: Jiyuan Xie Signed-off-by: Zhichi Lin Reviewed-by: Sami Tolvanen Acked-by: Will Deacon Cc: Andrey Konovalov Cc: Kees Cook Cc: Marco Elver Cc: Will Deacon Cc: Yee Lee Cc: Signed-off-by: Andrew Morton --- kernel/scs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/scs.c b/kernel/scs.c index d7809affe740..772488afd5b9 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -135,7 +135,7 @@ static void scs_check_usage(struct task_struct *tsk) if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE)) return; - for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { + for (p = task_scs(tsk); p < __scs_magic(task_scs(tsk)); ++p) { if (!READ_ONCE_NOCHECK(*p)) break; used += sizeof(*p); From b7d06a2ae10a824f583dea85fe6e9875422eca01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Kepplinger-Novakovi=C4=87?= Date: Sat, 11 Oct 2025 15:59:39 +0000 Subject: [PATCH 008/139] mailmap: update name and email addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply my new surname, remove unused and update to currently used email addresses. Link: https://lkml.kernel.org/r/20251011155903.7442-1-martink@posteo.de Signed-off-by: Martin Kepplinger-Novaković Signed-off-by: Andrew Morton --- .mailmap | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.mailmap b/.mailmap index 369cfe467932..24ddc20f3b46 100644 --- a/.mailmap +++ b/.mailmap @@ -496,9 +496,7 @@ Mark Brown Mark Starovoytov Markus Schneider-Pargmann Mark Yao -Martin Kepplinger -Martin Kepplinger -Martin Kepplinger +Martin Kepplinger-Novakovic Martyna Szapar-Mudlaw Mathieu Othacehe Mat Martineau From 0d63fc256ad1e8b64bbb32f97232c5f2aee046c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Kepplinger-Novakovi=C4=87?= Date: Sat, 11 Oct 2025 15:59:39 +0000 Subject: [PATCH 009/139] CREDITS: update Martin's information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surname, address, email and the description changed. Apply these updates. Link: https://lkml.kernel.org/r/20251011155903.7442-3-martink@posteo.de Signed-off-by: Martin Kepplinger-Novaković Signed-off-by: Andrew Morton --- CREDITS | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/CREDITS b/CREDITS index fa5397f4ebcd..85bdc8828734 100644 --- a/CREDITS +++ b/CREDITS @@ -2056,16 +2056,15 @@ S: Korte Heul 95 S: 1403 ND BUSSUM S: The Netherlands -N: Martin Kepplinger +N: Martin Kepplinger-Novakovic E: martink@posteo.de -E: martin.kepplinger@puri.sm -W: http://www.martinkepplinger.com P: 4096R/5AB387D3 F208 2B88 0F9E 4239 3468 6E3F 5003 98DF 5AB3 87D3 D: mma8452 accelerators iio driver D: pegasus_notetaker input driver +D: imx8m media and hi846 sensor driver D: Kernel fixes and cleanups -S: Garnisonstraße 26 -S: 4020 Linz +S: Keplerstr. 6 +S: 4050 Traun S: Austria N: Karl Keyte From 02582ac3b7d2decc578bd3cef90db95c57031a42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Kepplinger-Novakovi=C4=87?= Date: Sat, 11 Oct 2025 15:59:39 +0000 Subject: [PATCH 010/139] MAINTAINERS: apply name and email address changes for Martin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update to new surname addition and currently used email address. Link: https://lkml.kernel.org/r/20251011155903.7442-2-martink@posteo.de Signed-off-by: Martin Kepplinger-Novaković Signed-off-by: Andrew Morton --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ddecf1ef3bed..9a42edbe2033 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11581,7 +11581,7 @@ T: git git://linuxtv.org/media.git F: drivers/media/i2c/hi556.c HYNIX HI846 SENSOR DRIVER -M: Martin Kepplinger +M: Martin Kepplinger-Novakovic L: linux-media@vger.kernel.org S: Maintained F: drivers/media/i2c/hi846.c @@ -15553,7 +15553,7 @@ F: include/media/imx.h MEDIA DRIVERS FOR FREESCALE IMX7/8 M: Rui Miguel Silva M: Laurent Pinchart -M: Martin Kepplinger +M: Martin Kepplinger-Novakovic R: Purism Kernel Team L: linux-media@vger.kernel.org S: Maintained From cd4eaccc00d79ab97d9a96f7922558558b13f220 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 10 Oct 2025 10:21:38 +0200 Subject: [PATCH 011/139] treewide: drop outdated compiler version remarks in Kconfig help texts As of writing, Documentation/Changes states the minimal versions of GNU C being 8.1, Clang being 15.0.0 and binutils being 2.30. A few Kconfig help texts are pointing out that specific GCC and Clang versions are needed, but by now, those pointers to versions, such later than 4.0, later than 4.4, or clang later than 5.0, are obsolete and unlikely to be found by users configuring their kernel builds anyway. Drop these outdated remarks in Kconfig help texts referring to older compiler and binutils versions. No functional change. Link: https://lkml.kernel.org/r/20251010082138.185752-1-lukas.bulwahn@redhat.com Signed-off-by: Lukas Bulwahn Cc: Bill Wendling Cc: Justin Stitt Cc: Nathan Chancellor Cc: Russel King Signed-off-by: Andrew Morton --- arch/Kconfig | 19 ++++++++----------- arch/arm/Kconfig | 2 -- lib/Kconfig.debug | 3 +-- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 61130b88964b..31220f512b16 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -232,17 +232,14 @@ config HAVE_EFFICIENT_UNALIGNED_ACCESS config ARCH_USE_BUILTIN_BSWAP bool help - Modern versions of GCC (since 4.4) have builtin functions - for handling byte-swapping. Using these, instead of the old - inline assembler that the architecture code provides in the - __arch_bswapXX() macros, allows the compiler to see what's - happening and offers more opportunity for optimisation. In - particular, the compiler will be able to combine the byteswap - with a nearby load or store and use load-and-swap or - store-and-swap instructions if the architecture has them. It - should almost *never* result in code which is worse than the - hand-coded assembler in . But just in case it - does, the use of the builtins is optional. + GCC and Clang have builtin functions for handling byte-swapping. + Using these allows the compiler to see what's happening and + offers more opportunity for optimisation. In particular, the + compiler will be able to combine the byteswap with a nearby load + or store and use load-and-swap or store-and-swap instructions if + the architecture has them. It should almost *never* result in code + which is worse than the hand-coded assembler in . + But just in case it does, the use of the builtins is optional. Any architecture with load-and-swap or store-and-swap instructions should set this. And it shouldn't hurt to set it diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4fb985b76e97..ff61891abe53 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1161,8 +1161,6 @@ config AEABI disambiguate both ABIs and allow for backward compatibility support (selected with CONFIG_OABI_COMPAT). - To use this you need GCC version 4.0.0 or later. - config OABI_COMPAT bool "Allow old ABI binaries to run with this kernel (EXPERIMENTAL)" depends on AEABI && !THUMB2_KERNEL diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3034e294d50d..e89c024dcbdf 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -332,8 +332,7 @@ config DEBUG_INFO_COMPRESSED_ZLIB depends on $(cc-option,-gz=zlib) depends on $(ld-option,--compress-debug-sections=zlib) help - Compress the debug information using zlib. Requires GCC 5.0+ or Clang - 5.0+, binutils 2.26+, and zlib. + Compress the debug information using zlib. Users of dpkg-deb via debian/rules may find an increase in size of their debug .deb packages with this config set, due to the From 2f26f58df041bbcf692730ff4d8ab0f250d9670d Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 7 Oct 2025 15:35:26 +0300 Subject: [PATCH 012/139] ocfs2: annotate flexible array members with __counted_by_le() Annotate flexible array members of 'struct ocfs2_extent_list', 'struct ocfs2_chain_list', 'struct ocfs2_truncate_log', 'struct ocfs2_dx_entry_list', 'ocfs2_refcount_list' and 'struct ocfs2_xattr_header' with '__counted_by_le()' attribute to improve array bounds checking when CONFIG_UBSAN_BOUNDS is enabled. [dmantipov@yandex.ru: fix __counted_by_le() usage in ocfs2_expand_inline_dx_root()] Link: https://lkml.kernel.org/r/20251014070324.130313-1-dmantipov@yandex.ru Link: https://lkml.kernel.org/r/20251007123526.213150-1-dmantipov@yandex.ru Signed-off-by: Dmitry Antipov Reviewed-by: Joseph Qi Reviewed-by: Heming Zhao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dir.c | 9 +++++++-- fs/ocfs2/ocfs2_fs.h | 22 ++++++++++++++-------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8c9c4825f984..48b092bc83d4 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -4104,10 +4104,15 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, } dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE; - memset(&dx_root->dr_list, 0, osb->sb->s_blocksize - - offsetof(struct ocfs2_dx_root_block, dr_list)); + + dx_root->dr_list.l_tree_depth = 0; dx_root->dr_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + dx_root->dr_list.l_next_free_rec = 0; + memset(&dx_root->dr_list.l_recs, 0, + osb->sb->s_blocksize - + (offsetof(struct ocfs2_dx_root_block, dr_list) + + offsetof(struct ocfs2_extent_list, l_recs))); /* This should never fail considering we start with an empty * dx_root. */ diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index ae0e44e5f2ad..f7763da5c4a2 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -468,7 +468,8 @@ struct ocfs2_extent_list { __le16 l_reserved1; __le64 l_reserved2; /* Pad to sizeof(ocfs2_extent_rec) */ -/*10*/ struct ocfs2_extent_rec l_recs[]; /* Extent records */ + /* Extent records */ +/*10*/ struct ocfs2_extent_rec l_recs[] __counted_by_le(l_count); }; /* @@ -482,7 +483,8 @@ struct ocfs2_chain_list { __le16 cl_count; /* Total chains in this list */ __le16 cl_next_free_rec; /* Next unused chain slot */ __le64 cl_reserved1; -/*10*/ struct ocfs2_chain_rec cl_recs[]; /* Chain records */ + /* Chain records */ +/*10*/ struct ocfs2_chain_rec cl_recs[] __counted_by_le(cl_count); }; /* @@ -494,7 +496,8 @@ struct ocfs2_truncate_log { /*00*/ __le16 tl_count; /* Total records in this log */ __le16 tl_used; /* Number of records in use */ __le32 tl_reserved1; -/*08*/ struct ocfs2_truncate_rec tl_recs[]; /* Truncate records */ + /* Truncate records */ +/*08*/ struct ocfs2_truncate_rec tl_recs[] __counted_by_le(tl_count); }; /* @@ -796,9 +799,10 @@ struct ocfs2_dx_entry_list { * possible in de_entries */ __le16 de_num_used; /* Current number of * de_entries entries */ - struct ocfs2_dx_entry de_entries[]; /* Indexed dir entries - * in a packed array of - * length de_num_used */ + /* Indexed dir entries in a packed + * array of length de_num_used. + */ + struct ocfs2_dx_entry de_entries[] __counted_by_le(de_count); }; #define OCFS2_DX_FLAG_INLINE 0x01 @@ -934,7 +938,8 @@ struct ocfs2_refcount_list { __le16 rl_used; /* Current number of used records */ __le32 rl_reserved2; __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ -/*10*/ struct ocfs2_refcount_rec rl_recs[]; /* Refcount records */ + /* Refcount records */ +/*10*/ struct ocfs2_refcount_rec rl_recs[] __counted_by_le(rl_count); }; @@ -1020,7 +1025,8 @@ struct ocfs2_xattr_header { buckets. A block uses xb_check and sets this field to zero.) */ - struct ocfs2_xattr_entry xh_entries[]; /* xattr entry list. */ + /* xattr entry list. */ + struct ocfs2_xattr_entry xh_entries[] __counted_by_le(xh_count); }; /* From 1b34743c31fef06d3728abf58126a7bcadb5ba1b Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 7 Oct 2025 12:46:26 +0300 Subject: [PATCH 013/139] ocfs2: add extra consistency check to ocfs2_dx_dir_lookup_rec() In 'ocfs2_dx_dir_lookup_rec()', check whether an extent list length of the directory indexing block matches the one configured via the superblock parameters established at mount, thus preventing an out-of-bounds accesses while iterating over the extent records below. Link: https://lkml.kernel.org/r/20251007094626.196143-1-dmantipov@yandex.ru Reported-by: syzbot+30b53487d00b4f7f0922@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=30b53487d00b4f7f0922 Signed-off-by: Dmitry Antipov Reviewed-by: Joseph Qi Reviewed-by: Heming Zhao > Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dir.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 48b092bc83d4..0fcaa0e90747 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -778,6 +778,14 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_rec *rec = NULL; + if (le16_to_cpu(el->l_count) != + ocfs2_extent_recs_per_dx_root(inode->i_sb)) { + ret = ocfs2_error(inode->i_sb, + "Inode %lu has invalid extent list length %u\n", + inode->i_ino, le16_to_cpu(el->l_count)); + goto out; + } + if (el->l_tree_depth) { ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash, &eb_bh); From 05d6f1cc2dc214c1491181be13f37d2a3a26f694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 13 Oct 2025 11:12:02 +0200 Subject: [PATCH 014/139] compiler.h: remove ARCH_SEL() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its last user was removed in commit 8ea815399c3f ("compiler: remove __ADDRESSABLE_ASM{_STR,}() again"). Link: https://lkml.kernel.org/r/20251013-arch-sel-v1-1-7eef9b22ceb0@linutronix.de Signed-off-by: Thomas Weißschuh Cc: Luc Van Oostenryck Signed-off-by: Andrew Morton --- include/linux/compiler.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5b45ea7dff3e..a9a2f8aae821 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -269,12 +269,6 @@ static inline void *offset_to_ptr(const int *off) #endif /* __ASSEMBLY__ */ -#ifdef CONFIG_64BIT -#define ARCH_SEL(a,b) a -#else -#define ARCH_SEL(a,b) b -#endif - /* * Force the compiler to emit 'sym' as a symbol, so that we can reference * it from inline assembler. Necessary in case 'sym' could be inlined From 9544f9e6947f6508d29f0d0cc2dacaa749fc1613 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 15 Oct 2025 14:36:15 +0800 Subject: [PATCH 015/139] hung_task: panic when there are more than N hung tasks at the same time The hung_task_panic sysctl is currently a blunt instrument: it's all or nothing. Panicking on a single hung task can be an overreaction to a transient glitch. A more reliable indicator of a systemic problem is when multiple tasks hang simultaneously. Extend hung_task_panic to accept an integer threshold, allowing the kernel to panic only when N hung tasks are detected in a single scan. This provides finer control to distinguish between isolated incidents and system-wide failures. The accepted values are: - 0: Don't panic (unchanged) - 1: Panic on the first hung task (unchanged) - N > 1: Panic after N hung tasks are detected in a single scan The original behavior is preserved for values 0 and 1, maintaining full backward compatibility. [lance.yang@linux.dev: new changelog] Link: https://lkml.kernel.org/r/20251015063615.2632-1-lirongqing@baidu.com Signed-off-by: Li RongQing Reviewed-by: Masami Hiramatsu (Google) Reviewed-by: Lance Yang Tested-by: Lance Yang Acked-by: Andrew Jeffery [aspeed_g5_defconfig] Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: David Hildenbrand Cc: Florian Wesphal Cc: Jakub Kacinski Cc: Jason A. Donenfeld Cc: Joel Granados Cc: Joel Stanley Cc: Jonathan Corbet Cc: Kees Cook Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Paul E . McKenney" Cc: Pawan Gupta Cc: Petr Mladek Cc: Phil Auld Cc: Randy Dunlap Cc: Russell King Cc: Shuah Khan Cc: Simon Horman Cc: Stanislav Fomichev Cc: Steven Rostedt Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 20 ++++++++++++------- Documentation/admin-guide/sysctl/kernel.rst | 9 +++++---- arch/arm/configs/aspeed_g5_defconfig | 2 +- kernel/configs/debug.config | 2 +- kernel/hung_task.c | 15 +++++++++----- lib/Kconfig.debug | 9 +++++---- .../selftests/wireguard/qemu/kernel.config | 2 +- 7 files changed, 36 insertions(+), 23 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6c42061ca20e..b8f8f5d74093 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2010,14 +2010,20 @@ the added memory block itself do not be affected. hung_task_panic= - [KNL] Should the hung task detector generate panics. - Format: 0 | 1 + [KNL] Number of hung tasks to trigger kernel panic. + Format: - A value of 1 instructs the kernel to panic when a - hung task is detected. The default value is controlled - by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time - option. The value selected by this boot parameter can - be changed later by the kernel.hung_task_panic sysctl. + When set to a non-zero value, a kernel panic will be triggered if + the number of detected hung tasks reaches this value. + + 0: don't panic + 1: panic immediately on first hung task + N: panic after N hung tasks are detected in a single scan + + The default value is controlled by the + CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time option. The value + selected by this boot parameter can be changed later by the + kernel.hung_task_panic sysctl. hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) terminal devices. Valid values: 0..8 diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index f3ee807b5d8b..0065a55bc09e 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -397,13 +397,14 @@ a hung task is detected. hung_task_panic =============== -Controls the kernel's behavior when a hung task is detected. +When set to a non-zero value, a kernel panic will be triggered if the +number of hung tasks found during a single scan reaches this value. This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled. -= ================================================= += ======================================================= 0 Continue operation. This is the default behavior. -1 Panic immediately. -= ================================================= +N Panic when N hung tasks are found during a single scan. += ======================================================= hung_task_check_count diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index 61cee1e7ebea..c3b0d5f06889 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -308,7 +308,7 @@ CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=-1 CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y -CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y +CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1 CONFIG_WQ_WATCHDOG=y # CONFIG_SCHED_DEBUG is not set CONFIG_FUNCTION_TRACER=y diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index e81327d2cd63..9f6ab7dabf67 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -83,7 +83,7 @@ CONFIG_SLUB_DEBUG_ON=y # # Debug Oops, Lockups and Hangs # -# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DETECT_HUNG_TASK=y diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b2c1f14b8129..84b4b049faa5 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -81,7 +81,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; * hung task is detected: */ static unsigned int __read_mostly sysctl_hung_task_panic = - IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); + CONFIG_BOOTPARAM_HUNG_TASK_PANIC; static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) @@ -218,8 +218,11 @@ static inline void debug_show_blocker(struct task_struct *task, unsigned long ti } #endif -static void check_hung_task(struct task_struct *t, unsigned long timeout) +static void check_hung_task(struct task_struct *t, unsigned long timeout, + unsigned long prev_detect_count) { + unsigned long total_hung_task; + if (!task_is_hung(t, timeout)) return; @@ -229,9 +232,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) */ sysctl_hung_task_detect_count++; + total_hung_task = sysctl_hung_task_detect_count - prev_detect_count; trace_sched_process_hang(t); - if (sysctl_hung_task_panic) { + if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) { console_verbose(); hung_task_show_lock = true; hung_task_call_panic = true; @@ -300,6 +304,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) int max_count = sysctl_hung_task_check_count; unsigned long last_break = jiffies; struct task_struct *g, *t; + unsigned long prev_detect_count = sysctl_hung_task_detect_count; /* * If the system crashed already then all bets are off, @@ -320,7 +325,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) last_break = jiffies; } - check_hung_task(t, timeout); + check_hung_task(t, timeout, prev_detect_count); } unlock: rcu_read_unlock(); @@ -389,7 +394,7 @@ static const struct ctl_table hung_task_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "hung_task_check_count", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e89c024dcbdf..19592a57e1ed 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1257,12 +1257,13 @@ config DEFAULT_HUNG_TASK_TIMEOUT Keeping the default should be fine in most cases. config BOOTPARAM_HUNG_TASK_PANIC - bool "Panic (Reboot) On Hung Tasks" + int "Number of hung tasks to trigger kernel panic" depends on DETECT_HUNG_TASK + default 0 help - Say Y here to enable the kernel to panic on "hung tasks", - which are bugs that cause the kernel to leave a task stuck - in uninterruptible "D" state. + When set to a non-zero value, a kernel panic will be triggered + if the number of hung tasks found during a single scan reaches + this value. The panic can be used in combination with panic_timeout, to cause the system to reboot automatically after a diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 936b18be07cf..0504c11c2de6 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -81,7 +81,7 @@ CONFIG_WQ_WATCHDOG=y CONFIG_DETECT_HUNG_TASK=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y -CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y +CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1 CONFIG_PANIC_TIMEOUT=-1 CONFIG_STACKTRACE=y CONFIG_EARLY_PRINTK=y From 57f3d89691149f11bfb6c4fef9fca4890def8fb1 Mon Sep 17 00:00:00 2001 From: Ankan Biswas Date: Tue, 14 Oct 2025 10:54:36 +0530 Subject: [PATCH 016/139] lib/xz: remove dead IA-64 (Itanium) support code Support for the IA-64 (Itanium) architecture was removed in commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"). This patch drops the IA-64 specific decompression code from lib/xz, which was conditionally compiled with the now-obsolete CONFIG_XZ_DEC_IA64 option. Link: https://lkml.kernel.org/r/20251014052738.31185-1-spyjetfayed@gmail.com Signed-off-by: Ankan Biswas Reviewed-by: Kuan-Wei Chiu Reviewed-by: Khalid Aziz Acked-by: Lasse Collin Cc: David Hunter Cc: Shuah Khan Signed-off-by: Andrew Morton --- lib/xz/xz_dec_bcj.c | 95 --------------------------------------------- lib/xz/xz_private.h | 4 -- 2 files changed, 99 deletions(-) diff --git a/lib/xz/xz_dec_bcj.c b/lib/xz/xz_dec_bcj.c index 8237db17eee3..610d58d947ab 100644 --- a/lib/xz/xz_dec_bcj.c +++ b/lib/xz/xz_dec_bcj.c @@ -20,7 +20,6 @@ struct xz_dec_bcj { enum { BCJ_X86 = 4, /* x86 or x86-64 */ BCJ_POWERPC = 5, /* Big endian only */ - BCJ_IA64 = 6, /* Big or little endian */ BCJ_ARM = 7, /* Little endian only */ BCJ_ARMTHUMB = 8, /* Little endian only */ BCJ_SPARC = 9, /* Big or little endian */ @@ -180,92 +179,6 @@ static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) } #endif -#ifdef XZ_DEC_IA64 -static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) -{ - static const uint8_t branch_table[32] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 4, 4, 6, 6, 0, 0, 7, 7, - 4, 4, 0, 0, 4, 4, 0, 0 - }; - - /* - * The local variables take a little bit stack space, but it's less - * than what LZMA2 decoder takes, so it doesn't make sense to reduce - * stack usage here without doing that for the LZMA2 decoder too. - */ - - /* Loop counters */ - size_t i; - size_t j; - - /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */ - uint32_t slot; - - /* Bitwise offset of the instruction indicated by slot */ - uint32_t bit_pos; - - /* bit_pos split into byte and bit parts */ - uint32_t byte_pos; - uint32_t bit_res; - - /* Address part of an instruction */ - uint32_t addr; - - /* Mask used to detect which instructions to convert */ - uint32_t mask; - - /* 41-bit instruction stored somewhere in the lowest 48 bits */ - uint64_t instr; - - /* Instruction normalized with bit_res for easier manipulation */ - uint64_t norm; - - size &= ~(size_t)15; - - for (i = 0; i < size; i += 16) { - mask = branch_table[buf[i] & 0x1F]; - for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) { - if (((mask >> slot) & 1) == 0) - continue; - - byte_pos = bit_pos >> 3; - bit_res = bit_pos & 7; - instr = 0; - for (j = 0; j < 6; ++j) - instr |= (uint64_t)(buf[i + j + byte_pos]) - << (8 * j); - - norm = instr >> bit_res; - - if (((norm >> 37) & 0x0F) == 0x05 - && ((norm >> 9) & 0x07) == 0) { - addr = (norm >> 13) & 0x0FFFFF; - addr |= ((uint32_t)(norm >> 36) & 1) << 20; - addr <<= 4; - addr -= s->pos + (uint32_t)i; - addr >>= 4; - - norm &= ~((uint64_t)0x8FFFFF << 13); - norm |= (uint64_t)(addr & 0x0FFFFF) << 13; - norm |= (uint64_t)(addr & 0x100000) - << (36 - 20); - - instr &= (1 << bit_res) - 1; - instr |= norm << bit_res; - - for (j = 0; j < 6; j++) - buf[i + j + byte_pos] - = (uint8_t)(instr >> (8 * j)); - } - } - } - - return i; -} -#endif - #ifdef XZ_DEC_ARM static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { @@ -509,11 +422,6 @@ static void bcj_apply(struct xz_dec_bcj *s, filtered = bcj_powerpc(s, buf, size); break; #endif -#ifdef XZ_DEC_IA64 - case BCJ_IA64: - filtered = bcj_ia64(s, buf, size); - break; -#endif #ifdef XZ_DEC_ARM case BCJ_ARM: filtered = bcj_arm(s, buf, size); @@ -699,9 +607,6 @@ enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) #ifdef XZ_DEC_POWERPC case BCJ_POWERPC: #endif -#ifdef XZ_DEC_IA64 - case BCJ_IA64: -#endif #ifdef XZ_DEC_ARM case BCJ_ARM: #endif diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index 8409784b1639..6775078f3cce 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -24,9 +24,6 @@ # ifdef CONFIG_XZ_DEC_POWERPC # define XZ_DEC_POWERPC # endif -# ifdef CONFIG_XZ_DEC_IA64 -# define XZ_DEC_IA64 -# endif # ifdef CONFIG_XZ_DEC_ARM # define XZ_DEC_ARM # endif @@ -103,7 +100,6 @@ */ #ifndef XZ_DEC_BCJ # if defined(XZ_DEC_X86) || defined(XZ_DEC_POWERPC) \ - || defined(XZ_DEC_IA64) \ || defined(XZ_DEC_ARM) || defined(XZ_DEC_ARMTHUMB) \ || defined(XZ_DEC_SPARC) || defined(XZ_DEC_ARM64) \ || defined(XZ_DEC_RISCV) From 01ef0296d2eb550511334212650011446c223da0 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Tue, 14 Oct 2025 13:07:47 +0800 Subject: [PATCH 017/139] .mailmap: add entry for WangYuli Map my old, obsolete work email address to my current email address. My current work email may not be ideal for timely communication, as it requires a secure network environment for access due to security policies. Therefore, associate both my previous and current work email addresses with an email address provided to me by AOSC Linux community. During work hours, my commits will likely still be authored using my company email address. Link: https://lkml.kernel.org/r/20251014050747.527357-1-wangyuli@aosc.io Signed-off-by: WangYuli Signed-off-by: WangYuli Cc: Carlos Bilbao Cc: Jarkko Sakkinen Cc: Shannon Nelson Signed-off-by: Andrew Morton --- .mailmap | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.mailmap b/.mailmap index 24ddc20f3b46..a72b618cf8ce 100644 --- a/.mailmap +++ b/.mailmap @@ -842,6 +842,9 @@ Vivien Didelot Vlad Dogaru Vladimir Davydov Vladimir Davydov +WangYuli +WangYuli +WangYuli Weiwen Hu WeiXiong Liao Wen Gong From adc15829fb73e402903b7030729263b6ee4a7232 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Thu, 16 Oct 2025 19:58:31 +0530 Subject: [PATCH 018/139] crash: let architecture decide crash memory export to iomem_resource With the generic crashkernel reservation, the kernel emits the following warning on powerpc: WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/mem.c:341 add_system_ram_resources+0xfc/0x180 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-auto-12607-g5472d60c129f #1 VOLUNTARY Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.01 (NH1110_069) hv:phyp pSeries NIP: c00000000201de3c LR: c00000000201de34 CTR: 0000000000000000 REGS: c000000127cef8a0 TRAP: 0700 Not tainted (6.17.0-auto-12607-g5472d60c129f) MSR: 8000000002029033 CR: 84000840 XER: 20040010 CFAR: c00000000017eed0 IRQMASK: 0 GPR00: c00000000201de34 c000000127cefb40 c0000000016a8100 0000000000000001 GPR04: c00000012005aa00 0000000020000000 c000000002b705c8 0000000000000000 GPR08: 000000007fffffff fffffffffffffff0 c000000002db8100 000000011fffffff GPR12: c00000000201dd40 c000000002ff0000 c0000000000112bc 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 c0000000015a3808 GPR24: c00000000200468c c000000001699888 0000000000000106 c0000000020d1950 GPR28: c0000000014683f8 0000000081000200 c0000000015c1868 c000000002b9f710 NIP [c00000000201de3c] add_system_ram_resources+0xfc/0x180 LR [c00000000201de34] add_system_ram_resources+0xf4/0x180 Call Trace: add_system_ram_resources+0xf4/0x180 (unreliable) do_one_initcall+0x60/0x36c do_initcalls+0x120/0x220 kernel_init_freeable+0x23c/0x390 kernel_init+0x34/0x26c ret_from_kernel_user_thread+0x14/0x1c This warning occurs due to a conflict between crashkernel and System RAM iomem resources. The generic crashkernel reservation adds the crashkernel memory range to /proc/iomem during early initialization. Later, all memblock ranges are added to /proc/iomem as System RAM. If the crashkernel region overlaps with any memblock range, it causes a conflict while adding those memblock regions as iomem resources, triggering the above warning. The conflicting memblock regions are then omitted from /proc/iomem. For example, if the following crashkernel region is added to /proc/iomem: 20000000-11fffffff : Crash kernel then the following memblock regions System RAM regions fail to be inserted: 00000000-7fffffff : System RAM 80000000-257fffffff : System RAM Fix this by not adding the crashkernel memory to /proc/iomem on powerpc. Introduce an architecture hook to let each architecture decide whether to export the crashkernel region to /proc/iomem. For more info checkout commit c40dd2f766440 ("powerpc: Add System RAM to /proc/iomem") and commit bce074bdbc36 ("powerpc: insert System RAM resource to prevent crashkernel conflict") Note: Before switching to the generic crashkernel reservation, powerpc never exported the crashkernel region to /proc/iomem. Link: https://lkml.kernel.org/r/20251016142831.144515-1-sourabhjain@linux.ibm.com Fixes: e3185ee438c2 ("powerpc/crash: use generic crashkernel reservation"). Signed-off-by: Sourabh Jain Reported-by: Venkat Rao Bagalkote Closes: https://lore.kernel.org/all/90937fe0-2e76-4c82-b27e-7b8a7fe3ac69@linux.ibm.com/ Tested-by: Venkat Rao Bagalkote Cc: Baoquan he Cc: Hari Bathini Cc: Madhavan Srinivasan Cc: Mahesh Salgaonkar Cc: Michael Ellerman Cc: Ritesh Harjani (IBM) Cc: Vivek Goyal Cc: Dave Young Cc: Mike Rapoport Cc: Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/crash_reserve.h | 8 ++++++++ include/linux/crash_reserve.h | 6 ++++++ kernel/crash_reserve.c | 3 +++ 3 files changed, 17 insertions(+) diff --git a/arch/powerpc/include/asm/crash_reserve.h b/arch/powerpc/include/asm/crash_reserve.h index 6467ce29b1fa..d1b570ddbf98 100644 --- a/arch/powerpc/include/asm/crash_reserve.h +++ b/arch/powerpc/include/asm/crash_reserve.h @@ -5,4 +5,12 @@ /* crash kernel regions are Page size agliged */ #define CRASH_ALIGN PAGE_SIZE +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +static inline bool arch_add_crash_res_to_iomem(void) +{ + return false; +} +#define arch_add_crash_res_to_iomem arch_add_crash_res_to_iomem +#endif + #endif /* _ASM_POWERPC_CRASH_RESERVE_H */ diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h index 7b44b41d0a20..f0dc03d94ca2 100644 --- a/include/linux/crash_reserve.h +++ b/include/linux/crash_reserve.h @@ -32,6 +32,12 @@ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, void __init reserve_crashkernel_cma(unsigned long long cma_size); #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef arch_add_crash_res_to_iomem +static inline bool arch_add_crash_res_to_iomem(void) +{ + return true; +} +#endif #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) #endif diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 87bf4d41eabb..62e60e0223cf 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -524,6 +524,9 @@ void __init reserve_crashkernel_cma(unsigned long long cma_size) #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { + if (!arch_add_crash_res_to_iomem()) + return 0; + if (crashk_res.start < crashk_res.end) insert_resource(&iomem_resource, &crashk_res); From ed4bbe7e8fa186b24c61aa22a32885d5de0fe1a4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Oct 2025 15:16:26 -0700 Subject: [PATCH 019/139] taint: add reminder about updating docs and scripts Sometimes people update taint-related pieces of the kernel without updating the supporting documentation or scripts. Add a reminder to do this. Link: https://lkml.kernel.org/r/20251015221626.1126156-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Cc: David Gow Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- kernel/panic.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/panic.c b/kernel/panic.c index 24cc3eec1805..ec59cade1f83 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -638,6 +638,12 @@ EXPORT_SYMBOL(panic); /* * TAINT_FORCED_RMMOD could be a per-module flag but the module * is being removed anyway. + * + * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT, + * please also modify tools/debugging/kernel-chktaint and + * Documentation/admin-guide/tainted-kernels.rst, including its + * small shell script that prints the TAINT_FLAGS_COUNT bits of + * /proc/sys/kernel/tainted. */ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G', true), From 37ade54f386c829597f74b54bad335c12bd2a698 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 22 Oct 2025 10:28:04 +0200 Subject: [PATCH 020/139] taint/module: remove unnecessary taint_flag.module field The TAINT_RANDSTRUCT and TAINT_FWCTL flags are mistakenly set in the taint_flags table as per-module flags. While this can be trivially corrected, the issue can be avoided altogether by removing the taint_flag.module field. This is possible because, since commit 7fd8329ba502 ("taint/module: Clean up global and module taint flags handling") in 2016, the handling of module taint flags has been fully generic. Specifically, module_flags_taint() can print all flags, and the required output buffer size is properly defined in terms of TAINT_FLAGS_COUNT. The actual per-module flags are always those added to module.taints by calls to add_taint_module(). Link: https://lkml.kernel.org/r/20251022082938.26670-1-petr.pavlu@suse.com Signed-off-by: Petr Pavlu Acked-by: Petr Mladek Reviewed-by: Randy Dunlap Cc: Aaron Tomlin Cc: Luis Chamberalin Cc: Petr Pavlu Cc: Sami Tolvanen Signed-off-by: Andrew Morton --- include/linux/panic.h | 1 - kernel/module/main.c | 2 +- kernel/panic.c | 46 ++++++++++++++++++++----------------------- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/include/linux/panic.h b/include/linux/panic.h index 6f972a66c13e..a00bc0937698 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -86,7 +86,6 @@ static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout) struct taint_flag { char c_true; /* character printed when tainted */ char c_false; /* character printed when not tainted */ - bool module; /* also show as a per-module taint flag */ const char *desc; /* verbose description of the set taint flag */ }; diff --git a/kernel/module/main.c b/kernel/module/main.c index c66b26184936..6f219751df7e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -954,7 +954,7 @@ size_t module_flags_taint(unsigned long taints, char *buf) int i; for (i = 0; i < TAINT_FLAGS_COUNT; i++) { - if (taint_flags[i].module && test_bit(i, &taints)) + if (test_bit(i, &taints)) buf[l++] = taint_flags[i].c_true; } diff --git a/kernel/panic.c b/kernel/panic.c index ec59cade1f83..ffceb6f13935 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -628,17 +628,13 @@ void panic(const char *fmt, ...) } EXPORT_SYMBOL(panic); -#define TAINT_FLAG(taint, _c_true, _c_false, _module) \ +#define TAINT_FLAG(taint, _c_true, _c_false) \ [ TAINT_##taint ] = { \ .c_true = _c_true, .c_false = _c_false, \ - .module = _module, \ .desc = #taint, \ } /* - * TAINT_FORCED_RMMOD could be a per-module flag but the module - * is being removed anyway. - * * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT, * please also modify tools/debugging/kernel-chktaint and * Documentation/admin-guide/tainted-kernels.rst, including its @@ -646,26 +642,26 @@ EXPORT_SYMBOL(panic); * /proc/sys/kernel/tainted. */ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G', true), - TAINT_FLAG(FORCED_MODULE, 'F', ' ', true), - TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' ', false), - TAINT_FLAG(FORCED_RMMOD, 'R', ' ', false), - TAINT_FLAG(MACHINE_CHECK, 'M', ' ', false), - TAINT_FLAG(BAD_PAGE, 'B', ' ', false), - TAINT_FLAG(USER, 'U', ' ', false), - TAINT_FLAG(DIE, 'D', ' ', false), - TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' ', false), - TAINT_FLAG(WARN, 'W', ' ', false), - TAINT_FLAG(CRAP, 'C', ' ', true), - TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' ', false), - TAINT_FLAG(OOT_MODULE, 'O', ' ', true), - TAINT_FLAG(UNSIGNED_MODULE, 'E', ' ', true), - TAINT_FLAG(SOFTLOCKUP, 'L', ' ', false), - TAINT_FLAG(LIVEPATCH, 'K', ' ', true), - TAINT_FLAG(AUX, 'X', ' ', true), - TAINT_FLAG(RANDSTRUCT, 'T', ' ', true), - TAINT_FLAG(TEST, 'N', ' ', true), - TAINT_FLAG(FWCTL, 'J', ' ', true), + TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'), + TAINT_FLAG(FORCED_MODULE, 'F', ' '), + TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' '), + TAINT_FLAG(FORCED_RMMOD, 'R', ' '), + TAINT_FLAG(MACHINE_CHECK, 'M', ' '), + TAINT_FLAG(BAD_PAGE, 'B', ' '), + TAINT_FLAG(USER, 'U', ' '), + TAINT_FLAG(DIE, 'D', ' '), + TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' '), + TAINT_FLAG(WARN, 'W', ' '), + TAINT_FLAG(CRAP, 'C', ' '), + TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' '), + TAINT_FLAG(OOT_MODULE, 'O', ' '), + TAINT_FLAG(UNSIGNED_MODULE, 'E', ' '), + TAINT_FLAG(SOFTLOCKUP, 'L', ' '), + TAINT_FLAG(LIVEPATCH, 'K', ' '), + TAINT_FLAG(AUX, 'X', ' '), + TAINT_FLAG(RANDSTRUCT, 'T', ' '), + TAINT_FLAG(TEST, 'N', ' '), + TAINT_FLAG(FWCTL, 'J', ' '), }; #undef TAINT_FLAG From aa5b6a72ccd9fad2d2ed875631f6aca6b0633d80 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 13 Oct 2025 13:37:09 +0300 Subject: [PATCH 021/139] ocfs2: add directory size check to ocfs2_find_dir_space_id() Fix a null-pointer-deref which was detected by UBSAN: KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 0 UID: 0 PID: 5317 Comm: syz-executor310 Not tainted 6.15.0-syzkaller-12141-gec7714e49479 #0 PREEMPT(full) In 'ocfs2_find_dir_space_id()', add extra check whether the directory data block is large enough to hold at least one directory entry, and raise 'ocfs2_error()' if the former is unexpectedly small. Link: https://lkml.kernel.org/r/20251013103709.146001-1-dmantipov@yandex.ru Reported-by: syzbot+ded9116588a7b73c34bc@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=ded9116588a7b73c34bc Signed-off-by: Dmitry Antipov Reviewed-by: Heming Zhao Cc: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dir.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 0fcaa0e90747..dc3d66263335 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3431,6 +3431,14 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, offset += le16_to_cpu(de->rec_len); } + if (!last_de) { + ret = ocfs2_error(sb, "Directory entry (#%llu: size=%lld) " + "is unexpectedly short", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + i_size_read(dir)); + goto out; + } + /* * We're going to require expansion of the directory - figure * out how many blocks we'll need so that a place for the From 7229d74e5e8c1f140529d405c88d4493e37ce4e3 Mon Sep 17 00:00:00 2001 From: Vlad Kulikov Date: Tue, 21 Oct 2025 21:13:39 +0300 Subject: [PATCH 022/139] ipc: create_ipc_ns: drop mqueue mount on sysctl setup failure If setup_mq_sysctls(ns) fails after mq_init_ns(ns) succeeds, the error path skipped releasing the internal kernel mqueue mount kept in ns->mq_mnt. That leaves the vfsmount/superblock referenced until final namespace teardown, i.e. a resource leak on this rare failure edge. Unwind it by calling mntput(ns->mq_mnt) before dropping user_ns and freeing the IPC namespace. This mirrors the normal ordering used in free_ipc_ns(). Link: https://lkml.kernel.org/r/20251021181341.670297-1-vlad_kulikov_c@pm.me Signed-off-by: Vlad Kulikov Reviewed-by: Jan Kara Cc: Aleksa Sarai Cc: Christian Brauner Cc: David Hildenbrand Cc: Ma Wupeng Signed-off-by: Andrew Morton --- ipc/namespace.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ipc/namespace.c b/ipc/namespace.c index 59b12fcb40bd..cf62d11a09b9 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -75,10 +75,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, err = -ENOMEM; if (!setup_mq_sysctls(ns)) - goto fail_put; + goto fail_mq_mount; if (!setup_ipc_sysctls(ns)) - goto fail_mq; + goto fail_mq_sysctls; err = msg_init_ns(ns); if (err) @@ -92,9 +92,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, fail_ipc: retire_ipc_sysctls(ns); -fail_mq: +fail_mq_sysctls: retire_mq_sysctls(ns); - +fail_mq_mount: + mntput(ns->mq_mnt); fail_put: put_user_ns(ns->user_ns); ns_common_free(ns); From 032a730268a3e88a1df1df1bafd9af64a460822e Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 23 Oct 2025 11:33:05 -0700 Subject: [PATCH 023/139] init/main.c: wrap long kernel cmdline when printing to logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel cmdline length is allowed to be longer than what printk can handle. When this happens the cmdline that's printed to the kernel ring buffer at bootup is cutoff and some kernel cmdline options are "hidden" from the logs. This undercuts the usefulness of the log message. Specifically, grepping for COMMAND_LINE_SIZE shows that 2048 is common and some architectures even define it as 4096. s390 allows a CONFIG-based maximum up to 1MB (though it's not expected that anyone will go over the default max of 4096 [1]). The maximum message pr_notice() seems to be able to handle (based on experiment) is 1021 characters. This appears to be based on the current value of PRINTKRB_RECORD_MAX as 1024 and the fact that pr_notice() spends 2 characters on the loglevel prefix and we have a '\n' at the end. While it would be possible to increase the limits of printk() (and therefore pr_notice()) somewhat, it doesn't appear possible to increase it enough to fully include a 2048-character cmdline without breaking userspace. Specifically on at least two tested userspaces (ChromeOS plus the Debian-based distro I'm typing this message on) the `dmesg` tool reads lines from `/dev/kmsg` in 2047-byte chunks. As per `Documentation/ABI/testing/dev-kmsg`: Every read() from the opened device node receives one record of the kernel's printk buffer. ... Messages in the record ring buffer get overwritten as whole, there are never partial messages received by read(). We simply can't fit a 2048-byte cmdline plus the "Kernel command line:" prefix plus info about time/log_level/etc in a 2047-byte read. The above means that if we want to avoid the truncation we need to do some type of wrapping of the cmdline when printing. Add wrapping to the printout of the kernel command line. By default, the wrapping is set to 1021 characters to avoid breaking anyone, but allow wrapping to be set lower by a Kconfig knob "CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN". Any tools that are correctly parsing the cmdline today (because it is less than 1021 characters) will see no difference in their behavior. The format of wrapped output is designed to be matched by anyone using "grep" to search for the cmdline and also to be easy for tools to handle. Anyone who is sure their tools (if any) handle the wrapped format can choose a lower wrapping value and have prettier output. Setting CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN to 0 fully disables the wrapping logic. This means that long command lines will be truncated again, but this config could be set if command lines are expected to be long and userspace is known not to handle parsing logs with the wrapping. Wrapping is based on spaces, ignoring quotes. All lines are prefixed with "Kernel command line: " and lines that are not the last line have a " \" suffix added to them. The prefix and suffix count towards the line length for wrapping purposes. The ideal length will be exceeded if no appropriate place to wrap is found. The wrapping function added here is fairly generic and could be made a library function (somewhat like print_hex_dump()) if it's needed elsewhere in the kernel. However, having printk() directly incorporate this wrapping would be unlikely to be a good idea since it would break printouts into more than one record without any obvious common line prefix to tie lines together. It would also be extra overhead when, in general, kernel log message should simply be kept smaller than 1021 bytes. For some discussion on this topic, see responses to the v1 posting of this patch [2]. [akpm@linux-foundation.org: make print_kernel_cmdline __init] [dianders@chromium.org: v4] Link: https://lkml.kernel.org/r/20251027082204.v4.1.I095f1e2c6c27f9f4de0b4841f725f356c643a13f@changeid Link: https://lkml.kernel.org/r/20251023113257.v3.1.I095f1e2c6c27f9f4de0b4841f725f356c643a13f@changeid Link: https://lore.kernel.org/r/20251021131633.26700Dd6-hca@linux.ibm.com [1] Link: https://lore.kernel.org/r/CAD=FV=VNyt1zG_8pS64wgV8VkZWiWJymnZ-XCfkrfaAhhFSKcA@mail.gmail.com [2] Signed-off-by: Douglas Anderson Tested-by: Paul E. McKenney Cc: Alexander Shishkin Cc: Andrew Chant Cc: Brian Gerst Cc: Christian Brauner Cc: Francesco Valla Cc: Geert Uytterhoeven Cc: guoweikang Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Hendrik Farr Cc: Jeff Xu Cc: Kees Cook Cc: Masahiro Yamada Cc: Michal Koutný Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Shakeel Butt Cc: Sven Schnelle Cc: Tejun Heo Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- init/Kconfig | 18 ++++++++++ init/main.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index cab3ad28ca49..56a5dec1fdfc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1512,6 +1512,24 @@ config BOOT_CONFIG_EMBED_FILE This bootconfig will be used if there is no initrd or no other bootconfig in the initrd. +config CMDLINE_LOG_WRAP_IDEAL_LEN + int "Length to try to wrap the cmdline when logged at boot" + default 1021 + range 0 1021 + help + At boot time, the kernel command line is logged to the console. + The log message will start with the prefix "Kernel command line: ". + The log message will attempt to be wrapped (split into multiple log + messages) at spaces based on CMDLINE_LOG_WRAP_IDEAL_LEN characters. + If wrapping happens, each log message will start with the prefix and + all but the last message will end with " \". Messages may exceed the + ideal length if a place to wrap isn't found before the specified + number of characters. + + A value of 0 disables wrapping, though be warned that the maximum + length of a log message (1021 characters) may cause the cmdline to + be truncated. + config INITRAMFS_PRESERVE_MTIME bool "Preserve cpio archive mtimes in initramfs" depends on BLK_DEV_INITRD diff --git a/init/main.c b/init/main.c index 07a3116811c5..b84818ad9685 100644 --- a/init/main.c +++ b/init/main.c @@ -906,6 +906,101 @@ static void __init early_numa_node_init(void) #endif } +#define KERNEL_CMDLINE_PREFIX "Kernel command line: " +#define KERNEL_CMDLINE_PREFIX_LEN (sizeof(KERNEL_CMDLINE_PREFIX) - 1) +#define KERNEL_CMDLINE_CONTINUATION " \\" +#define KERNEL_CMDLINE_CONTINUATION_LEN (sizeof(KERNEL_CMDLINE_CONTINUATION) - 1) + +#define MIN_CMDLINE_LOG_WRAP_IDEAL_LEN (KERNEL_CMDLINE_PREFIX_LEN + \ + KERNEL_CMDLINE_CONTINUATION_LEN) +#define CMDLINE_LOG_WRAP_IDEAL_LEN (CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN > \ + MIN_CMDLINE_LOG_WRAP_IDEAL_LEN ? \ + CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN : \ + MIN_CMDLINE_LOG_WRAP_IDEAL_LEN) + +#define IDEAL_CMDLINE_LEN (CMDLINE_LOG_WRAP_IDEAL_LEN - KERNEL_CMDLINE_PREFIX_LEN) +#define IDEAL_CMDLINE_SPLIT_LEN (IDEAL_CMDLINE_LEN - KERNEL_CMDLINE_CONTINUATION_LEN) + +/** + * print_kernel_cmdline() - Print the kernel cmdline with wrapping. + * @cmdline: The cmdline to print. + * + * Print the kernel command line, trying to wrap based on the Kconfig knob + * CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN. + * + * Wrapping is based on spaces, ignoring quotes. All lines are prefixed + * with "Kernel command line: " and lines that are not the last line have + * a " \" suffix added to them. The prefix and suffix count towards the + * line length for wrapping purposes. The ideal length will be exceeded + * if no appropriate place to wrap is found. + * + * Example output if CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN is 40: + * Kernel command line: loglevel=7 \ + * Kernel command line: init=/sbin/init \ + * Kernel command line: root=PARTUUID=8c3efc1a-768b-6642-8d0c-89eb782f19f0/PARTNROFF=1 \ + * Kernel command line: rootwait ro \ + * Kernel command line: my_quoted_arg="The \ + * Kernel command line: quick brown fox \ + * Kernel command line: jumps over the \ + * Kernel command line: lazy dog." + */ +static void __init print_kernel_cmdline(const char *cmdline) +{ + size_t len; + + /* Config option of 0 or anything longer than the max disables wrapping */ + if (CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN == 0 || + IDEAL_CMDLINE_LEN >= COMMAND_LINE_SIZE - 1) { + pr_notice("%s%s\n", KERNEL_CMDLINE_PREFIX, cmdline); + return; + } + + len = strlen(cmdline); + while (len > IDEAL_CMDLINE_LEN) { + const char *first_space; + const char *prev_cutoff; + const char *cutoff; + int to_print; + size_t used; + + /* Find the last ' ' that wouldn't make the line too long */ + prev_cutoff = NULL; + cutoff = cmdline; + while (true) { + cutoff = strchr(cutoff + 1, ' '); + if (!cutoff || cutoff - cmdline > IDEAL_CMDLINE_SPLIT_LEN) + break; + prev_cutoff = cutoff; + } + if (prev_cutoff) + cutoff = prev_cutoff; + else if (!cutoff) + break; + + /* Find the beginning and end of the string of spaces */ + first_space = cutoff; + while (first_space > cmdline && first_space[-1] == ' ') + first_space--; + to_print = first_space - cmdline; + while (*cutoff == ' ') + cutoff++; + used = cutoff - cmdline; + + /* If the whole string is used, break and do the final printout */ + if (len == used) + break; + + if (to_print) + pr_notice("%s%.*s%s\n", KERNEL_CMDLINE_PREFIX, + to_print, cmdline, KERNEL_CMDLINE_CONTINUATION); + + len -= used; + cmdline += used; + } + if (len) + pr_notice("%s%s\n", KERNEL_CMDLINE_PREFIX, cmdline); +} + asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector void start_kernel(void) { @@ -942,7 +1037,7 @@ void start_kernel(void) early_numa_node_init(); boot_cpu_hotplug_init(); - pr_notice("Kernel command line: %s\n", saved_command_line); + print_kernel_cmdline(saved_command_line); /* parameters may set static keys */ parse_early_param(); after_dashes = parse_args("Booting kernel", From d99dc586ca7c7729450af2ed39ca1483c0eb7b5c Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Thu, 23 Oct 2025 13:16:06 -0400 Subject: [PATCH 024/139] uaccess: decouple INLINE_COPY_FROM_USER and CONFIG_RUST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1f9a8286bc0c ("uaccess: always export _copy_[from|to]_user with CONFIG_RUST") exports _copy_{from,to}_user() unconditionally, if RUST is enabled. This pollutes exported symbols namespace, and spreads RUST ifdefery in core files. It's better to declare a corresponding helper under the rust/helpers, similarly to how non-underscored copy_{from,to}_user() is handled. [yury.norov@gmail.com: drop rust part of comment for _copy_from_user(), per Alice] Link: https://lkml.kernel.org/r/20251024154754.99768-1-yury.norov@gmail.com Link: https://lkml.kernel.org/r/20251023171607.1171534-1-yury.norov@gmail.com Signed-off-by: Yury Norov (NVIDIA) Acked-by: Arnd Bergmann Acked-by: Miguel Ojeda Reviewed-by: Alice Ryhl Tested-by: Alice Ryhl Cc: Alex Gaynor Cc: Andreas Hindborg Cc: Björn Roy Baron Cc: Boqun Feng Cc: Danilo Krummrich Cc: Gary Guo Cc: John Hubbard Cc: Trevor Gross Signed-off-by: Andrew Morton --- include/linux/uaccess.h | 2 -- lib/usercopy.c | 4 ++-- rust/helpers/uaccess.c | 12 ++++++++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1beb5b395d81..01cbd7dd0ba3 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -152,8 +152,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) * directly in the normal copy_to/from_user(), the other ones go * through an extern _copy_to/from_user(), which expands the same code * here. - * - * Rust code always uses the extern definition. */ static inline __must_check unsigned long _inline_copy_from_user(void *to, const void __user *from, unsigned long n) diff --git a/lib/usercopy.c b/lib/usercopy.c index 7b17b83c8042..b00a3a957de6 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -12,7 +12,7 @@ /* out-of-line parts */ -#if !defined(INLINE_COPY_FROM_USER) || defined(CONFIG_RUST) +#if !defined(INLINE_COPY_FROM_USER) unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { return _inline_copy_from_user(to, from, n); @@ -20,7 +20,7 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n EXPORT_SYMBOL(_copy_from_user); #endif -#if !defined(INLINE_COPY_TO_USER) || defined(CONFIG_RUST) +#if !defined(INLINE_COPY_TO_USER) unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { return _inline_copy_to_user(to, from, n); diff --git a/rust/helpers/uaccess.c b/rust/helpers/uaccess.c index f49076f813cd..4629b2d15529 100644 --- a/rust/helpers/uaccess.c +++ b/rust/helpers/uaccess.c @@ -13,3 +13,15 @@ unsigned long rust_helper_copy_to_user(void __user *to, const void *from, { return copy_to_user(to, from, n); } + +#ifdef INLINE_COPY_FROM_USER +unsigned long rust_helper__copy_from_user(void *to, const void __user *from, unsigned long n) +{ + return _inline_copy_from_user(to, from, n); +} + +unsigned long rust_helper__copy_to_user(void __user *to, const void *from, unsigned long n) +{ + return _inline_copy_to_user(to, from, n); +} +#endif From 390ac56cf0f687de53695648bc6f2259a7eae429 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 13 Oct 2025 09:28:26 +0300 Subject: [PATCH 025/139] ocfs2: add boundary check to ocfs2_check_dir_entry() In 'ocfs2_check_dir_entry()', add extra check whether at least the smallest possible dirent may be located at the specified offset within bh's data, thus preventing an out-of-bounds accesses below. Link: https://lkml.kernel.org/r/20251013062826.122586-1-dmantipov@yandex.ru Signed-off-by: Dmitry Antipov Reported-by: syzbot+b20bbf680bb0f2ecedae@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b20bbf680bb0f2ecedae Reviewed-by: Heming Zhao Cc: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dir.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index dc3d66263335..2785ff245e79 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -302,8 +302,21 @@ static int ocfs2_check_dir_entry(struct inode *dir, unsigned long offset) { const char *error_msg = NULL; - const int rlen = le16_to_cpu(de->rec_len); - const unsigned long next_offset = ((char *) de - buf) + rlen; + unsigned long next_offset; + int rlen; + + if (offset > size - OCFS2_DIR_REC_LEN(1)) { + /* Dirent is (maybe partially) beyond the buffer + * boundaries so touching 'de' members is unsafe. + */ + mlog(ML_ERROR, "directory entry (#%llu: offset=%lu) " + "too close to end or out-of-bounds", + (unsigned long long)OCFS2_I(dir)->ip_blkno, offset); + return 0; + } + + rlen = le16_to_cpu(de->rec_len); + next_offset = ((char *) de - buf) + rlen; if (unlikely(rlen < OCFS2_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; From c9dff86eb78a4b6b02b1e407993c946ccaf9bfb4 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Sat, 25 Oct 2025 20:32:17 +0800 Subject: [PATCH 026/139] ocfs2: use correct endian in ocfs2_dinode_has_extents Fields in ocfs2_dinode is little endian, covert to host endian when checking those contents. Link: https://lkml.kernel.org/r/20251025123218.3997866-1-joseph.qi@linux.alibaba.com Fixes: fdbb6cd96ed5 ("ocfs2: correct l_next_free_rec in online check") Signed-off-by: Joseph Qi Reviewed-by: Heming Zhao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index fcc89856ab95..0a0a96054bfe 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -201,13 +201,15 @@ bail: static int ocfs2_dinode_has_extents(struct ocfs2_dinode *di) { /* inodes flagged with other stuff in id2 */ - if (di->i_flags & (OCFS2_SUPER_BLOCK_FL | OCFS2_LOCAL_ALLOC_FL | - OCFS2_CHAIN_FL | OCFS2_DEALLOC_FL)) + if (le32_to_cpu(di->i_flags) & + (OCFS2_SUPER_BLOCK_FL | OCFS2_LOCAL_ALLOC_FL | OCFS2_CHAIN_FL | + OCFS2_DEALLOC_FL)) return 0; /* i_flags doesn't indicate when id2 is a fast symlink */ - if (S_ISLNK(di->i_mode) && di->i_size && di->i_clusters == 0) + if (S_ISLNK(le16_to_cpu(di->i_mode)) && le64_to_cpu(di->i_size) && + !le32_to_cpu(di->i_clusters)) return 0; - if (di->i_dyn_features & OCFS2_INLINE_DATA_FL) + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) return 0; return 1; From 6e89373cec09af28dd54fc5eed00cde34992f8e2 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Sat, 25 Oct 2025 20:32:18 +0800 Subject: [PATCH 027/139] ocfs2: convert to host endian in ocfs2_validate_inode_block Convert to host endian when checking OCFS2_VALID_FL to keep consistent with other checks. Link: https://lkml.kernel.org/r/20251025123218.3997866-2-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reviewed-by: Heming Zhao Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0a0a96054bfe..dc4044a565b5 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1481,7 +1481,7 @@ int ocfs2_validate_inode_block(struct super_block *sb, goto bail; } - if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + if (!(le32_to_cpu(di->i_flags) & OCFS2_VALID_FL)) { rc = ocfs2_error(sb, "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", (unsigned long long)bh->b_blocknr); From a2b1c419ff72ec62ff5831684e30cd1d4f0b09ee Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 23 Oct 2025 17:16:50 +0300 Subject: [PATCH 028/139] ocfs2: add inline inode consistency check to ocfs2_validate_inode_block() In 'ocfs2_validate_inode_block()', add an extra check whether an inode with inline data (i.e. self-contained) has no clusters, thus preventing an invalid inode from being passed to 'ocfs2_evict_inode()' and below. Link: https://lkml.kernel.org/r/20251023141650.417129-1-dmantipov@yandex.ru Signed-off-by: Dmitry Antipov Reported-by: syzbot+c16daba279a1161acfb0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c16daba279a1161acfb0 Reviewed-by: Joseph Qi Cc: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Heming Zhao Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index dc4044a565b5..dbc38a212c8f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1505,6 +1505,14 @@ int ocfs2_validate_inode_block(struct super_block *sb, goto bail; } + if ((le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) && + le32_to_cpu(di->i_clusters)) { + rc = ocfs2_error(sb, "Invalid dinode %llu: %u clusters\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_clusters)); + goto bail; + } + rc = 0; bail: From 6c2e6e2c1af1809d1d9cdbd50ac80f54f5995bdb Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 25 Oct 2025 16:00:03 +0800 Subject: [PATCH 029/139] dynamic_debug: add support for print stack In practical problem diagnosis, especially during the boot phase, it is often desirable to know the call sequence. However, currently, apart from adding print statements and recompiling the kernel, there seems to be no good alternative. If dynamic_debug supported printing the call stack, it would be very helpful for diagnosing issues. This patch add support '+d' for dump stack. Link: https://lkml.kernel.org/r/20251025080003.312536-1-yebin@huaweicloud.com Signed-off-by: Ye Bin Cc: Jason Baron Cc: Jim Cromie Signed-off-by: Andrew Morton --- .../admin-guide/dynamic-debug-howto.rst | 5 +++-- include/linux/dynamic_debug.h | 17 ++++++++++++++--- lib/dynamic_debug.c | 1 + 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 7c036590cd07..095a63892257 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -223,12 +223,13 @@ The flags are:: f Include the function name s Include the source file name l Include line number + d Include call trace For ``print_hex_dump_debug()`` and ``print_hex_dump_bytes()``, only the ``p`` flag has meaning, other flags are ignored. -Note the regexp ``^[-+=][fslmpt_]+$`` matches a flags specification. -To clear all flags at once, use ``=_`` or ``-fslmpt``. +Note the regexp ``^[-+=][fslmptd_]+$`` matches a flags specification. +To clear all flags at once, use ``=_`` or ``-fslmptd``. Debug messages during Boot Process diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index ff44ec346162..05743900a116 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -38,11 +38,12 @@ struct _ddebug { #define _DPRINTK_FLAGS_INCL_LINENO (1<<3) #define _DPRINTK_FLAGS_INCL_TID (1<<4) #define _DPRINTK_FLAGS_INCL_SOURCENAME (1<<5) +#define _DPRINTK_FLAGS_INCL_STACK (1<<6) #define _DPRINTK_FLAGS_INCL_ANY \ (_DPRINTK_FLAGS_INCL_MODNAME | _DPRINTK_FLAGS_INCL_FUNCNAME |\ _DPRINTK_FLAGS_INCL_LINENO | _DPRINTK_FLAGS_INCL_TID |\ - _DPRINTK_FLAGS_INCL_SOURCENAME) + _DPRINTK_FLAGS_INCL_SOURCENAME | _DPRINTK_FLAGS_INCL_STACK) #if defined DEBUG #define _DPRINTK_FLAGS_DEFAULT _DPRINTK_FLAGS_PRINT @@ -160,6 +161,12 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, const struct ib_device *ibdev, const char *fmt, ...); +#define __dynamic_dump_stack(desc) \ +{ \ + if (desc.flags & _DPRINTK_FLAGS_INCL_STACK) \ + dump_stack(); \ +} + #define DEFINE_DYNAMIC_DEBUG_METADATA_CLS(name, cls, fmt) \ static struct _ddebug __aligned(8) \ __section("__dyndbg") name = { \ @@ -220,8 +227,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, */ #define __dynamic_func_call_cls(id, cls, fmt, func, ...) do { \ DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ - if (DYNAMIC_DEBUG_BRANCH(id)) \ + if (DYNAMIC_DEBUG_BRANCH(id)) { \ func(&id, ##__VA_ARGS__); \ + __dynamic_dump_stack(id); \ + } \ } while (0) #define __dynamic_func_call(id, fmt, func, ...) \ __dynamic_func_call_cls(id, _DPRINTK_CLASS_DFLT, fmt, \ @@ -229,8 +238,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, #define __dynamic_func_call_cls_no_desc(id, cls, fmt, func, ...) do { \ DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ - if (DYNAMIC_DEBUG_BRANCH(id)) \ + if (DYNAMIC_DEBUG_BRANCH(id)) { \ func(__VA_ARGS__); \ + __dynamic_dump_stack(id); \ + } \ } while (0) #define __dynamic_func_call_no_desc(id, fmt, func, ...) \ __dynamic_func_call_cls_no_desc(id, _DPRINTK_CLASS_DFLT, \ diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 5a007952f7f2..7d7892e57a01 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -95,6 +95,7 @@ static const struct { unsigned flag:8; char opt_char; } opt_array[] = { { _DPRINTK_FLAGS_INCL_SOURCENAME, 's' }, { _DPRINTK_FLAGS_INCL_LINENO, 'l' }, { _DPRINTK_FLAGS_INCL_TID, 't' }, + { _DPRINTK_FLAGS_INCL_STACK, 'd' }, { _DPRINTK_FLAGS_NONE, '_' }, }; From a0b8c6af29a4be3ca2ff9a95cf71e54db5d73e65 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 24 Oct 2025 21:51:20 +0100 Subject: [PATCH 030/139] lib/xxhash: remove more unused xxh functions xxh32_reset() and xxh32_copy_state() are unused, and with those gone, the xxh32_state struct is also unused. xxh64_copy_state() is also unused. Remove them all. (Also fixes a comment above the xxh64_state that referred to it as xxh32_state). Link: https://lkml.kernel.org/r/20251024205120.454508-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Suggested-by: Christoph Hellwig Reviewed-by: Kuan-Wei Chiu Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/xxhash.h | 46 +----------------------------------------- lib/xxhash.c | 29 -------------------------- 2 files changed, 1 insertion(+), 74 deletions(-) diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h index 27f57eca8cb1..587122e2c29c 100644 --- a/include/linux/xxhash.h +++ b/include/linux/xxhash.h @@ -141,21 +141,7 @@ static inline unsigned long xxhash(const void *input, size_t length, */ /** - * struct xxh32_state - private xxh32 state, do not use members directly - */ -struct xxh32_state { - uint32_t total_len_32; - uint32_t large_len; - uint32_t v1; - uint32_t v2; - uint32_t v3; - uint32_t v4; - uint32_t mem32[4]; - uint32_t memsize; -}; - -/** - * struct xxh32_state - private xxh64 state, do not use members directly + * struct xxh64_state - private xxh64 state, do not use members directly */ struct xxh64_state { uint64_t total_len; @@ -167,16 +153,6 @@ struct xxh64_state { uint32_t memsize; }; -/** - * xxh32_reset() - reset the xxh32 state to start a new hashing operation - * - * @state: The xxh32 state to reset. - * @seed: Initialize the hash state with this seed. - * - * Call this function on any xxh32_state to prepare for a new hashing operation. - */ -void xxh32_reset(struct xxh32_state *state, uint32_t seed); - /** * xxh64_reset() - reset the xxh64 state to start a new hashing operation * @@ -210,24 +186,4 @@ int xxh64_update(struct xxh64_state *state, const void *input, size_t length); */ uint64_t xxh64_digest(const struct xxh64_state *state); -/*-************************** - * Utils - ***************************/ - -/** - * xxh32_copy_state() - copy the source state into the destination state - * - * @src: The source xxh32 state. - * @dst: The destination xxh32 state. - */ -void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src); - -/** - * xxh64_copy_state() - copy the source state into the destination state - * - * @src: The source xxh64 state. - * @dst: The destination xxh64 state. - */ -void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src); - #endif /* XXHASH_H */ diff --git a/lib/xxhash.c b/lib/xxhash.c index cf629766f376..4125b3e3cf7f 100644 --- a/lib/xxhash.c +++ b/lib/xxhash.c @@ -73,21 +73,6 @@ static const uint64_t PRIME64_3 = 1609587929392839161ULL; static const uint64_t PRIME64_4 = 9650029242287828579ULL; static const uint64_t PRIME64_5 = 2870177450012600261ULL; -/*-************************** - * Utils - ***************************/ -void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src) -{ - memcpy(dst, src, sizeof(*dst)); -} -EXPORT_SYMBOL(xxh32_copy_state); - -void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src) -{ - memcpy(dst, src, sizeof(*dst)); -} -EXPORT_SYMBOL(xxh64_copy_state); - /*-*************************** * Simple Hash Functions ****************************/ @@ -239,20 +224,6 @@ EXPORT_SYMBOL(xxh64); /*-************************************************** * Advanced Hash Functions ***************************************************/ -void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed) -{ - /* use a local state for memcpy() to avoid strict-aliasing warnings */ - struct xxh32_state state; - - memset(&state, 0, sizeof(state)); - state.v1 = seed + PRIME32_1 + PRIME32_2; - state.v2 = seed + PRIME32_2; - state.v3 = seed + 0; - state.v4 = seed - PRIME32_1; - memcpy(statePtr, &state, sizeof(state)); -} -EXPORT_SYMBOL(xxh32_reset); - void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) { /* use a local state for memcpy() to avoid strict-aliasing warnings */ From f9925019f49e75f425623f46f379182dc3a1e8cd Mon Sep 17 00:00:00 2001 From: Yu-Chun Lin Date: Mon, 27 Oct 2025 18:03:09 +0800 Subject: [PATCH 031/139] mailmap: add entry for Yu-Chun Lin Map my personal email to my business email. Link: https://lkml.kernel.org/r/20251027100309.22035-1-eleanor.lin@realtek.com Signed-off-by: Yu-Chun Lin Cc: Kuan-Wei Chiu Cc: Stanley Chang Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index a72b618cf8ce..384c6c309bac 100644 --- a/.mailmap +++ b/.mailmap @@ -856,6 +856,7 @@ Yakir Yang Yanteng Si Ying Huang Yosry Ahmed +Yu-Chun Lin Yusuke Goda Zack Rusin Zhu Yanjun From c25d24d0f4c6c0954f2b0eb1fc69d293e88192bf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 26 Oct 2025 15:31:40 +0100 Subject: [PATCH 032/139] release_task: kill unnecessary rcu_read_lock() around dec_rlimit_ucounts() rcu_read_lock() was added to shut RCU-lockdep up when this code used __task_cred()->rcu_dereference(), but after the commit 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") it is no longer needed: task_ucounts()->task_cred_xxx() takes rcu_read_lock() itself. NOTE: task_ucounts() returns the pointer to another rcu-protected data, struct ucounts. So it should either be used when task->real_cred and thus task->real_cred->ucounts is stable (release_task, copy_process, copy_creds), or it should be called under rcu_read_lock(). In both cases it is pointless to take rcu_read_lock() to read the cred->ucounts pointer. Link: https://lkml.kernel.org/r/20251026143140.GA22463@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Alexey Gladkov Cc: David Howells Cc: Mateusz Guzik Cc: "Paul E . McKenney" Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/exit.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 9f74e8f1c431..f041f0c05ebb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -251,10 +251,8 @@ repeat: memset(&post, 0, sizeof(post)); /* don't need to get the RCU readlock here - the process is dead and - * can't be modifying its own credentials. But shut RCU-lockdep up */ - rcu_read_lock(); + * can't be modifying its own credentials. */ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); - rcu_read_unlock(); pidfs_exit(p); cgroup_release(p); From ded7d97442f0f5af25e942949ac304e025b85165 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Thu, 30 Oct 2025 20:17:46 +0800 Subject: [PATCH 033/139] mailmap: add entry for Hao Ge Use hao.ge@linux.dev as the main address for kernel work Link: https://lkml.kernel.org/r/20251030121746.230747-1-hao.ge@linux.dev Signed-off-by: Hao Ge Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 384c6c309bac..017d357019ac 100644 --- a/.mailmap +++ b/.mailmap @@ -298,6 +298,7 @@ Hans de Goede Hans Verkuil Hans Verkuil Hans Verkuil +Hao Ge Harry Yoo <42.hyeyoo@gmail.com> Heiko Carstens Heiko Carstens From 3e4b89e970365db7fe2c4171be06c0e88e3bcbe6 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 31 Oct 2025 00:46:43 +0900 Subject: [PATCH 034/139] nilfs2: replace vmalloc + copy_from_user with vmemdup_user Replace vmalloc() followed by copy_from_user() with vmemdup_user() to improve nilfs_ioctl_clean_segments() and nilfs_ioctl_set_suinfo(). Use kvfree() to free the buffers created by vmemdup_user(). Use u64_to_user_ptr() instead of manually casting the pointers and remove the obsolete 'out_free' label. No functional changes intended. Link: https://lkml.kernel.org/r/20251030154700.7444-1-konishi.ryusuke@gmail.com Signed-off-by: Thorsten Blum Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/ioctl.c | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 3288c3b4be9e..e17b8da66491 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -49,7 +49,7 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, void *, size_t, size_t)) { void *buf; - void __user *base = (void __user *)(unsigned long)argv->v_base; + void __user *base = u64_to_user_ptr(argv->v_base); size_t maxmembs, total, n; ssize_t nr; int ret, i; @@ -836,7 +836,6 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, sizeof(struct nilfs_bdesc), sizeof(__u64), }; - void __user *base; void *kbufs[5]; struct the_nilfs *nilfs; size_t len, nsegs; @@ -863,7 +862,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, * use kmalloc() for its buffer because the memory used for the * segment numbers is small enough. */ - kbufs[4] = memdup_array_user((void __user *)(unsigned long)argv[4].v_base, + kbufs[4] = memdup_array_user(u64_to_user_ptr(argv[4].v_base), nsegs, sizeof(__u64)); if (IS_ERR(kbufs[4])) { ret = PTR_ERR(kbufs[4]); @@ -883,20 +882,14 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, goto out_free; len = argv[n].v_size * argv[n].v_nmembs; - base = (void __user *)(unsigned long)argv[n].v_base; if (len == 0) { kbufs[n] = NULL; continue; } - kbufs[n] = vmalloc(len); - if (!kbufs[n]) { - ret = -ENOMEM; - goto out_free; - } - if (copy_from_user(kbufs[n], base, len)) { - ret = -EFAULT; - vfree(kbufs[n]); + kbufs[n] = vmemdup_user(u64_to_user_ptr(argv[n].v_base), len); + if (IS_ERR(kbufs[n])) { + ret = PTR_ERR(kbufs[n]); goto out_free; } } @@ -928,7 +921,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, out_free: while (--n >= 0) - vfree(kbufs[n]); + kvfree(kbufs[n]); kfree(kbufs[4]); out: mnt_drop_write_file(filp); @@ -1181,7 +1174,6 @@ static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp, struct nilfs_transaction_info ti; struct nilfs_argv argv; size_t len; - void __user *base; void *kbuf; int ret; @@ -1212,18 +1204,12 @@ static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp, goto out; } - base = (void __user *)(unsigned long)argv.v_base; - kbuf = vmalloc(len); - if (!kbuf) { - ret = -ENOMEM; + kbuf = vmemdup_user(u64_to_user_ptr(argv.v_base), len); + if (IS_ERR(kbuf)) { + ret = PTR_ERR(kbuf); goto out; } - if (copy_from_user(kbuf, base, len)) { - ret = -EFAULT; - goto out_free; - } - nilfs_transaction_begin(inode->i_sb, &ti, 0); ret = nilfs_sufile_set_suinfo(nilfs->ns_sufile, kbuf, argv.v_size, argv.v_nmembs); @@ -1232,8 +1218,7 @@ static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp, else nilfs_transaction_commit(inode->i_sb); /* never fails */ -out_free: - vfree(kbuf); + kvfree(kbuf); out: mnt_drop_write_file(filp); return ret; From d79a3aeb747c17095d679cc4402d87f0e7c3405e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Oct 2025 12:44:17 +0100 Subject: [PATCH 035/139] panic: sys_info: capture si_bits_global before iterating over it Patch series "panic: sys_info: Refactor and fix a potential issue", v3. While targeting the compilation issue due to dangling variable, I have noticed more opportunities for refactoring that helps to avoid above mentioned compilation issue in a cleaner way and also fixes a potential problem with global variable access. This patch (of 6): The for-loop might re-read the content of the memory the si_bits_global points to on each iteration. Instead, just capture it for the sake of consistency and use that instead. Link: https://lkml.kernel.org/r/20251030132007.3742368-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20251030132007.3742368-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Feng Tang Reviewed-by: Petr Mladek Signed-off-by: Andrew Morton --- lib/sys_info.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/sys_info.c b/lib/sys_info.c index 496f9151c9b6..d542a024406a 100644 --- a/lib/sys_info.c +++ b/lib/sys_info.c @@ -58,11 +58,11 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, char names[sizeof(sys_info_avail)]; struct ctl_table table; unsigned long *si_bits_global; + unsigned long si_bits; si_bits_global = ro_table->data; if (write) { - unsigned long si_bits; int ret; table = *ro_table; @@ -81,9 +81,12 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, char *delim = ""; int i, len = 0; + /* The access to the global value is not synchronized. */ + si_bits = READ_ONCE(*si_bits_global); + names[0] = '\0'; for (i = 0; i < ARRAY_SIZE(si_names); i++) { - if (*si_bits_global & si_names[i].bit) { + if (si_bits & si_names[i].bit) { len += scnprintf(names + len, sizeof(names) - len, "%s%s", delim, si_names[i].name); delim = ","; From 760fc597c33d5a727507c8bb19d6ab87a8c5885b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Oct 2025 12:44:18 +0100 Subject: [PATCH 036/139] panic: sys_info: align constant definition names with parameters Align constant definition names with parameters to make it easier to map. It's also better to maintain and extend the names while keeping their uniqueness. Link: https://lkml.kernel.org/r/20251030132007.3742368-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Feng Tang Reviewed-by: Petr Mladek Signed-off-by: Andrew Morton --- include/linux/sys_info.h | 2 +- kernel/panic.c | 2 +- lib/sys_info.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/sys_info.h b/include/linux/sys_info.h index 89d77dc4f2ed..a5bc3ea3d44b 100644 --- a/include/linux/sys_info.h +++ b/include/linux/sys_info.h @@ -14,7 +14,7 @@ #define SYS_INFO_LOCKS 0x00000008 #define SYS_INFO_FTRACE 0x00000010 #define SYS_INFO_PANIC_CONSOLE_REPLAY 0x00000020 -#define SYS_INFO_ALL_CPU_BT 0x00000040 +#define SYS_INFO_ALL_BT 0x00000040 #define SYS_INFO_BLOCKED_TASKS 0x00000080 void sys_info(unsigned long si_mask); diff --git a/kernel/panic.c b/kernel/panic.c index ffceb6f13935..a9af1bbe16b0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -401,7 +401,7 @@ static void panic_trigger_all_cpu_backtrace(void) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & SYS_INFO_ALL_CPU_BT) + if (panic_print & SYS_INFO_ALL_BT) panic_trigger_all_cpu_backtrace(); /* diff --git a/lib/sys_info.c b/lib/sys_info.c index d542a024406a..6b0188b30227 100644 --- a/lib/sys_info.c +++ b/lib/sys_info.c @@ -23,7 +23,7 @@ static const struct sys_info_name si_names[] = { { SYS_INFO_TIMERS, "timers" }, { SYS_INFO_LOCKS, "locks" }, { SYS_INFO_FTRACE, "ftrace" }, - { SYS_INFO_ALL_CPU_BT, "all_bt" }, + { SYS_INFO_ALL_BT, "all_bt" }, { SYS_INFO_BLOCKED_TASKS, "blocked_tasks" }, }; @@ -118,7 +118,7 @@ void sys_info(unsigned long si_mask) if (si_mask & SYS_INFO_FTRACE) ftrace_dump(DUMP_ALL); - if (si_mask & SYS_INFO_ALL_CPU_BT) + if (si_mask & SYS_INFO_ALL_BT) trigger_all_cpu_backtrace(); if (si_mask & SYS_INFO_BLOCKED_TASKS) From d13adc6147f5848d6ad9900fdb1dbf9a280a2f64 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Oct 2025 12:44:19 +0100 Subject: [PATCH 037/139] panic: sys_info:replace struct sys_info_name with plain array of strings There is no need to keep a custom structure just for the need of a plain array of strings. Replace struct sys_info_name with plain array of strings. With that done, simplify the code, in particular, naturally use for_each_set_bit() when iterating over si_bits_global bitmap. Link: https://lkml.kernel.org/r/20251030132007.3742368-4-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Petr Mladek Cc: Feng Tang Signed-off-by: Andrew Morton --- lib/sys_info.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/lib/sys_info.c b/lib/sys_info.c index 6b0188b30227..29a63238b31d 100644 --- a/lib/sys_info.c +++ b/lib/sys_info.c @@ -1,30 +1,29 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include +#include #include #include -#include #include +#include +#include +#include #include -struct sys_info_name { - unsigned long bit; - const char *name; -}; - /* * When 'si_names' gets updated, please make sure the 'sys_info_avail' * below is updated accordingly. */ -static const struct sys_info_name si_names[] = { - { SYS_INFO_TASKS, "tasks" }, - { SYS_INFO_MEM, "mem" }, - { SYS_INFO_TIMERS, "timers" }, - { SYS_INFO_LOCKS, "locks" }, - { SYS_INFO_FTRACE, "ftrace" }, - { SYS_INFO_ALL_BT, "all_bt" }, - { SYS_INFO_BLOCKED_TASKS, "blocked_tasks" }, +static const char * const si_names[] = { + [ilog2(SYS_INFO_TASKS)] = "tasks", + [ilog2(SYS_INFO_MEM)] = "mem", + [ilog2(SYS_INFO_TIMERS)] = "timers", + [ilog2(SYS_INFO_LOCKS)] = "locks", + [ilog2(SYS_INFO_FTRACE)] = "ftrace", + [ilog2(SYS_INFO_PANIC_CONSOLE_REPLAY)] = "", + [ilog2(SYS_INFO_ALL_BT)] = "all_bt", + [ilog2(SYS_INFO_BLOCKED_TASKS)] = "blocked_tasks", }; /* Expecting string like "xxx_sys_info=tasks,mem,timers,locks,ftrace,..." */ @@ -36,12 +35,9 @@ unsigned long sys_info_parse_param(char *str) s = str; while ((name = strsep(&s, ",")) && *name) { - for (i = 0; i < ARRAY_SIZE(si_names); i++) { - if (!strcmp(name, si_names[i].name)) { - si_bits |= si_names[i].bit; - break; - } - } + i = match_string(si_names, ARRAY_SIZE(si_names), name); + if (i >= 0) + __set_bit(i, &si_bits); } return si_bits; @@ -85,10 +81,10 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, si_bits = READ_ONCE(*si_bits_global); names[0] = '\0'; - for (i = 0; i < ARRAY_SIZE(si_names); i++) { - if (si_bits & si_names[i].bit) { + for_each_set_bit(i, &si_bits, ARRAY_SIZE(si_names)) { + if (*si_names[i]) { len += scnprintf(names + len, sizeof(names) - len, - "%s%s", delim, si_names[i].name); + "%s%s", delim, si_names[i]); delim = ","; } } From eb72c4667f4567a7363f6e00d082d2ab32b6a03a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Oct 2025 12:44:20 +0100 Subject: [PATCH 038/139] panic: sys_info: rewrite a fix for a compilation error (`make W=1`) Compiler was not happy about dead variable in use: lib/sys_info.c:52:19: error: variable 'sys_info_avail' is not needed and will not be emitted [-Werror,-Wunneeded-internal-declaration] 52 | static const char sys_info_avail[] = "tasks,mem,timers,locks,ftrace,all_bt,blocked_tasks"; | ^~~~~~~~~~~~~~ This was fixed by adding __maybe_unused attribute that just hides the issue and didn't actually fix the root cause. Rewrite the fix by moving the local variable from stack to a heap. As a side effect this drops unneeded "synchronisation" of duplicative info and also makes code ready for the further refactoring. Link: https://lkml.kernel.org/r/20251030132007.3742368-5-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Petr Mladek Cc: Feng Tang Signed-off-by: Andrew Morton --- lib/sys_info.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/lib/sys_info.c b/lib/sys_info.c index 29a63238b31d..eb5c1226bfc8 100644 --- a/lib/sys_info.c +++ b/lib/sys_info.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include +#include #include #include #include @@ -11,10 +13,6 @@ #include -/* - * When 'si_names' gets updated, please make sure the 'sys_info_avail' - * below is updated accordingly. - */ static const char * const si_names[] = { [ilog2(SYS_INFO_TASKS)] = "tasks", [ilog2(SYS_INFO_MEM)] = "mem", @@ -45,25 +43,32 @@ unsigned long sys_info_parse_param(char *str) #ifdef CONFIG_SYSCTL -static const char sys_info_avail[] __maybe_unused = "tasks,mem,timers,locks,ftrace,all_bt,blocked_tasks"; - int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - char names[sizeof(sys_info_avail)]; struct ctl_table table; unsigned long *si_bits_global; unsigned long si_bits; + unsigned int i; + size_t maxlen; si_bits_global = ro_table->data; + maxlen = 0; + for (i = 0; i < ARRAY_SIZE(si_names); i++) + maxlen += strlen(si_names[i]) + 1; + + char *names __free(kfree) = kzalloc(maxlen, GFP_KERNEL); + if (!names) + return -ENOMEM; + if (write) { int ret; table = *ro_table; table.data = names; - table.maxlen = sizeof(names); + table.maxlen = maxlen; ret = proc_dostring(&table, write, buffer, lenp, ppos); if (ret) return ret; @@ -74,16 +79,15 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, return 0; } else { /* for 'read' operation */ + unsigned int len = 0; char *delim = ""; - int i, len = 0; /* The access to the global value is not synchronized. */ si_bits = READ_ONCE(*si_bits_global); - names[0] = '\0'; for_each_set_bit(i, &si_bits, ARRAY_SIZE(si_names)) { if (*si_names[i]) { - len += scnprintf(names + len, sizeof(names) - len, + len += scnprintf(names + len, maxlen - len, "%s%s", delim, si_names[i]); delim = ","; } @@ -91,7 +95,7 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, table = *ro_table; table.data = names; - table.maxlen = sizeof(names); + table.maxlen = maxlen; return proc_dostring(&table, write, buffer, lenp, ppos); } } From f791dcc842cb1cb3777ae4122be4cd37624ad53d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Oct 2025 12:44:21 +0100 Subject: [PATCH 039/139] panic: sys_info: deduplicate local variable 'table; assignments The both handlers use the local 'table' variable and assign the same data to it, deduplicate that. Link: https://lkml.kernel.org/r/20251030132007.3742368-6-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Feng Tang Reviewed-by: Petr Mladek Signed-off-by: Andrew Morton --- lib/sys_info.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/sys_info.c b/lib/sys_info.c index eb5c1226bfc8..94526de8482b 100644 --- a/lib/sys_info.c +++ b/lib/sys_info.c @@ -63,12 +63,13 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, if (!names) return -ENOMEM; + table = *ro_table; + table.data = names; + table.maxlen = maxlen; + if (write) { int ret; - table = *ro_table; - table.data = names; - table.maxlen = maxlen; ret = proc_dostring(&table, write, buffer, lenp, ppos); if (ret) return ret; @@ -93,9 +94,6 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, } } - table = *ro_table; - table.data = names; - table.maxlen = maxlen; return proc_dostring(&table, write, buffer, lenp, ppos); } } From 9125163273f8033af5d38907b483c1d9f99d781b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Oct 2025 12:44:22 +0100 Subject: [PATCH 040/139] panic: sys_info: factor out read and write handlers For the sake of the code readability and easier maintenance factor out read and write sys_info handlers. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20251030132007.3742368-7-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Petr Mladek Cc: Feng Tang Signed-off-by: Andrew Morton --- lib/sys_info.c | 79 +++++++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/lib/sys_info.c b/lib/sys_info.c index 94526de8482b..323624093e54 100644 --- a/lib/sys_info.c +++ b/lib/sys_info.c @@ -43,18 +43,56 @@ unsigned long sys_info_parse_param(char *str) #ifdef CONFIG_SYSCTL +static int sys_info_write_handler(const struct ctl_table *table, + void *buffer, size_t *lenp, loff_t *ppos, + unsigned long *si_bits_global) +{ + unsigned long si_bits; + int ret; + + ret = proc_dostring(table, 1, buffer, lenp, ppos); + if (ret) + return ret; + + si_bits = sys_info_parse_param(table->data); + + /* The access to the global value is not synchronized. */ + WRITE_ONCE(*si_bits_global, si_bits); + + return 0; +} + +static int sys_info_read_handler(const struct ctl_table *table, + void *buffer, size_t *lenp, loff_t *ppos, + unsigned long *si_bits_global) +{ + unsigned long si_bits; + unsigned int len = 0; + char *delim = ""; + unsigned int i; + + /* The access to the global value is not synchronized. */ + si_bits = READ_ONCE(*si_bits_global); + + for_each_set_bit(i, &si_bits, ARRAY_SIZE(si_names)) { + if (*si_names[i]) { + len += scnprintf(table->data + len, table->maxlen - len, + "%s%s", delim, si_names[i]); + delim = ","; + } + } + + return proc_dostring(table, 0, buffer, lenp, ppos); +} + int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table; - unsigned long *si_bits_global; - unsigned long si_bits; unsigned int i; size_t maxlen; - si_bits_global = ro_table->data; - maxlen = 0; for (i = 0; i < ARRAY_SIZE(si_names); i++) maxlen += strlen(si_names[i]) + 1; @@ -67,35 +105,10 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, table.data = names; table.maxlen = maxlen; - if (write) { - int ret; - - ret = proc_dostring(&table, write, buffer, lenp, ppos); - if (ret) - return ret; - - si_bits = sys_info_parse_param(names); - /* The access to the global value is not synchronized. */ - WRITE_ONCE(*si_bits_global, si_bits); - return 0; - } else { - /* for 'read' operation */ - unsigned int len = 0; - char *delim = ""; - - /* The access to the global value is not synchronized. */ - si_bits = READ_ONCE(*si_bits_global); - - for_each_set_bit(i, &si_bits, ARRAY_SIZE(si_names)) { - if (*si_names[i]) { - len += scnprintf(names + len, maxlen - len, - "%s%s", delim, si_names[i]); - delim = ","; - } - } - - return proc_dostring(&table, write, buffer, lenp, ppos); - } + if (write) + return sys_info_write_handler(&table, buffer, lenp, ppos, ro_table->data); + else + return sys_info_read_handler(&table, buffer, lenp, ppos, ro_table->data); } #endif From e1c70505ee8158c1108340d9cd67182ade93af4a Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 30 Oct 2025 18:30:02 +0300 Subject: [PATCH 041/139] ocfs2: add extra consistency checks for chain allocator dinodes When validating chain allocator dinode in 'ocfs2_validate_inode_block()', add an extra checks whether a) the maximum amount of chain records in 'struct ocfs2_chain_list' matches the value calculated based on the filesystem block size, and b) the next free slot index is within the valid range. Link: https://lkml.kernel.org/r/20251030153003.1934585-1-dmantipov@yandex.ru Signed-off-by: Dmitry Antipov Reported-by: syzbot+77026564530dbc29b854@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=77026564530dbc29b854 Reported-by: syzbot+5054473a31f78f735416@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=5054473a31f78f735416 Suggested-by: Joseph Qi Reviewed-by: Joseph Qi Cc: Junxiao Bi Cc: Jun Piao Cc: Deepanshu Kartikey Cc: Heming Zhao Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index dbc38a212c8f..0f39ce0a2d46 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1513,6 +1513,23 @@ int ocfs2_validate_inode_block(struct super_block *sb, goto bail; } + if (le32_to_cpu(di->i_flags) & OCFS2_CHAIN_FL) { + struct ocfs2_chain_list *cl = &di->id2.i_chain; + + if (le16_to_cpu(cl->cl_count) != ocfs2_chain_recs_per_inode(sb)) { + rc = ocfs2_error(sb, "Invalid dinode %llu: chain list count %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(cl->cl_count)); + goto bail; + } + if (le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) { + rc = ocfs2_error(sb, "Invalid dinode %llu: chain list index %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(cl->cl_next_free_rec)); + goto bail; + } + } + rc = 0; bail: From 93ce0ff117b0c468961d7c296a03ad57e1e8da9f Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Thu, 30 Oct 2025 18:30:03 +0300 Subject: [PATCH 042/139] ocfs2: validate cl_bpc in allocator inodes to prevent divide-by-zero The chain allocator field cl_bpc (blocks per cluster) is read from disk and used in division operations without validation. A corrupted filesystem image with cl_bpc=0 causes a divide-by-zero crash in the kernel: divide error: 0000 [#1] PREEMPT SMP KASAN RIP: 0010:ocfs2_bg_discontig_add_extent fs/ocfs2/suballoc.c:335 [inline] RIP: 0010:ocfs2_block_group_fill+0x5bd/0xa70 fs/ocfs2/suballoc.c:386 Call Trace: ocfs2_block_group_alloc+0x7e9/0x1330 fs/ocfs2/suballoc.c:703 ocfs2_reserve_suballoc_bits+0x20a6/0x4640 fs/ocfs2/suballoc.c:834 ocfs2_reserve_new_inode+0x4f4/0xcc0 fs/ocfs2/suballoc.c:1074 ocfs2_mknod+0x83c/0x2050 fs/ocfs2/namei.c:306 This patch adds validation in ocfs2_validate_inode_block() to ensure cl_bpc matches the expected value calculated from the superblock's cluster size and block size for chain allocator inodes (identified by OCFS2_CHAIN_FL). Moving the validation to inode validation time (rather than allocation time) has several benefits: - Validates once when the inode is read, rather than on every allocation - Protects all code paths that use cl_bpc (allocation, resize, etc.) - Follows the existing pattern of inode validation in OCFS2 - Centralizes validation logic The validation catches both: - Zero values that cause divide-by-zero crashes - Non-zero but incorrect values indicating filesystem corruption or mismatched filesystem geometry With this fix, mounting a corrupted filesystem produces: OCFS2: ERROR (device loop0): ocfs2_validate_inode_block: Inode 74 has corrupted cl_bpc: ondisk=0 expected=16 instead of a kernel crash. [dmantipov@yandex.ru: combine into the series and tweak the message to fit the commonly used style] Link: https://lkml.kernel.org/r/20251030153003.1934585-2-dmantipov@yandex.ru Link: https://lore.kernel.org/ocfs2-devel/20251026132625.12348-1-kartikey406@gmail.com/T/#u [v1] Link: https://lore.kernel.org/all/20251027124131.10002-1-kartikey406@gmail.com/T/ [v2] Signed-off-by: Deepanshu Kartikey Signed-off-by: Dmitry Antipov Reported-by: syzbot+fd8af97c7227fe605d95@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=fd8af97c7227fe605d95 Tested-by: syzbot+fd8af97c7227fe605d95@syzkaller.appspotmail.com Suggested-by: Joseph Qi Reviewed-by: Joseph Qi Cc: Heming Zhao Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0f39ce0a2d46..8d6c106bcb98 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1515,6 +1515,8 @@ int ocfs2_validate_inode_block(struct super_block *sb, if (le32_to_cpu(di->i_flags) & OCFS2_CHAIN_FL) { struct ocfs2_chain_list *cl = &di->id2.i_chain; + u16 bpc = 1 << (OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits); if (le16_to_cpu(cl->cl_count) != ocfs2_chain_recs_per_inode(sb)) { rc = ocfs2_error(sb, "Invalid dinode %llu: chain list count %u\n", @@ -1528,6 +1530,14 @@ int ocfs2_validate_inode_block(struct super_block *sb, le16_to_cpu(cl->cl_next_free_rec)); goto bail; } + if (OCFS2_SB(sb)->bitmap_blkno && + OCFS2_SB(sb)->bitmap_blkno != le64_to_cpu(di->i_blkno) && + le16_to_cpu(cl->cl_bpc) != bpc) { + rc = ocfs2_error(sb, "Invalid dinode %llu: bits per cluster %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(cl->cl_bpc)); + goto bail; + } } rc = 0; From 7f37d88f5cb32fff454f12cd99444686482ca23b Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 29 Oct 2025 13:27:43 +0100 Subject: [PATCH 043/139] lib/Kconfig.debug: cleanup CONFIG_DEBUG_SECTION_MISMATCH help text Simplify formulations, correct flow, split it into proper paragraphs and update structure. No functional changes. Link: https://lkml.kernel.org/r/20251029122743.1110-1-bp@kernel.org Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 19592a57e1ed..9a087826498a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -483,23 +483,23 @@ config DEBUG_SECTION_MISMATCH bool "Enable full Section mismatch analysis" depends on CC_IS_GCC help - The section mismatch analysis checks if there are illegal - references from one section to another section. - During linktime or runtime, some sections are dropped; - any use of code/data previously in these sections would - most likely result in an oops. - In the code, functions and variables are annotated with - __init,, etc. (see the full list in include/linux/init.h), - which results in the code/data being placed in specific sections. + The section mismatch analysis checks if there are illegal references + from one section to another. During linktime or runtime, some + sections are dropped; any use of code/data previously in these + sections would most likely result in an oops. + + In the code, functions and variables are annotated with __init, + __initdata, and so on (see the full list in include/linux/init.h). + This directs the toolchain to place code/data in specific sections. + The section mismatch analysis is always performed after a full - kernel build, and enabling this option causes the following - additional step to occur: - - Add the option -fno-inline-functions-called-once to gcc commands. - When inlining a function annotated with __init in a non-init - function, we would lose the section information and thus - the analysis would not catch the illegal reference. - This option tells gcc to inline less (but it does result in - a larger kernel). + kernel build, and enabling this option causes the option + -fno-inline-functions-called-once to be added to gcc commands. + + However, when inlining a function annotated with __init in + a non-init function, we would lose the section information and thus + the analysis would not catch the illegal reference. This option + tells gcc to inline less (but it does result in a larger kernel). config SECTION_MISMATCH_WARN_ONLY bool "Make section mismatch errors non-fatal" From 464c7ea5c3ffa333a2c1a8dfd68b157ced1edc5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20L=C3=B3pez?= Date: Fri, 31 Oct 2025 12:19:09 +0100 Subject: [PATCH 044/139] checkpatch: add IDR to the deprecated list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As of commit 85656ec193e9, the IDR interface is marked as deprecated in the documentation, but no checks are made in that regard for new code. Add the existing IDR initialization APIs to the deprecated list in checkpatch, so that if new code is introduced using these APIs, a warning is emitted. Link: https://lkml.kernel.org/r/20251031111908.2266077-2-clopez@suse.de Signed-off-by: Carlos López Suggested-by: Dan Williams Acked-by: Dan Williams Acked-by: Joe Perches Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Lukas Bulwahn Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 6729f18e5654..d58ca9655ab7 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -860,6 +860,10 @@ our %deprecated_apis = ( "kunmap" => "kunmap_local", "kmap_atomic" => "kmap_local_page", "kunmap_atomic" => "kunmap_local", + #These should be enough to drive away new IDR users + "DEFINE_IDR" => "DEFINE_XARRAY", + "idr_init" => "xa_init", + "idr_init_base" => "xa_init_flags" ); #Create a search pattern for all these strings to speed up a loop below From bd97c976419126ee3e9acd4957f6f16a90316643 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 4 Nov 2025 19:38:34 +0100 Subject: [PATCH 045/139] util_macros.h: fix kernel-doc for u64_to_user_ptr() The added documentation to u64_to_user_ptr() misspelled the function name. Fix it. Link: https://lkml.kernel.org/r/20251104183834.1046584-1-andriy.shevchenko@linux.intel.com Fixes: 029c896c4105 ("kernel.h: move PTR_IF() and u64_to_user_ptr() to util_macros.h") Signed-off-by: Andy Shevchenko Cc: Alexandru Ardelean Signed-off-by: Andrew Morton --- include/linux/util_macros.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 9373962aade9..2eb528058d0d 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -136,10 +136,10 @@ #define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) /** - * to_user_ptr - cast a pointer passed as u64 from user space to void __user * + * u64_to_user_ptr - cast a pointer passed as u64 from user space to void __user * * @x: The u64 value from user space, usually via IOCTL * - * to_user_ptr() simply casts a pointer passed as u64 from user space to void + * u64_to_user_ptr() simply casts a pointer passed as u64 from user space to void * __user * correctly. Using this lets us get rid of all the tiresome casts. */ #define u64_to_user_ptr(x) \ From af9b65d6864aa5e94839d150b902168fd44c6107 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 4 Nov 2025 07:19:20 -0500 Subject: [PATCH 046/139] kernel/hung_task: unexport sysctl_hung_task_timeout_secs This was added by the bcachefs pull requests despite various objections, and with bcachefs removed is now unused. This reverts commit 5c3273ec3c6a ("kernel/hung_task.c: export sysctl_hung_task_timeout_secs"). Link: https://lkml.kernel.org/r/20251104121920.2430568-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Lance Yang Reviewed-by: Masami Hiramatsu (Google) Cc: Kent Overstreet Signed-off-by: Andrew Morton --- kernel/hung_task.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 84b4b049faa5..5ac0e66a1361 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -50,7 +50,6 @@ static unsigned long __read_mostly sysctl_hung_task_detect_count; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; -EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); /* * Zero (default value) means use sysctl_hung_task_timeout_secs: From 5944f875ac27cae8b831206aef011a444efa637d Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 5 Nov 2025 20:10:27 +0000 Subject: [PATCH 047/139] lib: mul_u64_u64_div_u64(): rename parameter 'c' to 'd' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Implement mul_u64_u64_div_u64_roundup()", v5. The pwm-stm32.c code wants a 'rounding up' version of mul_u64_u64_div_u64(). This can be done simply by adding 'divisor - 1' to the 128bit product. Implement mul_u64_add_u64_div_u64(a, b, c, d) = (a * b + c)/d based on the existing code. Define mul_u64_u64_div_u64(a, b, d) as mul_u64_add_u64_div_u64(a, b, 0, d) and mul_u64_u64_div_u64_roundup(a, b, d) as mul_u64_add_u64_div_u64(a, b, d-1, d). Only x86-64 has an optimsed (asm) version of the function. That is optimised to avoid the 'add c' when c is known to be zero. In all other cases the extra code will be noise compared to the software divide code. The test module has been updated to test mul_u64_u64_div_u64_roundup() and also enhanced it to verify the C division code on x86-64 and the 32bit division code on 64bit. This patch (of 9): Change to prototype from mul_u64_u64_div_u64(u64 a, u64 b, u64 c) to mul_u64_u64_div_u64(u64 a, u64 b, u64 d). Using 'd' for 'divisor' makes more sense. An upcoming change adds a 'c' parameter to calculate (a * b + c)/d. Link: https://lkml.kernel.org/r/20251105201035.64043-1-david.laight.linux@gmail.com Link: https://lkml.kernel.org/r/20251105201035.64043-2-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Nicolas Pitre Cc: Biju Das Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jens Axboe Cc: Li RongQing Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleinxer Cc: Uwe Kleine-König Signed-off-by: Andrew Morton --- lib/math/div64.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/math/div64.c b/lib/math/div64.c index bf77b9843175..0ebff850fd4d 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -184,10 +184,10 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) EXPORT_SYMBOL(iter_div_u64_rem); #ifndef mul_u64_u64_div_u64 -u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) +u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d) { if (ilog2(a) + ilog2(b) <= 62) - return div64_u64(a * b, c); + return div64_u64(a * b, d); #if defined(__SIZEOF_INT128__) @@ -212,37 +212,37 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) #endif - /* make sure c is not zero, trigger runtime exception otherwise */ - if (unlikely(c == 0)) { + /* make sure d is not zero, trigger runtime exception otherwise */ + if (unlikely(d == 0)) { unsigned long zero = 0; OPTIMIZER_HIDE_VAR(zero); return ~0UL/zero; } - int shift = __builtin_ctzll(c); + int shift = __builtin_ctzll(d); /* try reducing the fraction in case the dividend becomes <= 64 bits */ if ((n_hi >> shift) == 0) { u64 n = shift ? (n_lo >> shift) | (n_hi << (64 - shift)) : n_lo; - return div64_u64(n, c >> shift); + return div64_u64(n, d >> shift); /* * The remainder value if needed would be: - * res = div64_u64_rem(n, c >> shift, &rem); + * res = div64_u64_rem(n, d >> shift, &rem); * rem = (rem << shift) + (n_lo - (n << shift)); */ } - if (n_hi >= c) { + if (n_hi >= d) { /* overflow: result is unrepresentable in a u64 */ return -1; } /* Do the full 128 by 64 bits division */ - shift = __builtin_clzll(c); - c <<= shift; + shift = __builtin_clzll(d); + d <<= shift; int p = 64 + shift; u64 res = 0; @@ -257,8 +257,8 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) n_hi <<= shift; n_hi |= n_lo >> (64 - shift); n_lo <<= shift; - if (carry || (n_hi >= c)) { - n_hi -= c; + if (carry || (n_hi >= d)) { + n_hi -= d; res |= 1ULL << p; } } while (n_hi); From 08092babd362170e059330a6a2d44c2891d9dbac Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 5 Nov 2025 20:10:28 +0000 Subject: [PATCH 048/139] lib: mul_u64_u64_div_u64(): combine overflow and divide by zero checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the overflow check always triggers when the divisor is zero move the check for divide by zero inside the overflow check. This means there is only one test in the normal path. Link: https://lkml.kernel.org/r/20251105201035.64043-3-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Nicolas Pitre Cc: Biju Das Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jens Axboe Cc: Li RongQing Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleinxer Cc: Uwe Kleine-König Signed-off-by: Andrew Morton --- lib/math/div64.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lib/math/div64.c b/lib/math/div64.c index 0ebff850fd4d..1092f41e878e 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -212,12 +212,16 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d) #endif - /* make sure d is not zero, trigger runtime exception otherwise */ - if (unlikely(d == 0)) { - unsigned long zero = 0; + if (unlikely(n_hi >= d)) { + /* trigger runtime exception if divisor is zero */ + if (d == 0) { + unsigned long zero = 0; - OPTIMIZER_HIDE_VAR(zero); - return ~0UL/zero; + OPTIMIZER_HIDE_VAR(zero); + return ~0UL/zero; + } + /* overflow: result is unrepresentable in a u64 */ + return ~0ULL; } int shift = __builtin_ctzll(d); @@ -234,11 +238,6 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d) */ } - if (n_hi >= d) { - /* overflow: result is unrepresentable in a u64 */ - return -1; - } - /* Do the full 128 by 64 bits division */ shift = __builtin_clzll(d); From d91f891d588557874a45a5f584b6da0b433acee7 Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 5 Nov 2025 20:10:29 +0000 Subject: [PATCH 049/139] lib: mul_u64_u64_div_u64(): simplify check for a 64bit product MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the product is only 64bits div64_u64() can be used for the divide. Replace the pre-multiply check (ilog2(a) + ilog2(b) <= 62) with a simple post-multiply check that the high 64bits are zero. This has the advantage of being simpler, more accurate and less code. It will always be faster when the product is larger than 64bits. Most 64bit cpu have a native 64x64=128 bit multiply, this is needed (for the low 64bits) even when div64_u64() is called - so the early check gains nothing and is just extra code. 32bit cpu will need a compare (etc) to generate the 64bit ilog2() from two 32bit bit scans - so that is non-trivial. (Never mind the mess of x86's 'bsr' and any oddball cpu without fast bit-scan instructions.) Whereas the additional instructions for the 128bit multiply result are pretty much one multiply and two adds (typically the 'adc $0,%reg' can be run in parallel with the instruction that follows). The only outliers are 64bit systems without 128bit mutiply and simple in order 32bit ones with fast bit scan but needing extra instructions to get the high bits of the multiply result. I doubt it makes much difference to either, the latter is definitely not mainstream. If anyone is worried about the analysis they can look at the generated code for x86 (especially when cmov isn't used). Link: https://lkml.kernel.org/r/20251105201035.64043-4-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Nicolas Pitre Cc: Biju Das Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jens Axboe Cc: Li RongQing Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleinxer Cc: Uwe Kleine-König Signed-off-by: Andrew Morton --- lib/math/div64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/math/div64.c b/lib/math/div64.c index 1092f41e878e..4a4b1aa9e6e1 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -186,9 +186,6 @@ EXPORT_SYMBOL(iter_div_u64_rem); #ifndef mul_u64_u64_div_u64 u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d) { - if (ilog2(a) + ilog2(b) <= 62) - return div64_u64(a * b, d); - #if defined(__SIZEOF_INT128__) /* native 64x64=128 bits multiplication */ @@ -212,6 +209,9 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d) #endif + if (!n_hi) + return div64_u64(n_lo, d); + if (unlikely(n_hi >= d)) { /* trigger runtime exception if divisor is zero */ if (d == 0) { From 6480241f31f543333ed0c7a209962412461f6e41 Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 5 Nov 2025 20:10:30 +0000 Subject: [PATCH 050/139] lib: add mul_u64_add_u64_div_u64() and mul_u64_u64_div_u64_roundup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing mul_u64_u64_div_u64() rounds down, a 'rounding up' variant needs 'divisor - 1' adding in between the multiply and divide so cannot easily be done by a caller. Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d and implement the 'round down' and 'round up' using it. Update the x86-64 asm to optimise for 'c' being a constant zero. Add kerndoc definitions for all three functions. Link: https://lkml.kernel.org/r/20251105201035.64043-5-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Nicolas Pitre Cc: Biju Das Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jens Axboe Cc: Li RongQing Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleinxer Cc: Uwe Kleine-König Signed-off-by: Andrew Morton --- arch/x86/include/asm/div64.h | 20 +++++++++------ include/linux/math64.h | 48 +++++++++++++++++++++++++++++++++++- lib/math/div64.c | 14 ++++++----- 3 files changed, 67 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index 9931e4c7d73f..6d8a3de3f43a 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -84,21 +84,25 @@ static inline u64 mul_u32_u32(u32 a, u32 b) * Will generate an #DE when the result doesn't fit u64, could fix with an * __ex_table[] entry when it becomes an issue. */ -static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div) +static inline u64 mul_u64_add_u64_div_u64(u64 rax, u64 mul, u64 add, u64 div) { - u64 q; + u64 rdx; - asm ("mulq %2; divq %3" : "=a" (q) - : "a" (a), "rm" (mul), "rm" (div) - : "rdx"); + asm ("mulq %[mul]" : "+a" (rax), "=d" (rdx) : [mul] "rm" (mul)); - return q; + if (!statically_true(!add)) + asm ("addq %[add], %[lo]; adcq $0, %[hi]" : + [lo] "+r" (rax), [hi] "+r" (rdx) : [add] "irm" (add)); + + asm ("divq %[div]" : "+a" (rax), "+d" (rdx) : [div] "rm" (div)); + + return rax; } -#define mul_u64_u64_div_u64 mul_u64_u64_div_u64 +#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64 static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div) { - return mul_u64_u64_div_u64(a, mul, div); + return mul_u64_add_u64_div_u64(a, mul, 0, div); } #define mul_u64_u32_div mul_u64_u32_div diff --git a/include/linux/math64.h b/include/linux/math64.h index 6aaccc1626ab..e889d850b7f1 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -282,7 +282,53 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) } #endif /* mul_u64_u32_div */ -u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); +/** + * mul_u64_add_u64_div_u64 - unsigned 64bit multiply, add, and divide + * @a: first unsigned 64bit multiplicand + * @b: second unsigned 64bit multiplicand + * @c: unsigned 64bit addend + * @d: unsigned 64bit divisor + * + * Multiply two 64bit values together to generate a 128bit product + * add a third value and then divide by a fourth. + * The Generic code divides by 0 if @d is zero and returns ~0 on overflow. + * Architecture specific code may trap on zero or overflow. + * + * Return: (@a * @b + @c) / @d + */ +u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d); + +/** + * mul_u64_u64_div_u64 - unsigned 64bit multiply and divide + * @a: first unsigned 64bit multiplicand + * @b: second unsigned 64bit multiplicand + * @d: unsigned 64bit divisor + * + * Multiply two 64bit values together to generate a 128bit product + * and then divide by a third value. + * The Generic code divides by 0 if @d is zero and returns ~0 on overflow. + * Architecture specific code may trap on zero or overflow. + * + * Return: @a * @b / @d + */ +#define mul_u64_u64_div_u64(a, b, d) mul_u64_add_u64_div_u64(a, b, 0, d) + +/** + * mul_u64_u64_div_u64_roundup - unsigned 64bit multiply and divide rounded up + * @a: first unsigned 64bit multiplicand + * @b: second unsigned 64bit multiplicand + * @d: unsigned 64bit divisor + * + * Multiply two 64bit values together to generate a 128bit product + * and then divide and round up. + * The Generic code divides by 0 if @d is zero and returns ~0 on overflow. + * Architecture specific code may trap on zero or overflow. + * + * Return: (@a * @b + @d - 1) / @d + */ +#define mul_u64_u64_div_u64_roundup(a, b, d) \ + ({ u64 _tmp = (d); mul_u64_add_u64_div_u64(a, b, _tmp - 1, _tmp); }) + /** * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up diff --git a/lib/math/div64.c b/lib/math/div64.c index 4a4b1aa9e6e1..a88391b8ada0 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -183,13 +183,13 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) } EXPORT_SYMBOL(iter_div_u64_rem); -#ifndef mul_u64_u64_div_u64 -u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d) +#ifndef mul_u64_add_u64_div_u64 +u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) { #if defined(__SIZEOF_INT128__) /* native 64x64=128 bits multiplication */ - u128 prod = (u128)a * b; + u128 prod = (u128)a * b + c; u64 n_lo = prod, n_hi = prod >> 64; #else @@ -198,8 +198,10 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d) u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32; u64 x, y, z; - x = (u64)a_lo * b_lo; - y = (u64)a_lo * b_hi + (u32)(x >> 32); + /* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */ + x = (u64)a_lo * b_lo + (u32)c; + y = (u64)a_lo * b_hi + (u32)(c >> 32); + y += (u32)(x >> 32); z = (u64)a_hi * b_hi + (u32)(y >> 32); y = (u64)a_hi * b_lo + (u32)y; z += (u32)(y >> 32); @@ -265,5 +267,5 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d) return res; } -EXPORT_SYMBOL(mul_u64_u64_div_u64); +EXPORT_SYMBOL(mul_u64_add_u64_div_u64); #endif From 500db21917e8aaafd65360bfed35845d549aa3dd Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 5 Nov 2025 20:10:31 +0000 Subject: [PATCH 051/139] lib: add tests for mul_u64_u64_div_u64_roundup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replicate the existing mul_u64_u64_div_u64() test cases with round up. Update the shell script that verifies the table, remove the comment markers so that it can be directly pasted into a shell. Rename the divisor from 'c' to 'd' to match mul_u64_add_u64_div_u64(). It any tests fail then fail the module load with -EINVAL. Link: https://lkml.kernel.org/r/20251105201035.64043-6-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Nicolas Pitre Cc: Biju Das Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jens Axboe Cc: Li RongQing Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleinxer Cc: Uwe Kleine-König Signed-off-by: Andrew Morton --- lib/math/test_mul_u64_u64_div_u64.c | 122 +++++++++++++++++----------- 1 file changed, 73 insertions(+), 49 deletions(-) diff --git a/lib/math/test_mul_u64_u64_div_u64.c b/lib/math/test_mul_u64_u64_div_u64.c index 58d058de4e73..4d5e4e5dac67 100644 --- a/lib/math/test_mul_u64_u64_div_u64.c +++ b/lib/math/test_mul_u64_u64_div_u64.c @@ -10,61 +10,73 @@ #include #include -typedef struct { u64 a; u64 b; u64 c; u64 result; } test_params; +typedef struct { u64 a; u64 b; u64 d; u64 result; uint round_up;} test_params; static test_params test_values[] = { /* this contains many edge values followed by a couple random values */ -{ 0xb, 0x7, 0x3, 0x19 }, -{ 0xffff0000, 0xffff0000, 0xf, 0x1110eeef00000000 }, -{ 0xffffffff, 0xffffffff, 0x1, 0xfffffffe00000001 }, -{ 0xffffffff, 0xffffffff, 0x2, 0x7fffffff00000000 }, -{ 0x1ffffffff, 0xffffffff, 0x2, 0xfffffffe80000000 }, -{ 0x1ffffffff, 0xffffffff, 0x3, 0xaaaaaaa9aaaaaaab }, -{ 0x1ffffffff, 0x1ffffffff, 0x4, 0xffffffff00000000 }, -{ 0xffff000000000000, 0xffff000000000000, 0xffff000000000001, 0xfffeffffffffffff }, -{ 0x3333333333333333, 0x3333333333333333, 0x5555555555555555, 0x1eb851eb851eb851 }, -{ 0x7fffffffffffffff, 0x2, 0x3, 0x5555555555555554 }, -{ 0xffffffffffffffff, 0x2, 0x8000000000000000, 0x3 }, -{ 0xffffffffffffffff, 0x2, 0xc000000000000000, 0x2 }, -{ 0xffffffffffffffff, 0x4000000000000004, 0x8000000000000000, 0x8000000000000007 }, -{ 0xffffffffffffffff, 0x4000000000000001, 0x8000000000000000, 0x8000000000000001 }, -{ 0xffffffffffffffff, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000001 }, -{ 0xfffffffffffffffe, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000000 }, -{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffe, 0x8000000000000001 }, -{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffd, 0x8000000000000002 }, -{ 0x7fffffffffffffff, 0xffffffffffffffff, 0xc000000000000000, 0xaaaaaaaaaaaaaaa8 }, -{ 0xffffffffffffffff, 0x7fffffffffffffff, 0xa000000000000000, 0xccccccccccccccca }, -{ 0xffffffffffffffff, 0x7fffffffffffffff, 0x9000000000000000, 0xe38e38e38e38e38b }, -{ 0x7fffffffffffffff, 0x7fffffffffffffff, 0x5000000000000000, 0xccccccccccccccc9 }, -{ 0xffffffffffffffff, 0xfffffffffffffffe, 0xffffffffffffffff, 0xfffffffffffffffe }, -{ 0xe6102d256d7ea3ae, 0x70a77d0be4c31201, 0xd63ec35ab3220357, 0x78f8bf8cc86c6e18 }, -{ 0xf53bae05cb86c6e1, 0x3847b32d2f8d32e0, 0xcfd4f55a647f403c, 0x42687f79d8998d35 }, -{ 0x9951c5498f941092, 0x1f8c8bfdf287a251, 0xa3c8dc5f81ea3fe2, 0x1d887cb25900091f }, -{ 0x374fee9daa1bb2bb, 0x0d0bfbff7b8ae3ef, 0xc169337bd42d5179, 0x03bb2dbaffcbb961 }, -{ 0xeac0d03ac10eeaf0, 0x89be05dfa162ed9b, 0x92bb1679a41f0e4b, 0xdc5f5cc9e270d216 }, +{ 0xb, 0x7, 0x3, 0x19, 1 }, +{ 0xffff0000, 0xffff0000, 0xf, 0x1110eeef00000000, 0 }, +{ 0xffffffff, 0xffffffff, 0x1, 0xfffffffe00000001, 0 }, +{ 0xffffffff, 0xffffffff, 0x2, 0x7fffffff00000000, 1 }, +{ 0x1ffffffff, 0xffffffff, 0x2, 0xfffffffe80000000, 1 }, +{ 0x1ffffffff, 0xffffffff, 0x3, 0xaaaaaaa9aaaaaaab, 0 }, +{ 0x1ffffffff, 0x1ffffffff, 0x4, 0xffffffff00000000, 1 }, +{ 0xffff000000000000, 0xffff000000000000, 0xffff000000000001, 0xfffeffffffffffff, 1 }, +{ 0x3333333333333333, 0x3333333333333333, 0x5555555555555555, 0x1eb851eb851eb851, 1 }, +{ 0x7fffffffffffffff, 0x2, 0x3, 0x5555555555555554, 1 }, +{ 0xffffffffffffffff, 0x2, 0x8000000000000000, 0x3, 1 }, +{ 0xffffffffffffffff, 0x2, 0xc000000000000000, 0x2, 1 }, +{ 0xffffffffffffffff, 0x4000000000000004, 0x8000000000000000, 0x8000000000000007, 1 }, +{ 0xffffffffffffffff, 0x4000000000000001, 0x8000000000000000, 0x8000000000000001, 1 }, +{ 0xffffffffffffffff, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000001, 0 }, +{ 0xfffffffffffffffe, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000000, 1 }, +{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffe, 0x8000000000000001, 1 }, +{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffd, 0x8000000000000002, 1 }, +{ 0x7fffffffffffffff, 0xffffffffffffffff, 0xc000000000000000, 0xaaaaaaaaaaaaaaa8, 1 }, +{ 0xffffffffffffffff, 0x7fffffffffffffff, 0xa000000000000000, 0xccccccccccccccca, 1 }, +{ 0xffffffffffffffff, 0x7fffffffffffffff, 0x9000000000000000, 0xe38e38e38e38e38b, 1 }, +{ 0x7fffffffffffffff, 0x7fffffffffffffff, 0x5000000000000000, 0xccccccccccccccc9, 1 }, +{ 0xffffffffffffffff, 0xfffffffffffffffe, 0xffffffffffffffff, 0xfffffffffffffffe, 0 }, +{ 0xe6102d256d7ea3ae, 0x70a77d0be4c31201, 0xd63ec35ab3220357, 0x78f8bf8cc86c6e18, 1 }, +{ 0xf53bae05cb86c6e1, 0x3847b32d2f8d32e0, 0xcfd4f55a647f403c, 0x42687f79d8998d35, 1 }, +{ 0x9951c5498f941092, 0x1f8c8bfdf287a251, 0xa3c8dc5f81ea3fe2, 0x1d887cb25900091f, 1 }, +{ 0x374fee9daa1bb2bb, 0x0d0bfbff7b8ae3ef, 0xc169337bd42d5179, 0x03bb2dbaffcbb961, 1 }, +{ 0xeac0d03ac10eeaf0, 0x89be05dfa162ed9b, 0x92bb1679a41f0e4b, 0xdc5f5cc9e270d216, 1 }, }; /* * The above table can be verified with the following shell script: - * - * #!/bin/sh - * sed -ne 's/^{ \+\(.*\), \+\(.*\), \+\(.*\), \+\(.*\) },$/\1 \2 \3 \4/p' \ - * lib/math/test_mul_u64_u64_div_u64.c | - * while read a b c r; do - * expected=$( printf "obase=16; ibase=16; %X * %X / %X\n" $a $b $c | bc ) - * given=$( printf "%X\n" $r ) - * if [ "$expected" = "$given" ]; then - * echo "$a * $b / $c = $r OK" - * else - * echo "$a * $b / $c = $r is wrong" >&2 - * echo "should be equivalent to 0x$expected" >&2 - * exit 1 - * fi - * done + +#!/bin/sh +sed -ne 's/^{ \+\(.*\), \+\(.*\), \+\(.*\), \+\(.*\), \+\(.*\) },$/\1 \2 \3 \4 \5/p' \ + lib/math/test_mul_u64_u64_div_u64.c | +while read a b d r e; do + expected=$( printf "obase=16; ibase=16; %X * %X / %X\n" $a $b $d | bc ) + given=$( printf "%X\n" $r ) + if [ "$expected" = "$given" ]; then + echo "$a * $b / $d = $r OK" + else + echo "$a * $b / $d = $r is wrong" >&2 + echo "should be equivalent to 0x$expected" >&2 + exit 1 + fi + expected=$( printf "obase=16; ibase=16; (%X * %X + %X) / %X\n" $a $b $((d-1)) $d | bc ) + given=$( printf "%X\n" $((r + e)) ) + if [ "$expected" = "$given" ]; then + echo "$a * $b +/ $d = $(printf '%#x' $((r + e))) OK" + else + echo "$a * $b +/ $d = $(printf '%#x' $((r + e))) is wrong" >&2 + echo "should be equivalent to 0x$expected" >&2 + exit 1 + fi +done + */ static int __init test_init(void) { + int errors = 0; + int tests = 0; int i; pr_info("Starting mul_u64_u64_div_u64() test\n"); @@ -72,19 +84,31 @@ static int __init test_init(void) for (i = 0; i < ARRAY_SIZE(test_values); i++) { u64 a = test_values[i].a; u64 b = test_values[i].b; - u64 c = test_values[i].c; + u64 d = test_values[i].d; u64 expected_result = test_values[i].result; - u64 result = mul_u64_u64_div_u64(a, b, c); + u64 result = mul_u64_u64_div_u64(a, b, d); + u64 result_up = mul_u64_u64_div_u64_roundup(a, b, d); + + tests += 2; if (result != expected_result) { - pr_err("ERROR: 0x%016llx * 0x%016llx / 0x%016llx\n", a, b, c); + pr_err("ERROR: 0x%016llx * 0x%016llx / 0x%016llx\n", a, b, d); pr_err("ERROR: expected result: %016llx\n", expected_result); pr_err("ERROR: obtained result: %016llx\n", result); + errors++; + } + expected_result += test_values[i].round_up; + if (result_up != expected_result) { + pr_err("ERROR: 0x%016llx * 0x%016llx +/ 0x%016llx\n", a, b, d); + pr_err("ERROR: expected result: %016llx\n", expected_result); + pr_err("ERROR: obtained result: %016llx\n", result_up); + errors++; } } - pr_info("Completed mul_u64_u64_div_u64() test\n"); - return 0; + pr_info("Completed mul_u64_u64_div_u64() test, %d tests, %d errors\n", + tests, errors); + return errors ? -EINVAL : 0; } static void __exit test_exit(void) From f0bff2eb04686f13386b2af97bc7aaa09f020f35 Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 5 Nov 2025 20:10:32 +0000 Subject: [PATCH 052/139] lib: test_mul_u64_u64_div_u64(): test both generic and arch versions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change the #if in div64.c so that test_mul_u64_u64_div_u64.c can compile and test the generic version (including the 'long multiply') on architectures (eg amd64) that define their own copy. Test the kernel version and the locally compiled version on all arch. Output the time taken (in ns) on the 'test completed' trace. For reference, on my zen 5, the optimised version takes ~220ns and the generic version ~3350ns. Using the native multiply saves ~200ns and adding back the ilog2() 'optimisation' test adds ~50ms. Link: https://lkml.kernel.org/r/20251105201035.64043-7-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Nicolas Pitre Cc: Biju Das Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jens Axboe Cc: Li RongQing Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleinxer Cc: Uwe Kleine-König Signed-off-by: Andrew Morton --- lib/math/div64.c | 8 +++-- lib/math/test_mul_u64_u64_div_u64.c | 52 +++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/lib/math/div64.c b/lib/math/div64.c index a88391b8ada0..18a9ba26c418 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -177,16 +177,18 @@ EXPORT_SYMBOL(div64_s64); * Iterative div/mod for use when dividend is not expected to be much * bigger than divisor. */ +#ifndef iter_div_u64_rem u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) { return __iter_div_u64_rem(dividend, divisor, remainder); } EXPORT_SYMBOL(iter_div_u64_rem); +#endif -#ifndef mul_u64_add_u64_div_u64 +#if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64) u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) { -#if defined(__SIZEOF_INT128__) +#if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64) /* native 64x64=128 bits multiplication */ u128 prod = (u128)a * b + c; @@ -267,5 +269,7 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) return res; } +#if !defined(test_mul_u64_add_u64_div_u64) EXPORT_SYMBOL(mul_u64_add_u64_div_u64); #endif +#endif diff --git a/lib/math/test_mul_u64_u64_div_u64.c b/lib/math/test_mul_u64_u64_div_u64.c index 4d5e4e5dac67..d8d2c18c4614 100644 --- a/lib/math/test_mul_u64_u64_div_u64.c +++ b/lib/math/test_mul_u64_u64_div_u64.c @@ -73,21 +73,34 @@ done */ -static int __init test_init(void) +static u64 test_mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d); + +static int __init test_run(unsigned int fn_no, const char *fn_name) { + u64 start_time; int errors = 0; int tests = 0; int i; - pr_info("Starting mul_u64_u64_div_u64() test\n"); + start_time = ktime_get_ns(); for (i = 0; i < ARRAY_SIZE(test_values); i++) { u64 a = test_values[i].a; u64 b = test_values[i].b; u64 d = test_values[i].d; u64 expected_result = test_values[i].result; - u64 result = mul_u64_u64_div_u64(a, b, d); - u64 result_up = mul_u64_u64_div_u64_roundup(a, b, d); + u64 result, result_up; + + switch (fn_no) { + default: + result = mul_u64_u64_div_u64(a, b, d); + result_up = mul_u64_u64_div_u64_roundup(a, b, d); + break; + case 1: + result = test_mul_u64_add_u64_div_u64(a, b, 0, d); + result_up = test_mul_u64_add_u64_div_u64(a, b, d - 1, d); + break; + } tests += 2; @@ -106,15 +119,40 @@ static int __init test_init(void) } } - pr_info("Completed mul_u64_u64_div_u64() test, %d tests, %d errors\n", - tests, errors); - return errors ? -EINVAL : 0; + pr_info("Completed %s() test, %d tests, %d errors, %llu ns\n", + fn_name, tests, errors, ktime_get_ns() - start_time); + return errors; +} + +static int __init test_init(void) +{ + pr_info("Starting mul_u64_u64_div_u64() test\n"); + if (test_run(0, "mul_u64_u64_div_u64")) + return -EINVAL; + if (test_run(1, "test_mul_u64_u64_div_u64")) + return -EINVAL; + return 0; } static void __exit test_exit(void) { } +/* Compile the generic mul_u64_add_u64_div_u64() code */ +#undef __div64_32 +#define __div64_32 __div64_32 +#define div_s64_rem div_s64_rem +#define div64_u64_rem div64_u64_rem +#define div64_u64 div64_u64 +#define div64_s64 div64_s64 +#define iter_div_u64_rem iter_div_u64_rem + +#undef mul_u64_add_u64_div_u64 +#define mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64 +#define test_mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64 + +#include "div64.c" + module_init(test_init); module_exit(test_exit); From 630f96a687def5616d6fa7f069adcea158320909 Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 5 Nov 2025 20:10:33 +0000 Subject: [PATCH 053/139] lib: mul_u64_u64_div_u64(): optimise multiply on 32bit x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc generates horrid code for both ((u64)u32_a * u32_b) and (u64_a + u32_b). As well as the extra instructions it can generate a lot of spills to stack (including spills of constant zeros and even multiplies by constant zero). mul_u32_u32() already exists to optimise the multiply. Add a similar add_u64_32() for the addition. Disable both for clang - it generates better code without them. Move the 64x64 => 128 multiply into a static inline helper function for code clarity. No need for the a/b_hi/lo variables, the implicit casts on the function calls do the work for us. Should have minimal effect on the generated code. Use mul_u32_u32() and add_u64_u32() in the 64x64 => 128 multiply in mul_u64_add_u64_div_u64(). Link: https://lkml.kernel.org/r/20251105201035.64043-8-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Nicolas Pitre Cc: Biju Das Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jens Axboe Cc: Li RongQing Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleinxer Cc: Uwe Kleine-König Signed-off-by: Andrew Morton --- arch/x86/include/asm/div64.h | 19 +++++++++++++++++ include/linux/math64.h | 11 ++++++++++ lib/math/div64.c | 40 +++++++++++++++++++++++------------- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index 6d8a3de3f43a..30fd06ede751 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -60,6 +60,12 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) } #define div_u64_rem div_u64_rem +/* + * gcc tends to zero extend 32bit values and do full 64bit maths. + * Define asm functions that avoid this. + * (clang generates better code for the C versions.) + */ +#ifndef __clang__ static inline u64 mul_u32_u32(u32 a, u32 b) { u32 high, low; @@ -71,6 +77,19 @@ static inline u64 mul_u32_u32(u32 a, u32 b) } #define mul_u32_u32 mul_u32_u32 +static inline u64 add_u64_u32(u64 a, u32 b) +{ + u32 high = a >> 32, low = a; + + asm ("addl %[b], %[low]; adcl $0, %[high]" + : [low] "+r" (low), [high] "+r" (high) + : [b] "rm" (b) ); + + return low | (u64)high << 32; +} +#define add_u64_u32 add_u64_u32 +#endif + /* * __div64_32() is never called on x86, so prevent the * generic definition from getting built. diff --git a/include/linux/math64.h b/include/linux/math64.h index e889d850b7f1..cc305206d89f 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -158,6 +158,17 @@ static inline u64 mul_u32_u32(u32 a, u32 b) } #endif +#ifndef add_u64_u32 +/* + * Many a GCC version also messes this up. + * Zero extending b and then spilling everything to stack. + */ +static inline u64 add_u64_u32(u64 a, u32 b) +{ + return a + b; +} +#endif + #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr diff --git a/lib/math/div64.c b/lib/math/div64.c index 18a9ba26c418..bb57a48ce36a 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -186,33 +186,45 @@ EXPORT_SYMBOL(iter_div_u64_rem); #endif #if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64) -u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) -{ + +#define mul_add(a, b, c) add_u64_u32(mul_u32_u32(a, b), c) + #if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64) +static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) +{ /* native 64x64=128 bits multiplication */ u128 prod = (u128)a * b + c; - u64 n_lo = prod, n_hi = prod >> 64; + + *p_lo = prod; + return prod >> 64; +} #else - /* perform a 64x64=128 bits multiplication manually */ - u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32; +static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) +{ + /* perform a 64x64=128 bits multiplication in 32bit chunks */ u64 x, y, z; /* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */ - x = (u64)a_lo * b_lo + (u32)c; - y = (u64)a_lo * b_hi + (u32)(c >> 32); - y += (u32)(x >> 32); - z = (u64)a_hi * b_hi + (u32)(y >> 32); - y = (u64)a_hi * b_lo + (u32)y; - z += (u32)(y >> 32); - x = (y << 32) + (u32)x; - - u64 n_lo = x, n_hi = z; + x = mul_add(a, b, c); + y = mul_add(a, b >> 32, c >> 32); + y = add_u64_u32(y, x >> 32); + z = mul_add(a >> 32, b >> 32, y >> 32); + y = mul_add(a >> 32, b, y); + *p_lo = (y << 32) + (u32)x; + return add_u64_u32(z, y >> 32); +} #endif +u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) +{ + u64 n_lo, n_hi; + + n_hi = mul_u64_u64_add_u64(&n_lo, a, b, c); + if (!n_hi) return div64_u64(n_lo, d); From d10bb374c41e4c4dced04ae7d2fe2d782a5858a0 Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 5 Nov 2025 20:10:34 +0000 Subject: [PATCH 054/139] lib: mul_u64_u64_div_u64(): optimise the divide code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the bit by bit algorithm with one that generates 16 bits per iteration on 32bit architectures and 32 bits on 64bit ones. On my zen 5 this reduces the time for the tests (using the generic code) from ~3350ns to ~1000ns. Running the 32bit algorithm on 64bit x86 takes ~1500ns. It'll be slightly slower on a real 32bit system, mostly due to register pressure. The savings for 32bit x86 are much higher (tested in userspace). The worst case (lots of bits in the quotient) drops from ~900 clocks to ~130 (pretty much independant of the arguments). Other 32bit architectures may see better savings. It is possibly to optimise for divisors that span less than __LONG_WIDTH__/2 bits. However I suspect they don't happen that often and it doesn't remove any slow cpu divide instructions which dominate the result. Typical improvements for 64bit random divides: old new sandy bridge: 470 150 haswell: 400 144 piledriver: 960 467 I think rdpmc is very slow. zen5: 244 80 (Timing is 'rdpmc; mul_div(); rdpmc' with the multiply depending on the first rdpmc and the second rdpmc depending on the quotient.) Object code (64bit x86 test program): old 0x173 new 0x141. Link: https://lkml.kernel.org/r/20251105201035.64043-9-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Nicolas Pitre Cc: Biju Das Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jens Axboe Cc: Li RongQing Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleinxer Cc: Uwe Kleine-König Signed-off-by: Andrew Morton --- lib/math/div64.c | 120 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 83 insertions(+), 37 deletions(-) diff --git a/lib/math/div64.c b/lib/math/div64.c index bb57a48ce36a..d1e92ea24fce 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -190,7 +190,6 @@ EXPORT_SYMBOL(iter_div_u64_rem); #define mul_add(a, b, c) add_u64_u32(mul_u32_u32(a, b), c) #if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64) - static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) { /* native 64x64=128 bits multiplication */ @@ -199,9 +198,7 @@ static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) *p_lo = prod; return prod >> 64; } - #else - static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) { /* perform a 64x64=128 bits multiplication in 32bit chunks */ @@ -216,12 +213,37 @@ static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) *p_lo = (y << 32) + (u32)x; return add_u64_u32(z, y >> 32); } +#endif +#ifndef BITS_PER_ITER +#define BITS_PER_ITER (__LONG_WIDTH__ >= 64 ? 32 : 16) +#endif + +#if BITS_PER_ITER == 32 +#define mul_u64_long_add_u64(p_lo, a, b, c) mul_u64_u64_add_u64(p_lo, a, b, c) +#define add_u64_long(a, b) ((a) + (b)) +#else +#undef BITS_PER_ITER +#define BITS_PER_ITER 16 +static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c) +{ + u64 n_lo = mul_add(a, b, c); + u64 n_med = mul_add(a >> 32, b, c >> 32); + + n_med = add_u64_u32(n_med, n_lo >> 32); + *p_lo = n_med << 32 | (u32)n_lo; + return n_med >> 32; +} + +#define add_u64_long(a, b) add_u64_u32(a, b) #endif u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) { - u64 n_lo, n_hi; + unsigned long d_msig, q_digit; + unsigned int reps, d_z_hi; + u64 quotient, n_lo, n_hi; + u32 overflow; n_hi = mul_u64_u64_add_u64(&n_lo, a, b, c); @@ -240,46 +262,70 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) return ~0ULL; } - int shift = __builtin_ctzll(d); - - /* try reducing the fraction in case the dividend becomes <= 64 bits */ - if ((n_hi >> shift) == 0) { - u64 n = shift ? (n_lo >> shift) | (n_hi << (64 - shift)) : n_lo; - - return div64_u64(n, d >> shift); - /* - * The remainder value if needed would be: - * res = div64_u64_rem(n, d >> shift, &rem); - * rem = (rem << shift) + (n_lo - (n << shift)); - */ + /* Left align the divisor, shifting the dividend to match */ + d_z_hi = __builtin_clzll(d); + if (d_z_hi) { + d <<= d_z_hi; + n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi); + n_lo <<= d_z_hi; } - /* Do the full 128 by 64 bits division */ + reps = 64 / BITS_PER_ITER; + /* Optimise loop count for small dividends */ + if (!(u32)(n_hi >> 32)) { + reps -= 32 / BITS_PER_ITER; + n_hi = n_hi << 32 | n_lo >> 32; + n_lo <<= 32; + } +#if BITS_PER_ITER == 16 + if (!(u32)(n_hi >> 48)) { + reps--; + n_hi = add_u64_u32(n_hi << 16, n_lo >> 48); + n_lo <<= 16; + } +#endif - shift = __builtin_clzll(d); - d <<= shift; + /* Invert the dividend so we can use add instead of subtract. */ + n_lo = ~n_lo; + n_hi = ~n_hi; - int p = 64 + shift; - u64 res = 0; - bool carry; + /* + * Get the most significant BITS_PER_ITER bits of the divisor. + * This is used to get a low 'guestimate' of the quotient digit. + */ + d_msig = (d >> (64 - BITS_PER_ITER)) + 1; - do { - carry = n_hi >> 63; - shift = carry ? 1 : __builtin_clzll(n_hi); - if (p < shift) - break; - p -= shift; - n_hi <<= shift; - n_hi |= n_lo >> (64 - shift); - n_lo <<= shift; - if (carry || (n_hi >= d)) { - n_hi -= d; - res |= 1ULL << p; + /* + * Now do a 'long division' with BITS_PER_ITER bit 'digits'. + * The 'guess' quotient digit can be low and BITS_PER_ITER+1 bits. + * The worst case is dividing ~0 by 0x8000 which requires two subtracts. + */ + quotient = 0; + while (reps--) { + q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig; + /* Shift 'n' left to align with the product q_digit * d */ + overflow = n_hi >> (64 - BITS_PER_ITER); + n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER)); + n_lo <<= BITS_PER_ITER; + /* Add product to negated divisor */ + overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi); + /* Adjust for the q_digit 'guestimate' being low */ + while (overflow < 0xffffffff >> (32 - BITS_PER_ITER)) { + q_digit++; + n_hi += d; + overflow += n_hi < d; } - } while (n_hi); - /* The remainder value if needed would be n_hi << p */ + quotient = add_u64_long(quotient << BITS_PER_ITER, q_digit); + } - return res; + /* + * The above only ensures the remainder doesn't overflow, + * it can still be possible to add (aka subtract) another copy + * of the divisor. + */ + if ((n_hi + d) > n_hi) + quotient++; + return quotient; } #if !defined(test_mul_u64_add_u64_div_u64) EXPORT_SYMBOL(mul_u64_add_u64_div_u64); From 1d1ef8c1fb5e488c0f68499239d8dc61b1399db9 Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 5 Nov 2025 20:10:35 +0000 Subject: [PATCH 055/139] lib: test_mul_u64_u64_div_u64(): test the 32bit code on 64bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are slight differences in the mul_u64_add_u64_div_u64() code between 32bit and 64bit systems. Compile and test the 32bit version on 64bit hosts for better test coverage. Link: https://lkml.kernel.org/r/20251105201035.64043-10-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Nicolas Pitre Cc: Biju Das Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jens Axboe Cc: Li RongQing Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleinxer Cc: Uwe Kleine-König Signed-off-by: Andrew Morton --- lib/math/test_mul_u64_u64_div_u64.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/lib/math/test_mul_u64_u64_div_u64.c b/lib/math/test_mul_u64_u64_div_u64.c index d8d2c18c4614..338d014f0c73 100644 --- a/lib/math/test_mul_u64_u64_div_u64.c +++ b/lib/math/test_mul_u64_u64_div_u64.c @@ -74,6 +74,10 @@ done */ static u64 test_mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d); +#if __LONG_WIDTH__ >= 64 +#define TEST_32BIT_DIV +static u64 test_mul_u64_add_u64_div_u64_32bit(u64 a, u64 b, u64 c, u64 d); +#endif static int __init test_run(unsigned int fn_no, const char *fn_name) { @@ -100,6 +104,12 @@ static int __init test_run(unsigned int fn_no, const char *fn_name) result = test_mul_u64_add_u64_div_u64(a, b, 0, d); result_up = test_mul_u64_add_u64_div_u64(a, b, d - 1, d); break; +#ifdef TEST_32BIT_DIV + case 2: + result = test_mul_u64_add_u64_div_u64_32bit(a, b, 0, d); + result_up = test_mul_u64_add_u64_div_u64_32bit(a, b, d - 1, d); + break; +#endif } tests += 2; @@ -131,6 +141,10 @@ static int __init test_init(void) return -EINVAL; if (test_run(1, "test_mul_u64_u64_div_u64")) return -EINVAL; +#ifdef TEST_32BIT_DIV + if (test_run(2, "test_mul_u64_u64_div_u64_32bit")) + return -EINVAL; +#endif return 0; } @@ -153,6 +167,21 @@ static void __exit test_exit(void) #include "div64.c" +#ifdef TEST_32BIT_DIV +/* Recompile the generic code for 32bit long */ +#undef test_mul_u64_add_u64_div_u64 +#define test_mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64_32bit +#undef BITS_PER_ITER +#define BITS_PER_ITER 16 + +#define mul_u64_u64_add_u64 mul_u64_u64_add_u64_32bit +#undef mul_u64_long_add_u64 +#undef add_u64_long +#undef mul_add + +#include "div64.c" +#endif + module_init(test_init); module_exit(test_exit); From c9dddd981600538d8c39d58ba89679439231959f Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Wed, 5 Nov 2025 11:20:19 +0100 Subject: [PATCH 056/139] MAINTAINERS: add Pratyush as a reviewer for KHO I have been reviewing most patches for KHO already, and it is easier to spot them if I am directly in Cc. Link: https://lkml.kernel.org/r/20251105102022.18798-1-pratyush@kernel.org Signed-off-by: Pratyush Yadav Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Pasha Tatashin Cc: Alexander Graf Cc: Baoquan He Cc: Pratyush Yadav Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9a42edbe2033..b945431efaf4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13791,6 +13791,7 @@ KEXEC HANDOVER (KHO) M: Alexander Graf M: Mike Rapoport M: Changyuan Lyu +R: Pratyush Yadav L: kexec@lists.infradead.org L: linux-mm@kvack.org S: Maintained From caa71919a622e3f7d290d4f17ae538b15f5cb6d3 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 6 Nov 2025 13:43:41 +0100 Subject: [PATCH 057/139] scripts/gdb/radix-tree: add lx-radix-tree-command Patch series "scripts/gdb/symbols: make BPF debug info available to GDB", v2. This series greatly simplifies debugging BPF progs when using QEMU gdbstub by providing symbol names, sizes, and line numbers to GDB. Patch 1 adds radix tree iteration, which is necessary for parsing prog_idr. Patch 2 is the actual implementation; its description contains some details on how to use this. This patch (of 2): Add a function and a command to iterate over radix tree contents. Duplicate the C implementation in Python, but drop support for tagging. Link: https://lkml.kernel.org/r/20251106124600.86736-1-iii@linux.ibm.com Link: https://lkml.kernel.org/r/20251106124600.86736-2-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkman Cc: Heiko Carstens Cc: Jan Kiszka Cc: Kieran Bingham Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- scripts/gdb/linux/radixtree.py | 139 +++++++++++++++++++++++++++++++-- 1 file changed, 132 insertions(+), 7 deletions(-) diff --git a/scripts/gdb/linux/radixtree.py b/scripts/gdb/linux/radixtree.py index 074543ac763d..bc2954e45c32 100644 --- a/scripts/gdb/linux/radixtree.py +++ b/scripts/gdb/linux/radixtree.py @@ -30,13 +30,16 @@ def entry_to_node(node): def node_maxindex(node): return (constants.LX_RADIX_TREE_MAP_SIZE << node['shift']) - 1 -def lookup(root, index): +def resolve_root(root): + if root.type == radix_tree_root_type.get_type(): + return root if root.type == radix_tree_root_type.get_type().pointer(): - node = root.dereference() - elif root.type != radix_tree_root_type.get_type(): - raise gdb.GdbError("must be {} not {}" - .format(radix_tree_root_type.get_type(), root.type)) + return root.dereference() + raise gdb.GdbError("must be {} not {}" + .format(radix_tree_root_type.get_type(), root.type)) +def lookup(root, index): + root = resolve_root(root) node = root['xa_head'] if node == 0: return None @@ -71,14 +74,120 @@ def lookup(root, index): return node -class LxRadixTree(gdb.Function): +def descend(parent, index): + offset = (index >> int(parent["shift"])) & constants.LX_RADIX_TREE_MAP_MASK + return offset, parent["slots"][offset] + +def load_root(root): + node = root["xa_head"] + nodep = node + + if is_internal_node(node): + node = entry_to_node(node) + maxindex = node_maxindex(node) + return int(node["shift"]) + constants.LX_RADIX_TREE_MAP_SHIFT, \ + nodep, maxindex + + return 0, nodep, 0 + +class RadixTreeIter: + def __init__(self, start): + self.index = 0 + self.next_index = start + self.node = None + +def xa_mk_internal(v): + return (v << 2) | 2 + +LX_XA_RETRY_ENTRY = xa_mk_internal(256) +LX_RADIX_TREE_RETRY = LX_XA_RETRY_ENTRY + +def next_chunk(root, iter): + mask = (1 << (utils.get_ulong_type().sizeof * 8)) - 1 + + index = iter.next_index + if index == 0 and iter.index != 0: + return None + + restart = True + while restart: + restart = False + + _, child, maxindex = load_root(root) + if index > maxindex: + return None + if not child: + return None + + if not is_internal_node(child): + iter.index = index + iter.next_index = (maxindex + 1) & mask + iter.node = None + return root["xa_head"].address + + while True: + node = entry_to_node(child) + offset, child = descend(node, index) + + if not child: + while True: + offset += 1 + if offset >= constants.LX_RADIX_TREE_MAP_SIZE: + break + slot = node["slots"][offset] + if slot: + break + index &= ~node_maxindex(node) + index = (index + (offset << int(node["shift"]))) & mask + if index == 0: + return None + if offset == constants.LX_RADIX_TREE_MAP_SIZE: + restart = True + break + child = node["slots"][offset] + + if not child: + restart = True + break + if child == LX_XA_RETRY_ENTRY: + break + if not node["shift"] or not is_internal_node(child): + break + + iter.index = (index & ~node_maxindex(node)) | offset + iter.next_index = ((index | node_maxindex(node)) + 1) & mask + iter.node = node + + return node["slots"][offset].address + +def next_slot(slot, iter): + mask = (1 << (utils.get_ulong_type().sizeof * 8)) - 1 + for _ in range(iter.next_index - iter.index - 1): + slot += 1 + iter.index = (iter.index + 1) & mask + if slot.dereference(): + return slot + return None + +def for_each_slot(root, start=0): + iter = RadixTreeIter(start) + slot = None + while True: + if not slot: + slot = next_chunk(root, iter) + if not slot: + break + yield iter.index, slot + slot = next_slot(slot, iter) + +class LxRadixTreeLookup(gdb.Function): """ Lookup and return a node from a RadixTree. $lx_radix_tree_lookup(root_node [, index]): Return the node at the given index. If index is omitted, the root node is dereference and returned.""" def __init__(self): - super(LxRadixTree, self).__init__("lx_radix_tree_lookup") + super(LxRadixTreeLookup, self).__init__("lx_radix_tree_lookup") def invoke(self, root, index=0): result = lookup(root, index) @@ -87,4 +196,20 @@ If index is omitted, the root node is dereference and returned.""" return result +class LxRadixTree(gdb.Command): + """Show all values stored in a RadixTree.""" + + def __init__(self): + super(LxRadixTree, self).__init__("lx-radix-tree", gdb.COMMAND_DATA, + gdb.COMPLETE_NONE) + + def invoke(self, argument, from_tty): + args = gdb.string_to_argv(argument) + if len(args) != 1: + raise gdb.GdbError("Usage: lx-radix-tree ROOT") + root = gdb.parse_and_eval(args[0]) + for index, slot in for_each_slot(root): + gdb.write("[{}] = {}\n".format(index, slot.dereference())) + LxRadixTree() +LxRadixTreeLookup() From 581ee79a254759ea8288057f762389820b39adcc Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 6 Nov 2025 13:43:42 +0100 Subject: [PATCH 058/139] scripts/gdb/symbols: make BPF debug info available to GDB One can debug BPF programs with QEMU gdbstub by setting a breakpoint on bpf_prog_kallsyms_add(), waiting for a hit with a matching aux.name, and then setting a breakpoint on bpf_func. This is tedious, error-prone, and also lacks line numbers. Automate this in a way similar to the existing support for modules in lx-symbols. Enumerate and monitor changes to both BPF kallsyms and JITed progs. For each ksym, generate and compile a synthetic .s file containing the name, code, and size. In addition, if this ksym is also a prog, and not a trampoline, add line number information. Ensure that this is a no-op if the kernel is built without BPF support or if "as" is missing. In theory the "as" dependency may be dropped by generating the synthetic .o file manually, but this is too much complexity for too little benefit. Now one can debug BPF progs out of the box like this: (gdb) lx-symbols -bpf (gdb) b bpf_prog_4e612a6a881a086b_arena_list_add Breakpoint 2 (bpf_prog_4e612a6a881a086b_arena_list_add) pending. # ./test_progs -t arena_list Thread 4 hit Breakpoint 2, bpf_prog_4e612a6a881a086b_arena_list_add () at linux/tools/testing/selftests/bpf/progs/arena_list.c:51 51 list_head = &global_head; (gdb) n bpf_prog_4e612a6a881a086b_arena_list_add () at linux/tools/testing/selftests/bpf/progs/arena_list.c:53 53 for (i = zero; i < cnt && can_loop; i++) { This also works for subprogs. Link: https://lkml.kernel.org/r/20251106124600.86736-3-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkman Cc: Heiko Carstens Cc: Jan Kiszka Cc: Kieran Bingham Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- scripts/gdb/linux/bpf.py | 253 ++++++++++++++++++++++++++++++ scripts/gdb/linux/constants.py.in | 3 + scripts/gdb/linux/symbols.py | 105 +++++++++++-- 3 files changed, 349 insertions(+), 12 deletions(-) create mode 100644 scripts/gdb/linux/bpf.py diff --git a/scripts/gdb/linux/bpf.py b/scripts/gdb/linux/bpf.py new file mode 100644 index 000000000000..1870534ef6f9 --- /dev/null +++ b/scripts/gdb/linux/bpf.py @@ -0,0 +1,253 @@ +# SPDX-License-Identifier: GPL-2.0 + +import json +import subprocess +import tempfile + +import gdb + +from linux import constants, lists, radixtree, utils + + +if constants.LX_CONFIG_BPF and constants.LX_CONFIG_BPF_JIT: + bpf_ksym_type = utils.CachedType("struct bpf_ksym") +if constants.LX_CONFIG_BPF_SYSCALL: + bpf_prog_type = utils.CachedType("struct bpf_prog") + + +def get_ksym_name(ksym): + name = ksym["name"].bytes + end = name.find(b"\x00") + if end != -1: + name = name[:end] + return name.decode() + + +def list_ksyms(): + if not (constants.LX_CONFIG_BPF and constants.LX_CONFIG_BPF_JIT): + return [] + bpf_kallsyms = gdb.parse_and_eval("&bpf_kallsyms") + bpf_ksym_ptr_type = bpf_ksym_type.get_type().pointer() + return list(lists.list_for_each_entry(bpf_kallsyms, + bpf_ksym_ptr_type, + "lnode")) + + +class KsymAddBreakpoint(gdb.Breakpoint): + def __init__(self, monitor): + super(KsymAddBreakpoint, self).__init__("bpf_ksym_add", internal=True) + self.silent = True + self.monitor = monitor + + def stop(self): + self.monitor.add(gdb.parse_and_eval("ksym")) + return False + + +class KsymRemoveBreakpoint(gdb.Breakpoint): + def __init__(self, monitor): + super(KsymRemoveBreakpoint, self).__init__("bpf_ksym_del", + internal=True) + self.silent = True + self.monitor = monitor + + def stop(self): + self.monitor.remove(gdb.parse_and_eval("ksym")) + return False + + +class KsymMonitor: + def __init__(self, add, remove): + self.add = add + self.remove = remove + + self.add_bp = KsymAddBreakpoint(self) + self.remove_bp = KsymRemoveBreakpoint(self) + + self.notify_initial() + + def notify_initial(self): + for ksym in list_ksyms(): + self.add(ksym) + + def delete(self): + self.add_bp.delete() + self.remove_bp.delete() + + +def list_progs(): + if not constants.LX_CONFIG_BPF_SYSCALL: + return [] + idr_rt = gdb.parse_and_eval("&prog_idr.idr_rt") + bpf_prog_ptr_type = bpf_prog_type.get_type().pointer() + progs = [] + for _, slot in radixtree.for_each_slot(idr_rt): + prog = slot.dereference().cast(bpf_prog_ptr_type) + progs.append(prog) + # Subprogs are not registered in prog_idr, fetch them manually. + # func[0] is the current prog. + aux = prog["aux"] + func = aux["func"] + real_func_cnt = int(aux["real_func_cnt"]) + for i in range(1, real_func_cnt): + progs.append(func[i]) + return progs + + +class ProgAddBreakpoint(gdb.Breakpoint): + def __init__(self, monitor): + super(ProgAddBreakpoint, self).__init__("bpf_prog_kallsyms_add", + internal=True) + self.silent = True + self.monitor = monitor + + def stop(self): + self.monitor.add(gdb.parse_and_eval("fp")) + return False + + +class ProgRemoveBreakpoint(gdb.Breakpoint): + def __init__(self, monitor): + super(ProgRemoveBreakpoint, self).__init__("bpf_prog_free_id", + internal=True) + self.silent = True + self.monitor = monitor + + def stop(self): + self.monitor.remove(gdb.parse_and_eval("prog")) + return False + + +class ProgMonitor: + def __init__(self, add, remove): + self.add = add + self.remove = remove + + self.add_bp = ProgAddBreakpoint(self) + self.remove_bp = ProgRemoveBreakpoint(self) + + self.notify_initial() + + def notify_initial(self): + for prog in list_progs(): + self.add(prog) + + def delete(self): + self.add_bp.delete() + self.remove_bp.delete() + + +def btf_str_by_offset(btf, offset): + while offset < btf["start_str_off"]: + btf = btf["base_btf"] + + offset -= btf["start_str_off"] + if offset < btf["hdr"]["str_len"]: + return (btf["strings"] + offset).string() + + return None + + +def bpf_line_info_line_num(line_col): + return line_col >> 10 + + +def bpf_line_info_line_col(line_col): + return line_col & 0x3ff + + +class LInfoIter: + def __init__(self, prog): + # See bpf_prog_get_file_line() for details. + self.pos = 0 + self.nr_linfo = 0 + + if prog is None: + return + + self.bpf_func = int(prog["bpf_func"]) + aux = prog["aux"] + self.btf = aux["btf"] + linfo_idx = aux["linfo_idx"] + self.nr_linfo = int(aux["nr_linfo"]) - linfo_idx + if self.nr_linfo == 0: + return + + linfo_ptr = aux["linfo"] + tpe = linfo_ptr.type.target().array(self.nr_linfo).pointer() + self.linfo = (linfo_ptr + linfo_idx).cast(tpe).dereference() + jited_linfo_ptr = aux["jited_linfo"] + tpe = jited_linfo_ptr.type.target().array(self.nr_linfo).pointer() + self.jited_linfo = (jited_linfo_ptr + linfo_idx).cast(tpe).dereference() + + self.filenos = {} + + def get_code_off(self): + if self.pos >= self.nr_linfo: + return -1 + return self.jited_linfo[self.pos] - self.bpf_func + + def advance(self): + self.pos += 1 + + def get_fileno(self): + file_name_off = int(self.linfo[self.pos]["file_name_off"]) + fileno = self.filenos.get(file_name_off) + if fileno is not None: + return fileno, None + file_name = btf_str_by_offset(self.btf, file_name_off) + fileno = len(self.filenos) + 1 + self.filenos[file_name_off] = fileno + return fileno, file_name + + def get_line_col(self): + line_col = int(self.linfo[self.pos]["line_col"]) + return bpf_line_info_line_num(line_col), \ + bpf_line_info_line_col(line_col) + + +def generate_debug_obj(ksym, prog): + name = get_ksym_name(ksym) + # Avoid read_memory(); it throws bogus gdb.MemoryError in some contexts. + start = ksym["start"] + code = start.cast(gdb.lookup_type("unsigned char") + .array(int(ksym["end"]) - int(start)) + .pointer()).dereference().bytes + linfo_iter = LInfoIter(prog) + + result = tempfile.NamedTemporaryFile(suffix=".o", mode="wb") + try: + with tempfile.NamedTemporaryFile(suffix=".s", mode="w") as src: + # ".loc" does not apply to ".byte"s, only to ".insn"s, but since + # this needs to work for all architectures, the latter are not an + # option. Ask the assembler to apply ".loc"s to labels as well, + # and generate dummy labels after each ".loc". + src.write(".loc_mark_labels 1\n") + + src.write(".globl {}\n".format(name)) + src.write(".type {},@function\n".format(name)) + src.write("{}:\n".format(name)) + for code_off, code_byte in enumerate(code): + if linfo_iter.get_code_off() == code_off: + fileno, file_name = linfo_iter.get_fileno() + if file_name is not None: + src.write(".file {} {}\n".format( + fileno, json.dumps(file_name))) + line, col = linfo_iter.get_line_col() + src.write(".loc {} {} {}\n".format(fileno, line, col)) + src.write("0:\n") + linfo_iter.advance() + src.write(".byte {}\n".format(code_byte)) + src.write(".size {},{}\n".format(name, len(code))) + src.flush() + + try: + subprocess.check_call(["as", "-c", src.name, "-o", result.name]) + except FileNotFoundError: + # "as" is not installed. + result.close() + return None + return result + except: + result.close() + raise diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index c3886739a028..6d475540c6ba 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -170,3 +170,6 @@ LX_CONFIG(CONFIG_PAGE_OWNER) LX_CONFIG(CONFIG_SLUB_DEBUG) LX_CONFIG(CONFIG_SLAB_FREELIST_HARDENED) LX_CONFIG(CONFIG_MMU) +LX_CONFIG(CONFIG_BPF) +LX_CONFIG(CONFIG_BPF_JIT) +LX_CONFIG(CONFIG_BPF_SYSCALL) diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 6edb99221675..d4308b726183 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -11,13 +11,14 @@ # This work is licensed under the terms of the GNU GPL version 2. # +import atexit import gdb import os import re import struct from itertools import count -from linux import modules, utils, constants +from linux import bpf, constants, modules, utils if hasattr(gdb, 'Breakpoint'): @@ -114,17 +115,27 @@ class LxSymbols(gdb.Command): The kernel (vmlinux) is taken from the current working directly. Modules (.ko) are scanned recursively, starting in the same directory. Optionally, the module search path can be extended by a space separated list of paths passed to the -lx-symbols command.""" +lx-symbols command. + +When the -bpf flag is specified, symbols from the currently loaded BPF programs +are loaded as well.""" module_paths = [] module_files = [] module_files_updated = False loaded_modules = [] breakpoint = None + bpf_prog_monitor = None + bpf_ksym_monitor = None + bpf_progs = {} + # The remove-symbol-file command, even when invoked with -a, requires the + # respective object file to exist, so keep them around. + bpf_debug_objs = {} def __init__(self): super(LxSymbols, self).__init__("lx-symbols", gdb.COMMAND_FILES, gdb.COMPLETE_FILENAME) + atexit.register(self.cleanup_bpf) def _update_module_files(self): self.module_files = [] @@ -197,6 +208,51 @@ lx-symbols command.""" else: gdb.write("no module object found for '{0}'\n".format(module_name)) + def add_bpf_prog(self, prog): + if prog["jited"]: + self.bpf_progs[int(prog["bpf_func"])] = prog + + def remove_bpf_prog(self, prog): + self.bpf_progs.pop(int(prog["bpf_func"]), None) + + def add_bpf_ksym(self, ksym): + addr = int(ksym["start"]) + name = bpf.get_ksym_name(ksym) + with utils.pagination_off(): + gdb.write("loading @{addr}: {name}\n".format( + addr=hex(addr), name=name)) + debug_obj = bpf.generate_debug_obj(ksym, self.bpf_progs.get(addr)) + if debug_obj is None: + return + try: + cmdline = "add-symbol-file {obj} {addr}".format( + obj=debug_obj.name, addr=hex(addr)) + gdb.execute(cmdline, to_string=True) + except: + debug_obj.close() + raise + self.bpf_debug_objs[addr] = debug_obj + + def remove_bpf_ksym(self, ksym): + addr = int(ksym["start"]) + debug_obj = self.bpf_debug_objs.pop(addr, None) + if debug_obj is None: + return + try: + name = bpf.get_ksym_name(ksym) + gdb.write("unloading @{addr}: {name}\n".format( + addr=hex(addr), name=name)) + cmdline = "remove-symbol-file {path}".format(path=debug_obj.name) + gdb.execute(cmdline, to_string=True) + finally: + debug_obj.close() + + def cleanup_bpf(self): + self.bpf_progs = {} + while len(self.bpf_debug_objs) > 0: + self.bpf_debug_objs.popitem()[1].close() + + def load_all_symbols(self): gdb.write("loading vmlinux\n") @@ -224,34 +280,59 @@ lx-symbols command.""" else: [self.load_module_symbols(module) for module in module_list] + self.cleanup_bpf() + if self.bpf_prog_monitor is not None: + self.bpf_prog_monitor.notify_initial() + if self.bpf_ksym_monitor is not None: + self.bpf_ksym_monitor.notify_initial() + for saved_state in saved_states: saved_state['breakpoint'].enabled = saved_state['enabled'] def invoke(self, arg, from_tty): skip_decompressor() - self.module_paths = [os.path.abspath(os.path.expanduser(p)) - for p in arg.split()] + monitor_bpf = False + self.module_paths = [] + for p in arg.split(): + if p == "-bpf": + monitor_bpf = True + else: + p.append(os.path.abspath(os.path.expanduser(p))) self.module_paths.append(os.getcwd()) + if self.breakpoint is not None: + self.breakpoint.delete() + self.breakpoint = None + if self.bpf_prog_monitor is not None: + self.bpf_prog_monitor.delete() + self.bpf_prog_monitor = None + if self.bpf_ksym_monitor is not None: + self.bpf_ksym_monitor.delete() + self.bpf_ksym_monitor = None + # enforce update self.module_files = [] self.module_files_updated = False self.load_all_symbols() - if not modules.has_modules(): + if not hasattr(gdb, 'Breakpoint'): + gdb.write("Note: symbol update on module and BPF loading not " + "supported with this gdb version\n") return - if hasattr(gdb, 'Breakpoint'): - if self.breakpoint is not None: - self.breakpoint.delete() - self.breakpoint = None + if modules.has_modules(): self.breakpoint = LoadModuleBreakpoint( "kernel/module/main.c:do_init_module", self) - else: - gdb.write("Note: symbol update on module loading not supported " - "with this gdb version\n") + + if monitor_bpf: + if constants.LX_CONFIG_BPF_SYSCALL: + self.bpf_prog_monitor = bpf.ProgMonitor(self.add_bpf_prog, + self.remove_bpf_prog) + if constants.LX_CONFIG_BPF and constants.LX_CONFIG_BPF_JIT: + self.bpf_ksym_monitor = bpf.KsymMonitor(self.add_bpf_ksym, + self.remove_bpf_ksym) LxSymbols() From f3fb126fdc9e148da38a6e25d7fc609774a99fc3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 6 Nov 2025 16:20:51 +0100 Subject: [PATCH 059/139] math.h: amend abs() kernel-doc and add a note about signed type limits - amend the kernel-doc so the description is decoupled from the parameter descriptions. - add a note to explain behaviour for the signed types when supplied value is the minimum (e.g., INT_MIN for int type). Link: https://lkml.kernel.org/r/20251106152051.2361551-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Jonathan Cameron Signed-off-by: Andrew Morton --- include/linux/math.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/include/linux/math.h b/include/linux/math.h index 0198c92cbe3e..6dc1d1d32fbc 100644 --- a/include/linux/math.h +++ b/include/linux/math.h @@ -148,11 +148,16 @@ __STRUCT_FRACT(u32) /** * abs - return absolute value of an argument - * @x: the value. If it is unsigned type, it is converted to signed type first. - * char is treated as if it was signed (regardless of whether it really is) - * but the macro's return type is preserved as char. + * @x: the value. * - * Return: an absolute value of x. + * If it is unsigned type, @x is converted to signed type first. + * char is treated as if it was signed (regardless of whether it really is) + * but the macro's return type is preserved as char. + * + * NOTE, for signed type if @x is the minimum, the returned result is undefined + * as there is not enough bits to represent it as a positive number. + * + * Return: an absolute value of @x. */ #define abs(x) __abs_choose_expr(x, long long, \ __abs_choose_expr(x, long, \ From 14954cd190e8cd5664a25984179b1a977615c9ed Mon Sep 17 00:00:00 2001 From: zhang jiao Date: Thu, 6 Nov 2025 09:07:34 +0800 Subject: [PATCH 060/139] fs/proc/page: remove unused KPMBITS KPMBITS is never referenced in the code. Just remove it. Link: https://lkml.kernel.org/r/20251106010735.1603-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: zhang jiao Reviewed-by: Zi Yan Cc: David Hildenbrand Cc: Liu Ye Cc: Luiz Capitulino Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- fs/proc/page.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/proc/page.c b/fs/proc/page.c index fc64f23e05e5..f9b2c2c906cd 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -20,7 +20,6 @@ #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) -#define KPMBITS (KPMSIZE * BITS_PER_BYTE) enum kpage_operation { KPAGE_FLAGS, From 1ab980e90ce31f61067eae11772670f1dcdf5a8f Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 8 Nov 2025 00:32:49 +0900 Subject: [PATCH 061/139] MAINTAINERS: update nilfs2 entry Viacheslav has kindly offered to help with the maintenance of nilfs2 by upstreaming patches, similar to the HFS/HFS+ tree. I've accepted his offer, and will therefore add him as a co-maintainer and switch the project's git tree for that role. At the same time, change the outdated status field to Maintained to reflect the current state. Link: https://lkml.kernel.org/r/20251107153530.9023-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Acked-by: Viacheslav Dubeyko Signed-off-by: Andrew Morton --- MAINTAINERS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index b945431efaf4..5de1ae30dc40 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18258,10 +18258,11 @@ F: net/sunrpc/ NILFS2 FILESYSTEM M: Ryusuke Konishi +M: Viacheslav Dubeyko L: linux-nilfs@vger.kernel.org -S: Supported +S: Maintained W: https://nilfs.sourceforge.io/ -T: git https://github.com/konis/nilfs2.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/nilfs2.git F: Documentation/filesystems/nilfs2.rst F: fs/nilfs2/ F: include/trace/events/nilfs2.h From 242b872239f6a7deacbc20ab9406ea40cb738ec6 Mon Sep 17 00:00:00 2001 From: Xie Yuanbin Date: Sun, 9 Nov 2025 16:37:15 +0800 Subject: [PATCH 062/139] include/linux/once_lite.h: fix judgment in WARN_ONCE with clang For c code: ```c extern int xx; void test(void) { if (WARN_ONCE(xx, "x")) __asm__ volatile ("nop":::); } ``` Clang will generate the following assembly code: ```assemble test: movl xx(%rip), %eax // Assume xx == 0 (likely case) testl %eax, %eax // judge once je .LBB0_3 // jump to .LBB0_3 testb $1, test.__already_done(%rip) je .LBB0_2 .LBB0_3: testl %eax, %eax // judge again je .LBB0_5 // jump to .LBB0_5 .LBB0_4: nop .LBB0_5: retq // omit ``` In the above code, `xx == 0` should be a likely case, but in this case, xx has been judged twice. Test info: 1. kernel source: linux-next commit 9c0826a5d9aa4d52206d ("Add linux-next specific files for 20251107") 2. compiler: clang: Debian clang version 21.1.4 (8) with Debian LLD 21.1.4 (compatible with GNU linkers) 3. config: base on default x86_64_defconfig, and setting: CONFIG_MITIGATION_RETHUNK=n CONFIG_STACKPROTECTOR=n Add unlikely to __ret_cond to help the compiler optimize correctly. [akpm@linux-foundation.org: undo whitespace changes] Link: https://lkml.kernel.org/r/20251109083715.24495-1-qq570070308@gmail.com Signed-off-by: Xie Yuanbin Cc: Bill Wendling Cc: Jan Kara Cc: Justin Stitt Cc: Maninder Singh Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/once_lite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/once_lite.h b/include/linux/once_lite.h index 27de7bc32a06..236592c4eeb1 100644 --- a/include/linux/once_lite.h +++ b/include/linux/once_lite.h @@ -16,7 +16,7 @@ bool __ret_cond = !!(condition); \ bool __ret_once = false; \ \ - if (unlikely(__ret_cond && !__already_done)) { \ + if (unlikely(__ret_cond) && unlikely(!__already_done)) {\ __already_done = true; \ __ret_once = true; \ } \ From b50144900a56404fb784c9fd6cddeeee4093b383 Mon Sep 17 00:00:00 2001 From: Gustavo Padovan Date: Wed, 12 Nov 2025 10:43:30 -0300 Subject: [PATCH 063/139] MAINTAINERS: remove Gustavo from sync framework I haven't been involved in the work anymore for some time. It is only fair that I remove myself from it and let other continue to take care of it. Link: https://lkml.kernel.org/r/20251112134330.64130-1-gustavo.padovan@collabora.com Signed-off-by: Gustavo Padovan Cc: Sumit Semwal Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5de1ae30dc40..a56bc68ab369 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24826,7 +24826,6 @@ F: drivers/regulator/sy8106a-regulator.c SYNC FILE FRAMEWORK M: Sumit Semwal -R: Gustavo Padovan L: linux-media@vger.kernel.org L: dri-devel@lists.freedesktop.org S: Maintained From 9ab38c5216634d8adb22156ddbd32f2d195b27a7 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Thu, 13 Nov 2025 19:34:13 +0000 Subject: [PATCH 064/139] Revert "lib/plist.c: enforce memory ordering in plist_check_list" This reverts commit 7abcb84f953df037d40fad66f2109db318dd155b. The introduction of WRITE_ONCE() calls for the 'prev' and 'next' variables inside plist_check_list() was a misapplication. WRITE_ONCE() is fundamentally a compiler barrier designed to prevent compiler optimizations (like caching or reordering) on shared memory locations. However, the variables 'prev' and 'next' are local, stack-allocated pointers accessed only by the current thread's invocation of the function. Since these pointers are thread-local and are never accessed concurrently, applying WRITE_ONCE() to them is semantically incorrect and unnecessary. Furthermore, the use of WRITE_ONCE() on local variables prevents the compiler from performing standard optimizations, such as keeping these variables cached solely in CPU registers throughout the loop, potentially introducing performance overhead. Restore the conventional C assignment for local loop variables, allowing the compiler to generate optimal code. Link: https://lkml.kernel.org/r/20251113193413.499309-1-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: I Hsin Cheng Cc: Ingo Molnar Signed-off-by: Andrew Morton --- lib/plist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/plist.c b/lib/plist.c index 330febb4bd7d..ba677c31e8f3 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -47,8 +47,8 @@ static void plist_check_list(struct list_head *top) plist_check_prev_next(top, prev, next); while (next != top) { - WRITE_ONCE(prev, next); - WRITE_ONCE(next, prev->next); + prev = next; + next = prev->next; plist_check_prev_next(top, prev, next); } } From 5f264c00b669b934300dff506d0aa9f6f8f8c53e Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 13 Nov 2025 19:10:36 +0800 Subject: [PATCH 065/139] docs: panic: correct some sys_ifo names in sysctl doc Patch series "Enable hung_task and lockup cases to dump system info on demand", v2. When working on kernel stability issues: panic, task-hung and soft/hard lockup are frequently met. And to debug them, user may need lots of system information at that time, like task call stacks, lock info, memory info, ftrace dump, etc. panic case already uses sys_info() for this purpose, and has a 'panic_sys_info' sysctl(also support cmdline setup) interface to take human readable string like "tasks,mem,timers,locks,ftrace,..." to control what kinds of information is needed. Which is also helpful to debug task-hung and lockup cases. So this patchset introduces the similar sys_info sysctl interface for task-hung and lockup cases. his is mainly for debugging and the info dumping could be intrusive, like dumping call stack for all tasks when system has huge number of tasks, similarly for ftrace dump (we may add tracing_stop() and tracing_start() around it) Locally these have been used in our bug chasing for stability issues and were helpful. As Andrew suggested, add a configurable global 'kernel_sys_info' knob. When error scenarios like panic/hung-task/lockup etc doesn't setup their own sys_info knob and calls sys_info() with parameter "0", this global knob will take effect. It could be used for other kernel cases like OOM, which may not need one dedicated sys_info knob. This patch (of 4): Some sys_info names wered forgotten to change in patch iterations, while the right names are defined in kernel/sys_info.c. Link: https://lkml.kernel.org/r/20251113111039.22701-1-feng.tang@linux.alibaba.com Link: https://lkml.kernel.org/r/20251113111039.22701-2-feng.tang@linux.alibaba.com Fixes: d747755917bf ("panic: add 'panic_sys_info' sysctl to take human readable string parameter") Signed-off-by: Feng Tang Reviewed-by: Petr Mladek Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 0065a55bc09e..a397eeccaea7 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -911,8 +911,8 @@ to 'panic_print'. Possible values are: ============= =================================================== tasks print all tasks info mem print system memory info -timer print timers info -lock print locks info if CONFIG_LOCKDEP is on +timers print timers info +locks print locks info if CONFIG_LOCKDEP is on ftrace print ftrace buffer all_bt print all CPUs backtrace (if available in the arch) blocked_tasks print only tasks in uninterruptible (blocked) state From 8b2b9b4f6f4f7a61b7e323479ed7d9faa21d6287 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 13 Nov 2025 19:10:37 +0800 Subject: [PATCH 066/139] hung_task: add hung_task_sys_info sysctl to dump sys info on task-hung When task-hung happens, developers may need different kinds of system information (call-stacks, memory info, locks, etc.) to help debugging. Add 'hung_task_sys_info' sysctl knob to take human readable string like "tasks,mem,timers,locks,ftrace,...", and when task-hung happens, all requested information will be dumped. (refer kernel/sys_info.c for more details). Meanwhile, the newly introduced sys_info() call is used to unify some existing info-dumping knobs. [feng.tang@linux.alibaba.com: maintain consistecy established behavior, per Lance and Petr] Link: https://lkml.kernel.org/r/aRncJo1mA5Zk77Hr@U-2FWC9VHC-2323.local Link: https://lkml.kernel.org/r/20251113111039.22701-3-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Reviewed-by: Lance Yang Cc: Jonathan Corbet Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 5 +++ kernel/hung_task.c | 40 ++++++++++++++------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index a397eeccaea7..45b4408dad31 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -422,6 +422,11 @@ the system boot. This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled. +hung_task_sys_info +================== +A comma separated list of extra system information to be dumped when +hung task is detected, for example, "tasks,mem,timers,locks,...". +Refer 'panic_sys_info' section below for more details. hung_task_timeout_secs ====================== diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 5ac0e66a1361..d2254c91450b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -59,12 +60,17 @@ static unsigned long __read_mostly sysctl_hung_task_check_interval_secs; static int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; -static bool hung_task_show_lock; static bool hung_task_call_panic; -static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +/* + * A bitmask to control what kinds of system info to be printed when + * a hung task is detected, it could be task, memory, lock etc. Refer + * include/linux/sys_info.h for detailed bit definition. + */ +static unsigned long hung_task_si_mask; + #ifdef CONFIG_SMP /* * Should we dump all CPUs backtraces in a hung task event? @@ -236,7 +242,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout, if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) { console_verbose(); - hung_task_show_lock = true; hung_task_call_panic = true; } @@ -259,10 +264,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout, " disables this message.\n"); sched_show_task(t); debug_show_blocker(t, timeout); - hung_task_show_lock = true; - if (sysctl_hung_task_all_cpu_backtrace) - hung_task_show_all_bt = true; if (!sysctl_hung_task_warnings) pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n"); } @@ -304,6 +306,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) unsigned long last_break = jiffies; struct task_struct *g, *t; unsigned long prev_detect_count = sysctl_hung_task_detect_count; + int need_warning = sysctl_hung_task_warnings; + unsigned long si_mask = hung_task_si_mask; /* * If the system crashed already then all bets are off, @@ -312,7 +316,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; - hung_task_show_lock = false; + rcu_read_lock(); for_each_process_thread(g, t) { @@ -328,14 +332,19 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); - if (hung_task_show_lock) - debug_show_all_locks(); - if (hung_task_show_all_bt) { - hung_task_show_all_bt = false; - trigger_all_cpu_backtrace(); + if (!(sysctl_hung_task_detect_count - prev_detect_count)) + return; + + if (need_warning || hung_task_call_panic) { + si_mask |= SYS_INFO_LOCKS; + + if (sysctl_hung_task_all_cpu_backtrace) + si_mask |= SYS_INFO_ALL_BT; } + sys_info(si_mask); + if (hung_task_call_panic) panic("hung_task: blocked tasks"); } @@ -434,6 +443,13 @@ static const struct ctl_table hung_task_sysctls[] = { .mode = 0444, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "hung_task_sys_info", + .data = &hung_task_si_mask, + .maxlen = sizeof(hung_task_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, }; static void __init hung_task_sysctl_init(void) From a9af76a78760717361cccc884dc649e30db61c8b Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 13 Nov 2025 19:10:38 +0800 Subject: [PATCH 067/139] watchdog: add sys_info sysctls to dump sys info on system lockup When soft/hard lockup happens, developers may need different kinds of system information (call-stacks, memory info, locks, etc.) to help debugging. Add 'softlockup_sys_info' and 'hardlockup_sys_info' sysctl knobs to take human readable string like "tasks,mem,timers,locks,ftrace,...", and when system lockup happens, all requested information will be printed out. (refer kernel/sys_info.c for more details). Link: https://lkml.kernel.org/r/20251113111039.22701-4-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Reviewed-by: Petr Mladek Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Petr Mladek Cc: Steven Rostedt Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 5 +++ kernel/watchdog.c | 44 +++++++++++++++++++-- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 45b4408dad31..176520283f1a 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -582,6 +582,11 @@ if leaking kernel pointer values to unprivileged users is a concern. When ``kptr_restrict`` is set to 2, kernel pointers printed using %pK will be replaced with 0s regardless of privileges. +softlockup_sys_info & hardlockup_sys_info +========================================= +A comma separated list of extra system information to be dumped when +soft/hard lockup is detected, for example, "tasks,mem,timers,locks,...". +Refer 'panic_sys_info' section below for more details. modprobe ======== diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5b62d1002783..1f59b950c475 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,13 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; unsigned int __read_mostly hardlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); +/* + * bitmasks to control what kinds of system info to be printed when + * hard lockup is detected, it could be task, memory, lock etc. + * Refer include/linux/sys_info.h for detailed bit definition. + */ +static unsigned long hardlockup_si_mask; + #ifdef CONFIG_SYSFS static unsigned int hardlockup_count; @@ -178,11 +186,15 @@ static void watchdog_hardlockup_kick(void) void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { + int hardlockup_all_cpu_backtrace; + if (per_cpu(watchdog_hardlockup_touched, cpu)) { per_cpu(watchdog_hardlockup_touched, cpu) = false; return; } + hardlockup_all_cpu_backtrace = (hardlockup_si_mask & SYS_INFO_ALL_BT) ? + 1 : sysctl_hardlockup_all_cpu_backtrace; /* * Check for a hardlockup by making sure the CPU's timer * interrupt is incrementing. The timer interrupt should have @@ -205,7 +217,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) * Prevent multiple hard-lockup reports if one cpu is already * engaged in dumping all cpu back traces. */ - if (sysctl_hardlockup_all_cpu_backtrace) { + if (hardlockup_all_cpu_backtrace) { if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) return; } @@ -234,12 +246,13 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) trigger_single_cpu_backtrace(cpu); } - if (sysctl_hardlockup_all_cpu_backtrace) { + if (hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); if (!hardlockup_panic) clear_bit_unlock(0, &hard_lockup_nmi_warn); } + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); @@ -330,6 +343,13 @@ static void lockup_detector_update_enable(void) int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif +/* + * bitmasks to control what kinds of system info to be printed when + * soft lockup is detected, it could be task, memory, lock etc. + * Refer include/linux/sys_info.h for detailed bit definition. + */ +static unsigned long softlockup_si_mask; + static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ @@ -746,7 +766,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); int duration; - int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; + int softlockup_all_cpu_backtrace; unsigned long flags; if (!watchdog_enabled) @@ -758,6 +778,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (panic_in_progress()) return HRTIMER_NORESTART; + softlockup_all_cpu_backtrace = (softlockup_si_mask & SYS_INFO_ALL_BT) ? + 1 : sysctl_softlockup_all_cpu_backtrace; + watchdog_hardlockup_kick(); /* kick the softlockup detector */ @@ -846,6 +869,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); + sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT); if (softlockup_panic) panic("softlockup: hung tasks"); } @@ -1197,6 +1221,13 @@ static const struct ctl_table watchdog_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "softlockup_sys_info", + .data = &softlockup_si_mask, + .maxlen = sizeof(softlockup_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, #ifdef CONFIG_SMP { .procname = "softlockup_all_cpu_backtrace", @@ -1219,6 +1250,13 @@ static const struct ctl_table watchdog_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "hardlockup_sys_info", + .data = &hardlockup_si_mask, + .maxlen = sizeof(hardlockup_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, #ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", From 03ef32d665e8a23d7ce5965b8b035666cfb47866 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 13 Nov 2025 19:10:39 +0800 Subject: [PATCH 068/139] sys_info: add a default kernel sys_info mask Which serves as a global default sys_info mask. When users want the same system information for many error cases (panic, hung, lockup ...), they can chose to set this global knob only once, while not setting up each individual sys_info knobs. This just adds a 'lazy' option, and doesn't change existing kernel behavior as the mask is 0 by default. Link: https://lkml.kernel.org/r/20251113111039.22701-5-feng.tang@linux.alibaba.com Suggested-by: Andrew Morton Signed-off-by: Feng Tang Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Petr Mladek Cc: Steven Rostedt Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 9 ++++++ lib/sys_info.c | 31 ++++++++++++++++++++- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 176520283f1a..239da22c4e28 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -521,6 +521,15 @@ default), only processes with the CAP_SYS_ADMIN capability may create io_uring instances. +kernel_sys_info +=============== +A comma separated list of extra system information to be dumped when +soft/hard lockup is detected, for example, "tasks,mem,timers,locks,...". +Refer 'panic_sys_info' section below for more details. + +It serves as the default kernel control knob, which will take effect +when a kernel module calls sys_info() with parameter==0. + kexec_load_disabled =================== diff --git a/lib/sys_info.c b/lib/sys_info.c index 323624093e54..f32a06ec9ed4 100644 --- a/lib/sys_info.c +++ b/lib/sys_info.c @@ -24,6 +24,13 @@ static const char * const si_names[] = { [ilog2(SYS_INFO_BLOCKED_TASKS)] = "blocked_tasks", }; +/* + * Default kernel sys_info mask. + * If a kernel module calls sys_info() with "parameter == 0", then + * this mask will be used. + */ +static unsigned long kernel_si_mask; + /* Expecting string like "xxx_sys_info=tasks,mem,timers,locks,ftrace,..." */ unsigned long sys_info_parse_param(char *str) { @@ -110,9 +117,26 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, else return sys_info_read_handler(&table, buffer, lenp, ppos, ro_table->data); } + +static const struct ctl_table sys_info_sysctls[] = { + { + .procname = "kernel_sys_info", + .data = &kernel_si_mask, + .maxlen = sizeof(kernel_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, +}; + +static int __init sys_info_sysctl_init(void) +{ + register_sysctl_init("kernel", sys_info_sysctls); + return 0; +} +subsys_initcall(sys_info_sysctl_init); #endif -void sys_info(unsigned long si_mask) +static void __sys_info(unsigned long si_mask) { if (si_mask & SYS_INFO_TASKS) show_state(); @@ -135,3 +159,8 @@ void sys_info(unsigned long si_mask) if (si_mask & SYS_INFO_BLOCKED_TASKS) show_state_filter(TASK_UNINTERRUPTIBLE); } + +void sys_info(unsigned long si_mask) +{ + __sys_info(si_mask ? : kernel_si_mask); +} From f1e2ca801c54dfc09d6a5540207cec25e8d43f6f Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 14 Nov 2025 14:00:45 +0800 Subject: [PATCH 069/139] lib/base64: add support for multiple variants Patch series " lib/base64: add generic encoder/decoder, migrate users", v5. This series introduces a generic Base64 encoder/decoder to the kernel library, eliminating duplicated implementations and delivering significant performance improvements. The Base64 API has been extended to support multiple variants (Standard, URL-safe, and IMAP) as defined in RFC 4648 and RFC 3501. The API now takes a variant parameter and an option to control padding. As part of this series, users are migrated to the new interface while preserving their specific formats: fscrypt now uses BASE64_URLSAFE, Ceph uses BASE64_IMAP, and NVMe is updated to BASE64_STD. On the encoder side, the implementation processes input in 3-byte blocks, mapping 24 bits directly to 4 output symbols. This avoids bit-by-bit streaming and reduces loop overhead, achieving about a 2.7x speedup compared to previous implementations. On the decoder side, replace strchr() lookups with per-variant reverse tables and process input in 4-character groups. Each group is mapped to numeric values and combined into 3 bytes. Padded and unpadded forms are validated explicitly, rejecting invalid '=' usage and enforcing tail rules. This improves throughput by ~43-52x. This patch (of 6): Extend the base64 API to support multiple variants (standard, URL-safe, and IMAP) as defined in RFC 4648 and RFC 3501. The API now takes a variant parameter and an option to control padding. Update NVMe auth code to use the new interface with BASE64_STD. Link: https://lkml.kernel.org/r/20251114055829.87814-1-409411716@gms.tku.edu.tw Link: https://lkml.kernel.org/r/20251114060045.88792-1-409411716@gms.tku.edu.tw Signed-off-by: Kuan-Wei Chiu Co-developed-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Reviewed-by: David Laight Cc: Christoph Hellwig Cc: Eric Biggers Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Keith Busch Cc: Sagi Grimberg Cc: "Theodore Y. Ts'o" Cc: Viacheslav Dubeyko Cc: Xiubo Li Cc: Yu-Sheng Huang Signed-off-by: Andrew Morton --- drivers/nvme/common/auth.c | 4 +-- include/linux/base64.h | 10 ++++-- lib/base64.c | 62 ++++++++++++++++++++++---------------- 3 files changed, 46 insertions(+), 30 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 1f51fbebd9fa..e07e7d4bf8b6 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -178,7 +178,7 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, if (!key) return ERR_PTR(-ENOMEM); - key_len = base64_decode(secret, allocated_len, key->key); + key_len = base64_decode(secret, allocated_len, key->key, true, BASE64_STD); if (key_len < 0) { pr_debug("base64 key decoding error %d\n", key_len); @@ -663,7 +663,7 @@ int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, if (ret) goto out_free_digest; - ret = base64_encode(digest, digest_len, enc); + ret = base64_encode(digest, digest_len, enc, true, BASE64_STD); if (ret < hmac_len) { ret = -ENOKEY; goto out_free_digest; diff --git a/include/linux/base64.h b/include/linux/base64.h index 660d4cb1ef31..a2c6c9222da3 100644 --- a/include/linux/base64.h +++ b/include/linux/base64.h @@ -8,9 +8,15 @@ #include +enum base64_variant { + BASE64_STD, /* RFC 4648 (standard) */ + BASE64_URLSAFE, /* RFC 4648 (base64url) */ + BASE64_IMAP, /* RFC 3501 */ +}; + #define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) -int base64_encode(const u8 *src, int len, char *dst); -int base64_decode(const char *src, int len, u8 *dst); +int base64_encode(const u8 *src, int len, char *dst, bool padding, enum base64_variant variant); +int base64_decode(const char *src, int len, u8 *dst, bool padding, enum base64_variant variant); #endif /* _LINUX_BASE64_H */ diff --git a/lib/base64.c b/lib/base64.c index b736a7a431c5..a7c20a8e8e98 100644 --- a/lib/base64.c +++ b/lib/base64.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 /* - * base64.c - RFC4648-compliant base64 encoding + * base64.c - Base64 with support for multiple variants * * Copyright (c) 2020 Hannes Reinecke, SUSE * * Based on the base64url routines from fs/crypto/fname.c - * (which are using the URL-safe base64 encoding), - * modified to use the standard coding table from RFC4648 section 4. + * (which are using the URL-safe Base64 encoding), + * modified to support multiple Base64 variants. */ #include @@ -15,26 +15,31 @@ #include #include -static const char base64_table[65] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char base64_tables[][65] = { + [BASE64_STD] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/", + [BASE64_URLSAFE] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_", + [BASE64_IMAP] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,", +}; /** - * base64_encode() - base64-encode some binary data + * base64_encode() - Base64-encode some binary data * @src: the binary data to encode * @srclen: the length of @src in bytes - * @dst: (output) the base64-encoded string. Not NUL-terminated. + * @dst: (output) the Base64-encoded string. Not NUL-terminated. + * @padding: whether to append '=' padding characters + * @variant: which base64 variant to use * - * Encodes data using base64 encoding, i.e. the "Base 64 Encoding" specified - * by RFC 4648, including the '='-padding. + * Encodes data using the selected Base64 variant. * - * Return: the length of the resulting base64-encoded string in bytes. + * Return: the length of the resulting Base64-encoded string in bytes. */ -int base64_encode(const u8 *src, int srclen, char *dst) +int base64_encode(const u8 *src, int srclen, char *dst, bool padding, enum base64_variant variant) { u32 ac = 0; int bits = 0; int i; char *cp = dst; + const char *base64_table = base64_tables[variant]; for (i = 0; i < srclen; i++) { ac = (ac << 8) | src[i]; @@ -48,44 +53,49 @@ int base64_encode(const u8 *src, int srclen, char *dst) *cp++ = base64_table[(ac << (6 - bits)) & 0x3f]; bits -= 6; } - while (bits < 0) { - *cp++ = '='; - bits += 2; + if (padding) { + while (bits < 0) { + *cp++ = '='; + bits += 2; + } } return cp - dst; } EXPORT_SYMBOL_GPL(base64_encode); /** - * base64_decode() - base64-decode a string + * base64_decode() - Base64-decode a string * @src: the string to decode. Doesn't need to be NUL-terminated. * @srclen: the length of @src in bytes * @dst: (output) the decoded binary data + * @padding: whether to append '=' padding characters + * @variant: which base64 variant to use * - * Decodes a string using base64 encoding, i.e. the "Base 64 Encoding" - * specified by RFC 4648, including the '='-padding. + * Decodes a string using the selected Base64 variant. * * This implementation hasn't been optimized for performance. * * Return: the length of the resulting decoded binary data in bytes, - * or -1 if the string isn't a valid base64 string. + * or -1 if the string isn't a valid Base64 string. */ -int base64_decode(const char *src, int srclen, u8 *dst) +int base64_decode(const char *src, int srclen, u8 *dst, bool padding, enum base64_variant variant) { u32 ac = 0; int bits = 0; int i; u8 *bp = dst; + const char *base64_table = base64_tables[variant]; for (i = 0; i < srclen; i++) { const char *p = strchr(base64_table, src[i]); - - if (src[i] == '=') { - ac = (ac << 6); - bits += 6; - if (bits >= 8) - bits -= 8; - continue; + if (padding) { + if (src[i] == '=') { + ac = (ac << 6); + bits += 6; + if (bits >= 8) + bits -= 8; + continue; + } } if (p == NULL || src[i] == 0) return -1; From c4eb7ad32eab13ba64cc452c6f43d518b63f5e03 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 14 Nov 2025 14:01:07 +0800 Subject: [PATCH 070/139] lib/base64: optimize base64_decode() with reverse lookup tables Replace the use of strchr() in base64_decode() with precomputed reverse lookup tables for each variant. This avoids repeated string scans and improves performance. Use -1 in the tables to mark invalid characters. Decode: 64B ~1530ns -> ~80ns (~19.1x) 1KB ~27726ns -> ~1239ns (~22.4x) [akpm@linux-foundation.org: fix kernedoc] Link: https://lkml.kernel.org/r/20251114060107.89026-1-409411716@gms.tku.edu.tw Signed-off-by: Kuan-Wei Chiu Co-developed-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Reviewed-by: David Laight Cc: Christoph Hellwig Cc: Eric Biggers Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Keith Busch Cc: Sagi Grimberg Cc: "Theodore Y. Ts'o" Cc: Viacheslav Dubeyko Cc: Xiubo Li Cc: Yu-Sheng Huang Signed-off-by: Andrew Morton --- lib/base64.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/lib/base64.c b/lib/base64.c index a7c20a8e8e98..2d0b775def64 100644 --- a/lib/base64.c +++ b/lib/base64.c @@ -21,6 +21,50 @@ static const char base64_tables[][65] = { [BASE64_IMAP] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,", }; +/* + * Initialize the base64 reverse mapping for a single character + * This macro maps a character to its corresponding base64 value, + * returning -1 if the character is invalid. + * char 'A'-'Z' maps to 0-25, 'a'-'z' maps to 26-51, '0'-'9' maps to 52-61, + * ch_62 maps to 62, ch_63 maps to 63, and other characters return -1 + */ +#define INIT_1(v, ch_62, ch_63) \ + [v] = (v) >= 'A' && (v) <= 'Z' ? (v) - 'A' \ + : (v) >= 'a' && (v) <= 'z' ? (v) - 'a' + 26 \ + : (v) >= '0' && (v) <= '9' ? (v) - '0' + 52 \ + : (v) == (ch_62) ? 62 : (v) == (ch_63) ? 63 : -1 + +/* + * Recursive macros to generate multiple Base64 reverse mapping table entries. + * Each macro generates a sequence of entries in the lookup table: + * INIT_2 generates 2 entries, INIT_4 generates 4, INIT_8 generates 8, and so on up to INIT_32. + */ +#define INIT_2(v, ...) INIT_1(v, __VA_ARGS__), INIT_1((v) + 1, __VA_ARGS__) +#define INIT_4(v, ...) INIT_2(v, __VA_ARGS__), INIT_2((v) + 2, __VA_ARGS__) +#define INIT_8(v, ...) INIT_4(v, __VA_ARGS__), INIT_4((v) + 4, __VA_ARGS__) +#define INIT_16(v, ...) INIT_8(v, __VA_ARGS__), INIT_8((v) + 8, __VA_ARGS__) +#define INIT_32(v, ...) INIT_16(v, __VA_ARGS__), INIT_16((v) + 16, __VA_ARGS__) + +#define BASE64_REV_INIT(ch_62, ch_63) { \ + [0 ... 0x1f] = -1, \ + INIT_32(0x20, ch_62, ch_63), \ + INIT_32(0x40, ch_62, ch_63), \ + INIT_32(0x60, ch_62, ch_63), \ + [0x80 ... 0xff] = -1 } + +static const s8 base64_rev_maps[][256] = { + [BASE64_STD] = BASE64_REV_INIT('+', '/'), + [BASE64_URLSAFE] = BASE64_REV_INIT('-', '_'), + [BASE64_IMAP] = BASE64_REV_INIT('+', ',') +}; + +#undef BASE64_REV_INIT +#undef INIT_32 +#undef INIT_16 +#undef INIT_8 +#undef INIT_4 +#undef INIT_2 +#undef INIT_1 /** * base64_encode() - Base64-encode some binary data * @src: the binary data to encode @@ -84,10 +128,9 @@ int base64_decode(const char *src, int srclen, u8 *dst, bool padding, enum base6 int bits = 0; int i; u8 *bp = dst; - const char *base64_table = base64_tables[variant]; + s8 ch; for (i = 0; i < srclen; i++) { - const char *p = strchr(base64_table, src[i]); if (padding) { if (src[i] == '=') { ac = (ac << 6); @@ -97,9 +140,10 @@ int base64_decode(const char *src, int srclen, u8 *dst, bool padding, enum base6 continue; } } - if (p == NULL || src[i] == 0) + ch = base64_rev_maps[variant][(u8)src[i]]; + if (ch == -1) return -1; - ac = (ac << 6) | (p - base64_table); + ac = (ac << 6) | ch; bits += 6; if (bits >= 8) { bits -= 8; From 9c7d3cf94d33927b6e4e576e7090a929e7162d05 Mon Sep 17 00:00:00 2001 From: Guan-Chun Wu <409411716@gms.tku.edu.tw> Date: Fri, 14 Nov 2025 14:01:32 +0800 Subject: [PATCH 071/139] lib/base64: rework encode/decode for speed and stricter validation The old base64 implementation relied on a bit-accumulator loop, which was slow for larger inputs and too permissive in validation. It would accept extra '=', missing '=', or even '=' appearing in the middle of the input, allowing malformed strings to pass. This patch reworks the internals to improve performance and enforce stricter validation. Changes: - Encoder: * Process input in 3-byte blocks, mapping 24 bits into four 6-bit symbols, avoiding bit-by-bit shifting and reducing loop iterations. * Handle the final 1-2 leftover bytes explicitly and emit '=' only when requested. - Decoder: * Based on the reverse lookup tables from the previous patch, decode input in 4-character groups. * Each group is looked up directly, converted into numeric values, and combined into 3 output bytes. * Explicitly handle padded and unpadded forms: - With padding: input length must be a multiple of 4, and '=' is allowed only in the last two positions. Reject stray or early '='. - Without padding: validate tail lengths (2 or 3 chars) and require unused low bits to be zero. * Removed the bit-accumulator style loop to reduce loop iterations. Performance (x86_64, Intel Core i7-10700 @ 2.90GHz, avg over 1000 runs, KUnit): Encode: 64B ~90ns -> ~32ns (~2.8x) 1KB ~1332ns -> ~510ns (~2.6x) Decode: 64B ~1530ns -> ~35ns (~43.7x) 1KB ~27726ns -> ~530ns (~52.3x) [akpm@linux-foundation.org: remove u32 casts, per David and Guan-Chun] Link: https://lkml.kernel.org/r/20251114060132.89279-1-409411716@gms.tku.edu.tw Co-developed-by: Kuan-Wei Chiu Signed-off-by: Kuan-Wei Chiu Co-developed-by: Yu-Sheng Huang Signed-off-by: Yu-Sheng Huang Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Reviewed-by: David Laight Cc: Christoph Hellwig Cc: Eric Biggers Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Keith Busch Cc: Sagi Grimberg Cc: "Theodore Y. Ts'o" Cc: Viacheslav Dubeyko Cc: Xiubo Li Signed-off-by: Andrew Morton --- lib/base64.c | 109 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 41 deletions(-) diff --git a/lib/base64.c b/lib/base64.c index 2d0b775def64..41961a444028 100644 --- a/lib/base64.c +++ b/lib/base64.c @@ -80,28 +80,38 @@ static const s8 base64_rev_maps[][256] = { int base64_encode(const u8 *src, int srclen, char *dst, bool padding, enum base64_variant variant) { u32 ac = 0; - int bits = 0; - int i; char *cp = dst; const char *base64_table = base64_tables[variant]; - for (i = 0; i < srclen; i++) { - ac = (ac << 8) | src[i]; - bits += 8; - do { - bits -= 6; - *cp++ = base64_table[(ac >> bits) & 0x3f]; - } while (bits >= 6); + while (srclen >= 3) { + ac = src[0] << 16 | src[1] << 8 | src[2]; + *cp++ = base64_table[ac >> 18]; + *cp++ = base64_table[(ac >> 12) & 0x3f]; + *cp++ = base64_table[(ac >> 6) & 0x3f]; + *cp++ = base64_table[ac & 0x3f]; + + src += 3; + srclen -= 3; } - if (bits) { - *cp++ = base64_table[(ac << (6 - bits)) & 0x3f]; - bits -= 6; - } - if (padding) { - while (bits < 0) { + + switch (srclen) { + case 2: + ac = src[0] << 16 | src[1] << 8; + *cp++ = base64_table[ac >> 18]; + *cp++ = base64_table[(ac >> 12) & 0x3f]; + *cp++ = base64_table[(ac >> 6) & 0x3f]; + if (padding) + *cp++ = '='; + break; + case 1: + ac = src[0] << 16; + *cp++ = base64_table[ac >> 18]; + *cp++ = base64_table[(ac >> 12) & 0x3f]; + if (padding) { + *cp++ = '='; *cp++ = '='; - bits += 2; } + break; } return cp - dst; } @@ -117,41 +127,58 @@ EXPORT_SYMBOL_GPL(base64_encode); * * Decodes a string using the selected Base64 variant. * - * This implementation hasn't been optimized for performance. - * * Return: the length of the resulting decoded binary data in bytes, * or -1 if the string isn't a valid Base64 string. */ int base64_decode(const char *src, int srclen, u8 *dst, bool padding, enum base64_variant variant) { - u32 ac = 0; - int bits = 0; - int i; u8 *bp = dst; - s8 ch; + s8 input[4]; + s32 val; + const u8 *s = (const u8 *)src; + const s8 *base64_rev_tables = base64_rev_maps[variant]; - for (i = 0; i < srclen; i++) { - if (padding) { - if (src[i] == '=') { - ac = (ac << 6); - bits += 6; - if (bits >= 8) - bits -= 8; - continue; - } - } - ch = base64_rev_maps[variant][(u8)src[i]]; - if (ch == -1) - return -1; - ac = (ac << 6) | ch; - bits += 6; - if (bits >= 8) { - bits -= 8; - *bp++ = (u8)(ac >> bits); + while (srclen >= 4) { + input[0] = base64_rev_tables[s[0]]; + input[1] = base64_rev_tables[s[1]]; + input[2] = base64_rev_tables[s[2]]; + input[3] = base64_rev_tables[s[3]]; + + val = input[0] << 18 | input[1] << 12 | input[2] << 6 | input[3]; + + if (unlikely(val < 0)) { + if (!padding || srclen != 4 || s[3] != '=') + return -1; + padding = 0; + srclen = s[2] == '=' ? 2 : 3; + break; } + + *bp++ = val >> 16; + *bp++ = val >> 8; + *bp++ = val; + + s += 4; + srclen -= 4; } - if (ac & ((1 << bits) - 1)) + + if (likely(!srclen)) + return bp - dst; + if (padding || srclen == 1) return -1; + + val = (base64_rev_tables[s[0]] << 12) | (base64_rev_tables[s[1]] << 6); + *bp++ = val >> 10; + + if (srclen == 2) { + if (val & 0x800003ff) + return -1; + } else { + val |= base64_rev_tables[s[2]]; + if (val & 0x80000003) + return -1; + *bp++ = val >> 2; + } return bp - dst; } EXPORT_SYMBOL_GPL(base64_decode); From 8b365c4f5be9e979bb991a52a0cb4b1e4680c8bd Mon Sep 17 00:00:00 2001 From: Guan-Chun Wu <409411716@gms.tku.edu.tw> Date: Fri, 14 Nov 2025 14:01:57 +0800 Subject: [PATCH 072/139] lib: add KUnit tests for base64 encoding/decoding Add a KUnit test suite to validate the base64 helpers. The tests cover both encoding and decoding, including padded and unpadded forms as defined by RFC 4648 (standard base64), and add negative cases for malformed inputs and padding errors. The test suite also validates other variants (URLSAFE, IMAP) to ensure their correctness. In addition to functional checks, the suite includes simple microbenchmarks which report average encode/decode latency for small (64B) and larger (1KB) inputs. These numbers are informational only and do not gate the tests. Kconfig (BASE64_KUNIT) and lib/tests/Makefile are updated accordingly. Sample KUnit output: KTAP version 1 # Subtest: base64 # module: base64_kunit 1..4 # base64_performance_tests: [64B] encode run : 32ns # base64_performance_tests: [64B] decode run : 35ns # base64_performance_tests: [1KB] encode run : 510ns # base64_performance_tests: [1KB] decode run : 530ns ok 1 base64_performance_tests ok 2 base64_std_encode_tests ok 3 base64_std_decode_tests ok 4 base64_variant_tests # base64: pass:4 fail:0 skip:0 total:4 # Totals: pass:4 fail:0 skip:0 total:4 Link: https://lkml.kernel.org/r/20251114060157.89507-1-409411716@gms.tku.edu.tw Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Reviewed-by: Kuan-Wei Chiu Cc: Christoph Hellwig Cc: David Laight Cc: Eric Biggers Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Keith Busch Cc: Sagi Grimberg Cc: "Theodore Y. Ts'o" Cc: Viacheslav Dubeyko Cc: Xiubo Li Cc: Yu-Sheng Huang Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 19 ++- lib/tests/Makefile | 1 + lib/tests/base64_kunit.c | 294 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 313 insertions(+), 1 deletion(-) create mode 100644 lib/tests/base64_kunit.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9a087826498a..bd3bb7a0c801 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2815,8 +2815,25 @@ config CMDLINE_KUNIT_TEST If unsure, say N. +config BASE64_KUNIT + tristate "KUnit test for base64 decoding and encoding" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This builds the base64 unit tests. + + The tests cover the encoding and decoding logic of Base64 functions + in the kernel. + In addition to correctness checks, simple performance benchmarks + for both encoding and decoding are also included. + + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. + config BITS_TEST - tristate "KUnit test for bits.h" if !KUNIT_ALL_TESTS + tristate "KUnit test for bit functions and macros" if !KUNIT_ALL_TESTS depends on KUNIT default KUNIT_ALL_TESTS help diff --git a/lib/tests/Makefile b/lib/tests/Makefile index f7460831cfdd..601dba4b7d96 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -4,6 +4,7 @@ # KUnit tests CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN) +obj-$(CONFIG_BASE64_KUNIT) += base64_kunit.o obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o obj-$(CONFIG_BITS_TEST) += test_bits.o obj-$(CONFIG_BLACKHOLE_DEV_KUNIT_TEST) += blackhole_dev_kunit.o diff --git a/lib/tests/base64_kunit.c b/lib/tests/base64_kunit.c new file mode 100644 index 000000000000..f7252070c359 --- /dev/null +++ b/lib/tests/base64_kunit.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * base64_kunit_test.c - KUnit tests for base64 encoding and decoding functions + * + * Copyright (c) 2025, Guan-Chun Wu <409411716@gms.tku.edu.tw> + */ + +#include +#include + +/* ---------- Benchmark helpers ---------- */ +static u64 bench_encode_ns(const u8 *data, int len, char *dst, int reps, + enum base64_variant variant) +{ + u64 t0, t1; + + t0 = ktime_get_ns(); + for (int i = 0; i < reps; i++) + base64_encode(data, len, dst, true, variant); + t1 = ktime_get_ns(); + + return div64_u64(t1 - t0, (u64)reps); +} + +static u64 bench_decode_ns(const char *data, int len, u8 *dst, int reps, + enum base64_variant variant) +{ + u64 t0, t1; + + t0 = ktime_get_ns(); + for (int i = 0; i < reps; i++) + base64_decode(data, len, dst, true, variant); + t1 = ktime_get_ns(); + + return div64_u64(t1 - t0, (u64)reps); +} + +static void run_perf_and_check(struct kunit *test, const char *label, int size, + enum base64_variant variant) +{ + const int reps = 1000; + size_t outlen = DIV_ROUND_UP(size, 3) * 4; + u8 *in = kmalloc(size, GFP_KERNEL); + char *enc = kmalloc(outlen, GFP_KERNEL); + u8 *decoded = kmalloc(size, GFP_KERNEL); + + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, in); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, enc); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, decoded); + + get_random_bytes(in, size); + int enc_len = base64_encode(in, size, enc, true, variant); + int dec_len = base64_decode(enc, enc_len, decoded, true, variant); + + /* correctness sanity check */ + KUNIT_EXPECT_EQ(test, dec_len, size); + KUNIT_EXPECT_MEMEQ(test, decoded, in, size); + + /* benchmark encode */ + + u64 t1 = bench_encode_ns(in, size, enc, reps, variant); + + kunit_info(test, "[%s] encode run : %lluns", label, t1); + + u64 t2 = bench_decode_ns(enc, enc_len, decoded, reps, variant); + + kunit_info(test, "[%s] decode run : %lluns", label, t2); + + kfree(in); + kfree(enc); + kfree(decoded); +} + +static void base64_performance_tests(struct kunit *test) +{ + /* run on STD variant only */ + run_perf_and_check(test, "64B", 64, BASE64_STD); + run_perf_and_check(test, "1KB", 1024, BASE64_STD); +} + +/* ---------- Helpers for encode ---------- */ +static void expect_encode_ok(struct kunit *test, const u8 *src, int srclen, + const char *expected, bool padding, + enum base64_variant variant) +{ + char buf[128]; + int encoded_len = base64_encode(src, srclen, buf, padding, variant); + + buf[encoded_len] = '\0'; + + KUNIT_EXPECT_EQ(test, encoded_len, strlen(expected)); + KUNIT_EXPECT_STREQ(test, buf, expected); +} + +/* ---------- Helpers for decode ---------- */ +static void expect_decode_ok(struct kunit *test, const char *src, + const u8 *expected, int expected_len, bool padding, + enum base64_variant variant) +{ + u8 buf[128]; + int decoded_len = base64_decode(src, strlen(src), buf, padding, variant); + + KUNIT_EXPECT_EQ(test, decoded_len, expected_len); + KUNIT_EXPECT_MEMEQ(test, buf, expected, expected_len); +} + +static void expect_decode_err(struct kunit *test, const char *src, + int srclen, bool padding, + enum base64_variant variant) +{ + u8 buf[64]; + int decoded_len = base64_decode(src, srclen, buf, padding, variant); + + KUNIT_EXPECT_EQ(test, decoded_len, -1); +} + +/* ---------- Encode Tests ---------- */ +static void base64_std_encode_tests(struct kunit *test) +{ + /* With padding */ + expect_encode_ok(test, (const u8 *)"", 0, "", true, BASE64_STD); + expect_encode_ok(test, (const u8 *)"f", 1, "Zg==", true, BASE64_STD); + expect_encode_ok(test, (const u8 *)"fo", 2, "Zm8=", true, BASE64_STD); + expect_encode_ok(test, (const u8 *)"foo", 3, "Zm9v", true, BASE64_STD); + expect_encode_ok(test, (const u8 *)"foob", 4, "Zm9vYg==", true, BASE64_STD); + expect_encode_ok(test, (const u8 *)"fooba", 5, "Zm9vYmE=", true, BASE64_STD); + expect_encode_ok(test, (const u8 *)"foobar", 6, "Zm9vYmFy", true, BASE64_STD); + + /* Extra cases with padding */ + expect_encode_ok(test, (const u8 *)"Hello, world!", 13, "SGVsbG8sIHdvcmxkIQ==", + true, BASE64_STD); + expect_encode_ok(test, (const u8 *)"ABCDEFGHIJKLMNOPQRSTUVWXYZ", 26, + "QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVo=", true, BASE64_STD); + expect_encode_ok(test, (const u8 *)"abcdefghijklmnopqrstuvwxyz", 26, + "YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXo=", true, BASE64_STD); + expect_encode_ok(test, (const u8 *)"0123456789+/", 12, "MDEyMzQ1Njc4OSsv", + true, BASE64_STD); + + /* Without padding */ + expect_encode_ok(test, (const u8 *)"", 0, "", false, BASE64_STD); + expect_encode_ok(test, (const u8 *)"f", 1, "Zg", false, BASE64_STD); + expect_encode_ok(test, (const u8 *)"fo", 2, "Zm8", false, BASE64_STD); + expect_encode_ok(test, (const u8 *)"foo", 3, "Zm9v", false, BASE64_STD); + expect_encode_ok(test, (const u8 *)"foob", 4, "Zm9vYg", false, BASE64_STD); + expect_encode_ok(test, (const u8 *)"fooba", 5, "Zm9vYmE", false, BASE64_STD); + expect_encode_ok(test, (const u8 *)"foobar", 6, "Zm9vYmFy", false, BASE64_STD); + + /* Extra cases without padding */ + expect_encode_ok(test, (const u8 *)"Hello, world!", 13, "SGVsbG8sIHdvcmxkIQ", + false, BASE64_STD); + expect_encode_ok(test, (const u8 *)"ABCDEFGHIJKLMNOPQRSTUVWXYZ", 26, + "QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVo", false, BASE64_STD); + expect_encode_ok(test, (const u8 *)"abcdefghijklmnopqrstuvwxyz", 26, + "YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXo", false, BASE64_STD); + expect_encode_ok(test, (const u8 *)"0123456789+/", 12, "MDEyMzQ1Njc4OSsv", + false, BASE64_STD); +} + +/* ---------- Decode Tests ---------- */ +static void base64_std_decode_tests(struct kunit *test) +{ + /* -------- With padding --------*/ + expect_decode_ok(test, "", (const u8 *)"", 0, true, BASE64_STD); + expect_decode_ok(test, "Zg==", (const u8 *)"f", 1, true, BASE64_STD); + expect_decode_ok(test, "Zm8=", (const u8 *)"fo", 2, true, BASE64_STD); + expect_decode_ok(test, "Zm9v", (const u8 *)"foo", 3, true, BASE64_STD); + expect_decode_ok(test, "Zm9vYg==", (const u8 *)"foob", 4, true, BASE64_STD); + expect_decode_ok(test, "Zm9vYmE=", (const u8 *)"fooba", 5, true, BASE64_STD); + expect_decode_ok(test, "Zm9vYmFy", (const u8 *)"foobar", 6, true, BASE64_STD); + expect_decode_ok(test, "SGVsbG8sIHdvcmxkIQ==", (const u8 *)"Hello, world!", 13, + true, BASE64_STD); + expect_decode_ok(test, "QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVo=", + (const u8 *)"ABCDEFGHIJKLMNOPQRSTUVWXYZ", 26, true, BASE64_STD); + expect_decode_ok(test, "YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXo=", + (const u8 *)"abcdefghijklmnopqrstuvwxyz", 26, true, BASE64_STD); + + /* Error cases */ + expect_decode_err(test, "Zg=!", 4, true, BASE64_STD); + expect_decode_err(test, "Zm$=", 4, true, BASE64_STD); + expect_decode_err(test, "Z===", 4, true, BASE64_STD); + expect_decode_err(test, "Zg", 2, true, BASE64_STD); + expect_decode_err(test, "Zm9v====", 8, true, BASE64_STD); + expect_decode_err(test, "Zm==A", 5, true, BASE64_STD); + + { + char with_nul[4] = { 'Z', 'g', '\0', '=' }; + + expect_decode_err(test, with_nul, 4, true, BASE64_STD); + } + + /* -------- Without padding --------*/ + expect_decode_ok(test, "", (const u8 *)"", 0, false, BASE64_STD); + expect_decode_ok(test, "Zg", (const u8 *)"f", 1, false, BASE64_STD); + expect_decode_ok(test, "Zm8", (const u8 *)"fo", 2, false, BASE64_STD); + expect_decode_ok(test, "Zm9v", (const u8 *)"foo", 3, false, BASE64_STD); + expect_decode_ok(test, "Zm9vYg", (const u8 *)"foob", 4, false, BASE64_STD); + expect_decode_ok(test, "Zm9vYmE", (const u8 *)"fooba", 5, false, BASE64_STD); + expect_decode_ok(test, "Zm9vYmFy", (const u8 *)"foobar", 6, false, BASE64_STD); + expect_decode_ok(test, "TWFu", (const u8 *)"Man", 3, false, BASE64_STD); + expect_decode_ok(test, "SGVsbG8sIHdvcmxkIQ", (const u8 *)"Hello, world!", 13, + false, BASE64_STD); + expect_decode_ok(test, "QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVo", + (const u8 *)"ABCDEFGHIJKLMNOPQRSTUVWXYZ", 26, false, BASE64_STD); + expect_decode_ok(test, "YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXo", + (const u8 *)"abcdefghijklmnopqrstuvwxyz", 26, false, BASE64_STD); + expect_decode_ok(test, "MDEyMzQ1Njc4OSsv", (const u8 *)"0123456789+/", 12, + false, BASE64_STD); + + /* Error cases */ + expect_decode_err(test, "Zg=!", 4, false, BASE64_STD); + expect_decode_err(test, "Zm$=", 4, false, BASE64_STD); + expect_decode_err(test, "Z===", 4, false, BASE64_STD); + expect_decode_err(test, "Zg=", 3, false, BASE64_STD); + expect_decode_err(test, "Zm9v====", 8, false, BASE64_STD); + expect_decode_err(test, "Zm==v", 4, false, BASE64_STD); + + { + char with_nul[4] = { 'Z', 'g', '\0', '=' }; + + expect_decode_err(test, with_nul, 4, false, BASE64_STD); + } +} + +/* ---------- Variant tests (URLSAFE / IMAP) ---------- */ +static void base64_variant_tests(struct kunit *test) +{ + const u8 sample1[] = { 0x00, 0xfb, 0xff, 0x7f, 0x80 }; + char std_buf[128], url_buf[128], imap_buf[128]; + u8 back[128]; + int n_std, n_url, n_imap, m; + int i; + + n_std = base64_encode(sample1, sizeof(sample1), std_buf, false, BASE64_STD); + n_url = base64_encode(sample1, sizeof(sample1), url_buf, false, BASE64_URLSAFE); + std_buf[n_std] = '\0'; + url_buf[n_url] = '\0'; + + for (i = 0; i < n_std; i++) { + if (std_buf[i] == '+') + std_buf[i] = '-'; + else if (std_buf[i] == '/') + std_buf[i] = '_'; + } + KUNIT_EXPECT_STREQ(test, std_buf, url_buf); + + m = base64_decode(url_buf, n_url, back, false, BASE64_URLSAFE); + KUNIT_EXPECT_EQ(test, m, (int)sizeof(sample1)); + KUNIT_EXPECT_MEMEQ(test, back, sample1, sizeof(sample1)); + + n_std = base64_encode(sample1, sizeof(sample1), std_buf, false, BASE64_STD); + n_imap = base64_encode(sample1, sizeof(sample1), imap_buf, false, BASE64_IMAP); + std_buf[n_std] = '\0'; + imap_buf[n_imap] = '\0'; + + for (i = 0; i < n_std; i++) + if (std_buf[i] == '/') + std_buf[i] = ','; + KUNIT_EXPECT_STREQ(test, std_buf, imap_buf); + + m = base64_decode(imap_buf, n_imap, back, false, BASE64_IMAP); + KUNIT_EXPECT_EQ(test, m, (int)sizeof(sample1)); + KUNIT_EXPECT_MEMEQ(test, back, sample1, sizeof(sample1)); + + { + const char *bad = "Zg=="; + u8 tmp[8]; + + m = base64_decode(bad, strlen(bad), tmp, false, BASE64_URLSAFE); + KUNIT_EXPECT_EQ(test, m, -1); + + m = base64_decode(bad, strlen(bad), tmp, false, BASE64_IMAP); + KUNIT_EXPECT_EQ(test, m, -1); + } +} + +/* ---------- Test registration ---------- */ +static struct kunit_case base64_test_cases[] = { + KUNIT_CASE(base64_performance_tests), + KUNIT_CASE(base64_std_encode_tests), + KUNIT_CASE(base64_std_decode_tests), + KUNIT_CASE(base64_variant_tests), + {} +}; + +static struct kunit_suite base64_test_suite = { + .name = "base64", + .test_cases = base64_test_cases, +}; + +kunit_test_suite(base64_test_suite); + +MODULE_AUTHOR("Guan-Chun Wu <409411716@gms.tku.edu.tw>"); +MODULE_DESCRIPTION("KUnit tests for Base64 encoding/decoding, including performance checks"); +MODULE_LICENSE("GPL"); From 7794510e2021a29095721e59f3d249d8b4242fb4 Mon Sep 17 00:00:00 2001 From: Guan-Chun Wu <409411716@gms.tku.edu.tw> Date: Fri, 14 Nov 2025 14:02:21 +0800 Subject: [PATCH 073/139] fscrypt: replace local base64url helpers with lib/base64 Replace the base64url encoding and decoding functions in fscrypt with the generic base64_encode() and base64_decode() helpers from lib/base64. This removes the custom implementation in fscrypt, reduces code duplication, and relies on the shared Base64 implementation in lib. The helpers preserve RFC 4648-compliant URL-safe Base64 encoding without padding, so there are no functional changes. This change also improves performance: encoding is about 2.7x faster and decoding achieves 43-52x speedups compared to the previous implementation. Link: https://lkml.kernel.org/r/20251114060221.89734-1-409411716@gms.tku.edu.tw Reviewed-by: Kuan-Wei Chiu Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Cc: Christoph Hellwig Cc: David Laight Cc: Eric Biggers Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Keith Busch Cc: Sagi Grimberg Cc: "Theodore Y. Ts'o" Cc: Viacheslav Dubeyko Cc: Xiubo Li Cc: Yu-Sheng Huang Signed-off-by: Andrew Morton --- fs/crypto/fname.c | 89 ++++------------------------------------------- 1 file changed, 6 insertions(+), 83 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 8e4c213d418b..a9a4432d12ba 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "fscrypt_private.h" @@ -71,7 +72,7 @@ struct fscrypt_nokey_name { /* Encoded size of max-size no-key name */ #define FSCRYPT_NOKEY_NAME_MAX_ENCODED \ - FSCRYPT_BASE64URL_CHARS(FSCRYPT_NOKEY_NAME_MAX) + BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { @@ -162,84 +163,6 @@ static int fname_decrypt(const struct inode *inode, return 0; } -static const char base64url_table[65] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; - -#define FSCRYPT_BASE64URL_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) - -/** - * fscrypt_base64url_encode() - base64url-encode some binary data - * @src: the binary data to encode - * @srclen: the length of @src in bytes - * @dst: (output) the base64url-encoded string. Not NUL-terminated. - * - * Encodes data using base64url encoding, i.e. the "Base 64 Encoding with URL - * and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't used, - * as it's unneeded and not required by the RFC. base64url is used instead of - * base64 to avoid the '/' character, which isn't allowed in filenames. - * - * Return: the length of the resulting base64url-encoded string in bytes. - * This will be equal to FSCRYPT_BASE64URL_CHARS(srclen). - */ -static int fscrypt_base64url_encode(const u8 *src, int srclen, char *dst) -{ - u32 ac = 0; - int bits = 0; - int i; - char *cp = dst; - - for (i = 0; i < srclen; i++) { - ac = (ac << 8) | src[i]; - bits += 8; - do { - bits -= 6; - *cp++ = base64url_table[(ac >> bits) & 0x3f]; - } while (bits >= 6); - } - if (bits) - *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f]; - return cp - dst; -} - -/** - * fscrypt_base64url_decode() - base64url-decode a string - * @src: the string to decode. Doesn't need to be NUL-terminated. - * @srclen: the length of @src in bytes - * @dst: (output) the decoded binary data - * - * Decodes a string using base64url encoding, i.e. the "Base 64 Encoding with - * URL and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't - * accepted, nor are non-encoding characters such as whitespace. - * - * This implementation hasn't been optimized for performance. - * - * Return: the length of the resulting decoded binary data in bytes, - * or -1 if the string isn't a valid base64url string. - */ -static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst) -{ - u32 ac = 0; - int bits = 0; - int i; - u8 *bp = dst; - - for (i = 0; i < srclen; i++) { - const char *p = strchr(base64url_table, src[i]); - - if (p == NULL || src[i] == 0) - return -1; - ac = (ac << 6) | (p - base64url_table); - bits += 6; - if (bits >= 8) { - bits -= 8; - *bp++ = (u8)(ac >> bits); - } - } - if (ac & ((1 << bits) - 1)) - return -1; - return bp - dst; -} - bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) @@ -387,8 +310,8 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, nokey_name.sha256); size = FSCRYPT_NOKEY_NAME_MAX; } - oname->len = fscrypt_base64url_encode((const u8 *)&nokey_name, size, - oname->name); + oname->len = base64_encode((const u8 *)&nokey_name, size, + oname->name, false, BASE64_URLSAFE); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -467,8 +390,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = fscrypt_base64url_decode(iname->name, iname->len, - fname->crypto_buf.name); + ret = base64_decode(iname->name, iname->len, + fname->crypto_buf.name, false, BASE64_URLSAFE); if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) || (ret > offsetof(struct fscrypt_nokey_name, sha256) && ret != FSCRYPT_NOKEY_NAME_MAX)) { From b1b72ac25f89125d91ef3abd257c3b88ec169962 Mon Sep 17 00:00:00 2001 From: Guan-Chun Wu <409411716@gms.tku.edu.tw> Date: Fri, 14 Nov 2025 14:02:40 +0800 Subject: [PATCH 074/139] ceph: replace local base64 helpers with lib/base64 Remove the ceph_base64_encode() and ceph_base64_decode() functions and replace their usage with the generic base64_encode() and base64_decode() helpers from lib/base64. This eliminates the custom implementation in Ceph, reduces code duplication, and relies on the shared Base64 code in lib. The helpers preserve RFC 3501-compliant Base64 encoding without padding, so there are no functional changes. This change also improves performance: encoding is about 2.7x faster and decoding achieves 43-52x speedups compared to the previous local implementation. Link: https://lkml.kernel.org/r/20251114060240.89965-1-409411716@gms.tku.edu.tw Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Reviewed-by: Kuan-Wei Chiu Reviewed-by: Viacheslav Dubeyko Cc: Keith Busch Cc: Jens Axboe Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: Xiubo Li Cc: Ilya Dryomov Cc: Eric Biggers Cc: "Theodore Y. Ts'o" Cc: Jaegeuk Kim Cc: David Laight Cc: Yu-Sheng Huang Signed-off-by: Andrew Morton --- fs/ceph/crypto.c | 60 ++++-------------------------------------------- fs/ceph/crypto.h | 6 +---- fs/ceph/dir.c | 5 ++-- fs/ceph/inode.c | 2 +- 4 files changed, 9 insertions(+), 64 deletions(-) diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index 7026e794813c..b6016dcffbb6 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -15,59 +15,6 @@ #include "mds_client.h" #include "crypto.h" -/* - * The base64url encoding used by fscrypt includes the '_' character, which may - * cause problems in snapshot names (which can not start with '_'). Thus, we - * used the base64 encoding defined for IMAP mailbox names (RFC 3501) instead, - * which replaces '-' and '_' by '+' and ','. - */ -static const char base64_table[65] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; - -int ceph_base64_encode(const u8 *src, int srclen, char *dst) -{ - u32 ac = 0; - int bits = 0; - int i; - char *cp = dst; - - for (i = 0; i < srclen; i++) { - ac = (ac << 8) | src[i]; - bits += 8; - do { - bits -= 6; - *cp++ = base64_table[(ac >> bits) & 0x3f]; - } while (bits >= 6); - } - if (bits) - *cp++ = base64_table[(ac << (6 - bits)) & 0x3f]; - return cp - dst; -} - -int ceph_base64_decode(const char *src, int srclen, u8 *dst) -{ - u32 ac = 0; - int bits = 0; - int i; - u8 *bp = dst; - - for (i = 0; i < srclen; i++) { - const char *p = strchr(base64_table, src[i]); - - if (p == NULL || src[i] == 0) - return -1; - ac = (ac << 6) | (p - base64_table); - bits += 6; - if (bits >= 8) { - bits -= 8; - *bp++ = (u8)(ac >> bits); - } - } - if (ac & ((1 << bits) - 1)) - return -1; - return bp - dst; -} - static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -318,7 +265,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen) } /* base64 encode the encrypted name */ - elen = ceph_base64_encode(cryptbuf, len, p); + elen = base64_encode(cryptbuf, len, p, false, BASE64_IMAP); doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, p); /* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */ @@ -412,7 +359,8 @@ int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, tname = &_tname; } - declen = ceph_base64_decode(name, name_len, tname->name); + declen = base64_decode(name, name_len, + tname->name, false, BASE64_IMAP); if (declen <= 0) { ret = -EIO; goto out; @@ -426,7 +374,7 @@ int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, ret = fscrypt_fname_disk_to_usr(dir, 0, 0, &iname, oname); if (!ret && (dir != fname->dir)) { - char tmp_buf[CEPH_BASE64_CHARS(NAME_MAX)]; + char tmp_buf[BASE64_CHARS(NAME_MAX)]; name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", oname->len, oname->name, dir->i_ino); diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h index 23612b2e9837..b748e2060bc9 100644 --- a/fs/ceph/crypto.h +++ b/fs/ceph/crypto.h @@ -8,6 +8,7 @@ #include #include +#include #define CEPH_FSCRYPT_BLOCK_SHIFT 12 #define CEPH_FSCRYPT_BLOCK_SIZE (_AC(1, UL) << CEPH_FSCRYPT_BLOCK_SHIFT) @@ -89,11 +90,6 @@ static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa) */ #define CEPH_NOHASH_NAME_MAX (180 - SHA256_DIGEST_SIZE) -#define CEPH_BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) - -int ceph_base64_encode(const u8 *src, int srclen, char *dst); -int ceph_base64_decode(const char *src, int srclen, u8 *dst); - void ceph_fscrypt_set_ops(struct super_block *sb); void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d18c0eaef9b7..0fa7c7777242 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -998,13 +998,14 @@ static int prep_encrypted_symlink_target(struct ceph_mds_request *req, if (err) goto out; - req->r_path2 = kmalloc(CEPH_BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL); + req->r_path2 = kmalloc(BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL); if (!req->r_path2) { err = -ENOMEM; goto out; } - len = ceph_base64_encode(osd_link.name, osd_link.len, req->r_path2); + len = base64_encode(osd_link.name, osd_link.len, + req->r_path2, false, BASE64_IMAP); req->r_path2[len] = '\0'; out: fscrypt_fname_free_buffer(&osd_link); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a6e260d9e420..b691343cb7f1 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -958,7 +958,7 @@ static int decode_encrypted_symlink(struct ceph_mds_client *mdsc, if (!sym) return -ENOMEM; - declen = ceph_base64_decode(encsym, enclen, sym); + declen = base64_decode(encsym, enclen, sym, false, BASE64_IMAP); if (declen < 0) { pr_err_client(cl, "can't decode symlink (%d). Content: %.*s\n", From fdd76c8d6327e616ee61ddd00db16753d722168c Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Mon, 17 Nov 2025 09:21:53 +0530 Subject: [PATCH 075/139] Documentation/ABI: add kexec and kdump sysfs interface Add an ABI document for following kexec and kdump sysfs interface: - /sys/kernel/kexec_loaded - /sys/kernel/kexec_crash_loaded - /sys/kernel/kexec_crash_size - /sys/kernel/crash_elfcorehdr_size Link: https://lkml.kernel.org/r/20251117035153.1199665-1-sourabhjain@linux.ibm.com Signed-off-by: Sourabh Jain Cc: Aditya Gupta Cc: Baoquan he Cc: Dave Young Cc: Hari Bathini Cc: Jiri Bohac Cc: Madhavan Srinivasan Cc: Mahesh J Salgaonkar Cc: Pingfan Liu Cc: Ritesh Harjani (IBM) Cc: Shivang Upadhyay Cc: Vivek Goyal Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-kexec-kdump | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-kexec-kdump diff --git a/Documentation/ABI/testing/sysfs-kernel-kexec-kdump b/Documentation/ABI/testing/sysfs-kernel-kexec-kdump new file mode 100644 index 000000000000..96b24565b68e --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-kexec-kdump @@ -0,0 +1,43 @@ +What: /sys/kernel/kexec_loaded +Date: Jun 2006 +Contact: kexec@lists.infradead.org +Description: read only + Indicates whether a new kernel image has been loaded + into memory using the kexec system call. It shows 1 if + a kexec image is present and ready to boot, or 0 if none + is loaded. +User: kexec tools, kdump service + +What: /sys/kernel/kexec_crash_loaded +Date: Jun 2006 +Contact: kexec@lists.infradead.org +Description: read only + Indicates whether a crash (kdump) kernel is currently + loaded into memory. It shows 1 if a crash kernel has been + successfully loaded for panic handling, or 0 if no crash + kernel is present. +User: Kexec tools, Kdump service + +What: /sys/kernel/kexec_crash_size +Date: Dec 2009 +Contact: kexec@lists.infradead.org +Description: read/write + Shows the amount of memory reserved for loading the crash + (kdump) kernel. It reports the size, in bytes, of the + crash kernel area defined by the crashkernel= parameter. + This interface also allows reducing the crashkernel + reservation by writing a smaller value, and the reclaimed + space is added back to the system RAM. +User: Kdump service + +What: /sys/kernel/crash_elfcorehdr_size +Date: Aug 2023 +Contact: kexec@lists.infradead.org +Description: read only + Indicates the preferred size of the memory buffer for the + ELF core header used by the crash (kdump) kernel. It defines + how much space is needed to hold metadata about the crashed + system, including CPU and memory information. This information + is used by the user space utility kexec to support updating the + in-kernel kdump image during hotplug operations. +User: Kexec tools From aa0145563ce26a5f5a1154e9f26a2f8c21eee2ca Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 18 Nov 2025 12:40:23 +0530 Subject: [PATCH 076/139] crash: export crashkernel CMA reservation to userspace Add a sysfs entry /sys/kernel/kexec_crash_cma_ranges to expose all CMA crashkernel ranges. This allows userspace tools configuring kdump to determine how much memory is reserved for crashkernel. If CMA is used, tools can warn users when attempting to capture user pages with CMA reservation. The new sysfs hold the CMA ranges in below format: cat /sys/kernel/kexec_crash_cma_ranges 100000000-10c7fffff The reason for not including Crash CMA Ranges in /proc/iomem is to avoid conflicts. It has been observed that contiguous memory ranges are sometimes shown as two separate System RAM entries in /proc/iomem. If a CMA range overlaps two System RAM ranges, adding crashk_res to /proc/iomem can create a conflict. Reference [1] describes one such instance on the PowerPC architecture. Link: https://lkml.kernel.org/r/20251118071023.1673329-1-sourabhjain@linux.ibm.com Link: https://lore.kernel.org/all/20251016142831.144515-1-sourabhjain@linux.ibm.com/ [1] Signed-off-by: Sourabh Jain Acked-by: Baoquan He Cc: Aditya Gupta Cc: Dave Young Cc: Hari Bathini Cc: Jiri Bohac Cc: Madhavan Srinivasan Cc: Mahesh J Salgaonkar Cc: Pingfan Liu Cc: Ritesh Harjani (IBM) Cc: Shivang Upadhyay Cc: Vivek Goyal Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-kexec-kdump | 10 +++++++++ kernel/ksysfs.c | 21 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-kexec-kdump b/Documentation/ABI/testing/sysfs-kernel-kexec-kdump index 96b24565b68e..f6089e38de5f 100644 --- a/Documentation/ABI/testing/sysfs-kernel-kexec-kdump +++ b/Documentation/ABI/testing/sysfs-kernel-kexec-kdump @@ -41,3 +41,13 @@ Description: read only is used by the user space utility kexec to support updating the in-kernel kdump image during hotplug operations. User: Kexec tools + +What: /sys/kernel/kexec_crash_cma_ranges +Date: Nov 2025 +Contact: kexec@lists.infradead.org +Description: read only + Provides information about the memory ranges reserved from + the Contiguous Memory Allocator (CMA) area that are allocated + to the crash (kdump) kernel. It lists the start and end physical + addresses of CMA regions assigned for crashkernel use. +User: kdump service diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index eefb67d9883c..0ff2179bc603 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -135,6 +135,24 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj, } KERNEL_ATTR_RO(kexec_crash_loaded); +#ifdef CONFIG_CRASH_RESERVE +static ssize_t kexec_crash_cma_ranges_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + + ssize_t len = 0; + int i; + + for (i = 0; i < crashk_cma_cnt; ++i) { + len += sysfs_emit_at(buf, len, "%08llx-%08llx\n", + crashk_cma_ranges[i].start, + crashk_cma_ranges[i].end); + } + return len; +} +KERNEL_ATTR_RO(kexec_crash_cma_ranges); +#endif /* CONFIG_CRASH_RESERVE */ + static ssize_t kexec_crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -260,6 +278,9 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_CRASH_DUMP &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, +#ifdef CONFIG_CRASH_RESERVE + &kexec_crash_cma_ranges_attr.attr, +#endif #endif #endif #ifdef CONFIG_VMCORE_INFO From 9031b852c97f1db52180878aed66ca08946eca93 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 18 Nov 2025 17:32:50 +0000 Subject: [PATCH 077/139] uaccess: gate _copy_[to|from]_user on !INLINE_COPY_FROM_USER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These methods only exist when INLINE_COPY_FROM_USER is disabled, so update the header file to reflect that. This fixes the following error on builds that enable both RUST and INLINE_COPY_FROM_USER. ERROR: modpost: "_copy_from_user" [samples/rust/rust_misc_device.ko] undefined! ERROR: modpost: "_copy_to_user" [samples/rust/rust_misc_device.ko] undefined! This error is triggered because when a method is available both as a rust_helper_* and normal method, Rust will call the normal method. [akpm@linux-foundation.org: s/INLINE_COPY_FROM_USER/INLINE_COPY_TO_USER/, per Alice] Link: https://lkml.kernel.org/r/20251118173250.2821388-1-aliceryhl@google.com Fixes: d99dc586ca7c ("uaccess: decouple INLINE_COPY_FROM_USER and CONFIG_RUST") Signed-off-by: Alice Ryhl Cc: Alex Gaynor Cc: Andreas Hindborg Cc: Arnd Bergmann Cc: Björn Roy Baron Cc: Boqun Feng Cc: Danilo Krummrich Cc: Gary Guo Cc: John Hubbard Cc: Miguel Ojeda Cc: Trevor Gross Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- include/linux/uaccess.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 01cbd7dd0ba3..5594012160da 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -181,8 +181,10 @@ fail: memset(to + (n - res), 0, res); return res; } +#ifndef INLINE_COPY_FROM_USER extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); +#endif static inline __must_check unsigned long _inline_copy_to_user(void __user *to, const void *from, unsigned long n) @@ -196,8 +198,10 @@ _inline_copy_to_user(void __user *to, const void *from, unsigned long n) } return n; } +#ifndef INLINE_COPY_TO_USER extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); +#endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) From 2fe869ecbd06eaa37a77cd58c583f4231ce7de04 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Wed, 19 Nov 2025 19:08:22 +0800 Subject: [PATCH 078/139] MAINTAINERS: add Petr as a reviewer of hung task detector Petr has been actively reviewing hung task detector patches lately. It's always good to have a fresh pair of eyes, so let's make it official. I checked with him, and he's happy to be added. Link: https://lkml.kernel.org/r/20251119110822.46566-1-lance.yang@linux.dev Signed-off-by: Lance Yang Acked-by: Petr Mladek Cc: "Masami Hiramatsu (Google)" Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index a56bc68ab369..0ac71a8b8b4a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11666,6 +11666,7 @@ HUNG TASK DETECTOR M: Andrew Morton R: Lance Yang R: Masami Hiramatsu +R: Petr Mladek L: linux-kernel@vger.kernel.org S: Maintained F: include/linux/hung_task.h From ff713698bad2e7d052960cf182fa1ab465564dfd Mon Sep 17 00:00:00 2001 From: Chia-Liang Wang Date: Wed, 19 Nov 2025 18:11:44 +0800 Subject: [PATCH 079/139] lib: ratelimit: fix spelling mistake 'seperately' Corrects a spelling mistake in a comment in ratelimit.c where 'seperately' was used instead of 'separately'. Link: https://lkml.kernel.org/r/20251119101144.3175-1-a0979625527@icloud.com Signed-off-by: Chia-Liang Wang Signed-off-by: Andrew Morton --- lib/ratelimit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 859c251b23ce..e2d65d3b1c35 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -27,7 +27,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) { /* Paired with WRITE_ONCE() in .proc_handler(). - * Changing two values seperately could be inconsistent + * Changing two values separately could be inconsistent * and some message could be lost. (See: net_ratelimit_state). */ int interval = READ_ONCE(rs->interval); From 4022ba20050e8e2bb189155e0665b60953d7552c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 18 Nov 2025 19:53:44 +0100 Subject: [PATCH 080/139] ocfs2: replace deprecated strcpy in ocfs2_create_xattr_block strcpy() has been deprecated [1] because it performs no bounds checking on the destination buffer, which can lead to buffer overflows. Replace it with the safer strscpy(), and copy directly into '->xb_signature' instead of using the start of the struct as the destination buffer. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strcpy [1] Link: https://lkml.kernel.org/r/20251118185345.132411-2-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Reviewed-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d70a20d29e3e..73c028f452ac 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2908,7 +2908,7 @@ static int ocfs2_create_xattr_block(struct inode *inode, /* Initialize ocfs2_xattr_block */ xblk = (struct ocfs2_xattr_block *)new_bh->b_data; memset(xblk, 0, inode->i_sb->s_blocksize); - strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + strscpy(xblk->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE); xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc); xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); From 13db54aad7442852d57bd384305bc8c10a9dcd1f Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 18 Nov 2025 19:53:45 +0100 Subject: [PATCH 081/139] ocfs2: replace deprecated strcpy with strscpy strcpy() has been deprecated [1] because it performs no bounds checking on the destination buffer, which can lead to buffer overflows. Replace it with the safer strscpy(), and copy directly into '->rf_signature' instead of using the start of the struct as the destination buffer. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strcpy [1] Link: https://lkml.kernel.org/r/20251118185345.132411-3-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Reviewed-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/refcounttree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 267b50e8e42e..c92e0ea85bca 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -621,7 +622,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, /* Initialize ocfs2_refcount_block. */ rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(rb, 0, inode->i_sb->s_blocksize); - strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + strscpy(rb->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE); rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); @@ -1562,7 +1563,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, /* Initialize ocfs2_refcount_block. */ new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(new_rb, 0, sb->s_blocksize); - strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + strscpy(new_rb->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE); new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); From 58b6fcd2ab34399258dc509f701d0986a8e0bcaa Mon Sep 17 00:00:00 2001 From: Ahmet Eray Karadag Date: Tue, 18 Nov 2025 03:18:34 +0300 Subject: [PATCH 082/139] ocfs2: mark inode bad upon validation failure during read A VFS cache inconsistency, potentially triggered by sequences like buffered writes followed by open(O_DIRECT), can result in an invalid on-disk inode block (e.g., bad signature). OCFS2 detects this corruption when reading the inode block via ocfs2_validate_inode_block(), logs "Invalid dinode", and often switches the filesystem to read-only mode. The VFS open(O_DIRECT) operation appears to incorrectly clear the inode's I_DIRTY flag without ensuring the dirty metadata (reflecting the earlier buffered write, e.g., an updated i_size) is flushed to disk. This leaves the in-memory VFS inode object "in limbo" with an updated size (e.g., 38639 from the write) but marked clean, while its on-disk counterpart remains stale (e.g., size 0) or invalid. Currently, the function reading the inode block (ocfs2_read_inode_block_full()) fails to call make_bad_inode() upon detecting the validation error. Because the in-memory inode is not marked bad, subsequent operations (like ftruncate) proceed erroneously. They eventually reach code (e.g., ocfs2_truncate_file()) that compares the inconsistent in-memory size (38639) against the invalid/stale on-disk size (0), leading to kernel crashes via BUG_ON. Fix this by calling make_bad_inode(inode) within the error handling path of ocfs2_read_inode_block_full() immediately after a block read or validation error occurs. This ensures VFS is properly notified about the corrupt inode at the point of detection. Marking the inode bad allows VFS to correctly fail subsequent operations targeting this inode early, preventing kernel panics caused by operating on known inconsistent inode states. Link: https://lkml.kernel.org/r/20251118001833.423470-2-eraykrdg1@gmail.com Link: https://lore.kernel.org/all/20251029225748.11361-2-eraykrdg1@gmail.com/T/ Signed-off-by: Albin Babu Varghese Signed-off-by: Ahmet Eray Karadag Reported-by: syzbot+b93b65ee321c97861072@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=b93b65ee321c97861072 Reviewed-by: Heming Zhao Co-developed-by: Albin Babu Varghese Acked-by: Joseph Qi Cc: David Hunter Cc: Joel Becker Cc: Mark Fasheh Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 8d6c106bcb98..613636789503 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1727,6 +1727,8 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, 1, &tmp, flags, ocfs2_validate_inode_block); + if (rc < 0) + make_bad_inode(inode); /* If ocfs2_read_blocks() got us a new bh, pass it up. */ if (!rc && !*bh) *bh = tmp; From c2d2dad24503d7e2eb7cba354fcc73f95fa78d7a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Nov 2025 14:06:45 +0000 Subject: [PATCH 083/139] rbtree: inline rb_first() Patch series "rbree: inline rb_first() and rb_last()". Inline these two small helpers, heavily used in TCP and FQ packet scheduler, and in many other places. This reduces kernel text size, and brings an 1.5 % improvement on network TCP stress test. This patch (of 2): This is a very small function, inlining it saves cpu cycles by reducing register pressure and removing call/ret overhead. It also reduces vmlinux text size by 744 bytes on a typical x86_64 build. Before: size vmlinux text data bss dec hex filename 34812525 22177365 5685248 62675138 3bc58c2 vmlinux After: size vmlinux text data bss dec hex filename 34811781 22177365 5685248 62674394 3bc55da vmlinux [ojeda@kernel.org: fix rust build] Link: https://lkml.kernel.org/r/20251120085518.1463498-1-ojeda@kernel.org Link: https://lkml.kernel.org/r/20251114140646.3817319-1-edumazet@google.com Link: https://lkml.kernel.org/r/20251114140646.3817319-2-edumazet@google.com Signed-off-by: Eric Dumazet Signed-off-by: Miguel Ojeda Reviewed-by: Kuan-Wei Chiu Cc: Jakub Kacinski Cc: Neal Cardwell Cc: Paolo Abeni Cc: Alice Ryhl Cc: Stehen Rothwell Signed-off-by: Andrew Morton --- include/linux/rbtree.h | 16 +++++++++++++++- lib/rbtree.c | 16 ---------------- rust/helpers/rbtree.c | 5 +++++ 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 8d2ba3749866..484554900f7d 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -43,7 +43,21 @@ extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); -extern struct rb_node *rb_first(const struct rb_root *); + +/* + * This function returns the first node (in sort order) of the tree. + */ +static inline struct rb_node *rb_first(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_left) + n = n->rb_left; + return n; +} extern struct rb_node *rb_last(const struct rb_root *); /* Postorder iteration - always visit the parent after its children */ diff --git a/lib/rbtree.c b/lib/rbtree.c index 5114eda6309c..b946eb4b759d 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -460,22 +460,6 @@ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, } EXPORT_SYMBOL(__rb_insert_augmented); -/* - * This function returns the first node (in sort order) of the tree. - */ -struct rb_node *rb_first(const struct rb_root *root) -{ - struct rb_node *n; - - n = root->rb_node; - if (!n) - return NULL; - while (n->rb_left) - n = n->rb_left; - return n; -} -EXPORT_SYMBOL(rb_first); - struct rb_node *rb_last(const struct rb_root *root) { struct rb_node *n; diff --git a/rust/helpers/rbtree.c b/rust/helpers/rbtree.c index 6d404b84a9b5..d0e452234632 100644 --- a/rust/helpers/rbtree.c +++ b/rust/helpers/rbtree.c @@ -7,3 +7,8 @@ void rust_helper_rb_link_node(struct rb_node *node, struct rb_node *parent, { rb_link_node(node, parent, rb_link); } + +struct rb_node *rust_helper_rb_first(const struct rb_root *root) +{ + return rb_first(root); +} From 94984bfed58ca129f7e259ce09973ed0b3f540a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Nov 2025 14:06:46 +0000 Subject: [PATCH 084/139] rbtree: inline rb_last() This is a very small function, inlining it saves cpu cycles in TCP by reducing register pressure and removing call/ret overhead. It also reduces vmlinux text size by 122 bytes on a typical x86_64 build. Before: size vmlinux text data bss dec hex filename 34811781 22177365 5685248 62674394 3bc55da vmlinux After: size vmlinux text data bss dec hex filename 34811659 22177365 5685248 62674272 3bc5560 vmlinux [ojeda@kernel.org: fix rust build] Link: https://lkml.kernel.org/r/20251120085518.1463498-1-ojeda@kernel.org Link: https://lkml.kernel.org/r/20251114140646.3817319-3-edumazet@google.com Signed-off-by: Eric Dumazet Signed-off-by: Miguel Ojeda Reviewed-by: Kuan-Wei Chiu Cc: Jakub Kacinski Cc: Neal Cardwell Cc: Paolo Abeni Cc: Alice Ryhl Cc: Stehen Rothwell Signed-off-by: Andrew Morton --- include/linux/rbtree.h | 16 +++++++++++++++- lib/rbtree.c | 13 ------------- rust/helpers/rbtree.c | 5 +++++ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 484554900f7d..4091e978aef2 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -58,7 +58,21 @@ static inline struct rb_node *rb_first(const struct rb_root *root) n = n->rb_left; return n; } -extern struct rb_node *rb_last(const struct rb_root *); + +/* + * This function returns the last node (in sort order) of the tree. + */ +static inline struct rb_node *rb_last(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_right) + n = n->rb_right; + return n; +} /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); diff --git a/lib/rbtree.c b/lib/rbtree.c index b946eb4b759d..18d42bcf4ec9 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -460,19 +460,6 @@ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, } EXPORT_SYMBOL(__rb_insert_augmented); -struct rb_node *rb_last(const struct rb_root *root) -{ - struct rb_node *n; - - n = root->rb_node; - if (!n) - return NULL; - while (n->rb_right) - n = n->rb_right; - return n; -} -EXPORT_SYMBOL(rb_last); - struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; diff --git a/rust/helpers/rbtree.c b/rust/helpers/rbtree.c index d0e452234632..2a0eabbb4160 100644 --- a/rust/helpers/rbtree.c +++ b/rust/helpers/rbtree.c @@ -12,3 +12,8 @@ struct rb_node *rust_helper_rb_first(const struct rb_root *root) { return rb_first(root); } + +struct rb_node *rust_helper_rb_last(const struct rb_root *root) +{ + return rb_last(root); +} From 262ef8e55b7ccd435619c1946249131d0f5b72db Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Nov 2025 06:40:15 +0100 Subject: [PATCH 085/139] fork: stop ignoring NUMA while handling cached thread stacks 1. the numa parameter was straight up ignored. 2. nothing was done to check if the to-be-cached/allocated stack matches the local node The id remains ignored on free in case of memoryless nodes. Note the current caching is already bad as the cache keeps overflowing and a different solution is needed for the long run, to be worked out(tm). Stats collected over a kernel build with the patch with the following topology: NUMA node(s): 2 NUMA node0 CPU(s): 0-11 NUMA node1 CPU(s): 12-23 caller's node vs stack backing pages on free: matching: 50083 (70%) mismatched: 21492 (30%) caching efficiency: cached: 32651 (65.2%) dropped: 17432 (34.8%) Link: https://lkml.kernel.org/r/20251120054015.3019419-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Reviewed-by: Linus Walleij Cc: Liam Howlett Cc: Linus Waleij Cc: Lorenzo Stoakes Cc: Pasha Tatashin Cc: Kees Cook Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/fork.c | 63 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..17fcb75ca5d5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -208,15 +208,62 @@ struct vm_stack { struct vm_struct *stack_vm_area; }; +static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node) +{ + struct vm_struct *vm_area; + unsigned int i; + + /* + * If the node has memory, we are guaranteed the stacks are backed by local pages. + * Otherwise the pages are arbitrary. + * + * Note that depending on cpuset it is possible we will get migrated to a different + * node immediately after allocating here, so this does *not* guarantee locality for + * arbitrary callers. + */ + scoped_guard(preempt) { + if (node != NUMA_NO_NODE && numa_node_id() != node) + return NULL; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (vm_area) + return vm_area; + } + } + + return NULL; +} + static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; + int nid; - for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *tmp = NULL; + /* + * Don't cache stacks if any of the pages don't match the local domain, unless + * there is no local memory to begin with. + * + * Note that lack of local memory does not automatically mean it makes no difference + * performance-wise which other domain backs the stack. In this case we are merely + * trying to avoid constantly going to vmalloc. + */ + scoped_guard(preempt) { + nid = numa_node_id(); + if (node_state(nid, N_MEMORY)) { + for (i = 0; i < vm_area->nr_pages; i++) { + struct page *page = vm_area->pages[i]; + if (page_to_nid(page) != nid) + return false; + } + } - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) - return true; + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *tmp = NULL; + + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) + return true; + } } return false; } @@ -283,13 +330,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct vm_struct *vm_area; void *stack; - int i; - - for (i = 0; i < NR_CACHED_STACKS; i++) { - vm_area = this_cpu_xchg(cached_stacks[i], NULL); - if (!vm_area) - continue; + vm_area = alloc_thread_stack_node_from_cache(tsk, node); + if (vm_area) { if (memcg_charge_kernel_stack(vm_area)) { vfree(vm_area->addr); return -ENOMEM; From e6fbd1759c9ece5044d3470f30a5e2166dc9de89 Mon Sep 17 00:00:00 2001 From: Bala-Vignesh-Reddy Date: Thu, 16 Oct 2025 16:14:09 +0530 Subject: [PATCH 086/139] selftests: complete kselftest include centralization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This follow-up patch completes centralization of kselftest.h and ksefltest_harness.h includes in remaining seltests files, replacing all relative paths with a non-relative paths using shared -I include path in lib.mk Tested with gcc-13.3 and clang-18.1, and cross-compiled successfully on riscv, arm64, x86_64 and powerpc arch. [reddybalavignesh9979@gmail.com: add selftests include path for kselftest.h] Link: https://lkml.kernel.org/r/20251017090201.317521-1-reddybalavignesh9979@gmail.com Link: https://lkml.kernel.org/r/20251016104409.68985-1-reddybalavignesh9979@gmail.com Signed-off-by: Bala-Vignesh-Reddy Suggested-by: Andrew Morton Link: https://lore.kernel.org/lkml/20250820143954.33d95635e504e94df01930d0@linux-foundation.org/ Reviewed-by: Wei Yang Cc: David Hildenbrand Cc: David S. Miller Cc: Eric Dumazet Cc: Günther Noack Cc: Jakub Kacinski Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mickael Salaun Cc: Ming Lei Cc: Paolo Abeni Cc: Shuah Khan Cc: Simon Horman Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- samples/vfs/Makefile | 1 + tools/testing/selftests/acct/acct_syscall.c | 2 +- tools/testing/selftests/alsa/conf.c | 2 +- tools/testing/selftests/alsa/mixer-test.c | 2 +- tools/testing/selftests/alsa/pcm-test.c | 2 +- tools/testing/selftests/alsa/test-pcmtest-driver.c | 2 +- tools/testing/selftests/alsa/utimer-test.c | 2 +- tools/testing/selftests/arm64/abi/hwcap.c | 2 +- tools/testing/selftests/arm64/abi/ptrace.c | 2 +- tools/testing/selftests/arm64/abi/syscall-abi.c | 2 +- tools/testing/selftests/arm64/fp/fp-ptrace.c | 2 +- tools/testing/selftests/arm64/fp/fp-stress.c | 2 +- tools/testing/selftests/arm64/fp/sve-probe-vls.c | 2 +- tools/testing/selftests/arm64/fp/sve-ptrace.c | 2 +- tools/testing/selftests/arm64/fp/vec-syscfg.c | 2 +- tools/testing/selftests/arm64/fp/za-ptrace.c | 2 +- tools/testing/selftests/arm64/fp/zt-ptrace.c | 2 +- tools/testing/selftests/arm64/gcs/gcs-stress.c | 2 +- tools/testing/selftests/arm64/pauth/pac.c | 2 +- tools/testing/selftests/arm64/tags/tags_test.c | 2 +- tools/testing/selftests/bpf/xskxceiver.c | 2 +- tools/testing/selftests/breakpoints/breakpoint_test.c | 2 +- tools/testing/selftests/breakpoints/breakpoint_test_arm64.c | 2 +- tools/testing/selftests/breakpoints/step_after_suspend_test.c | 2 +- tools/testing/selftests/cachestat/test_cachestat.c | 2 +- tools/testing/selftests/capabilities/test_execve.c | 2 +- tools/testing/selftests/capabilities/validate_cap.c | 2 +- tools/testing/selftests/cgroup/test_core.c | 2 +- tools/testing/selftests/cgroup/test_cpu.c | 2 +- tools/testing/selftests/cgroup/test_cpuset.c | 2 +- tools/testing/selftests/cgroup/test_freezer.c | 2 +- tools/testing/selftests/cgroup/test_hugetlb_memcg.c | 2 +- tools/testing/selftests/cgroup/test_kill.c | 2 +- tools/testing/selftests/cgroup/test_kmem.c | 2 +- tools/testing/selftests/cgroup/test_memcontrol.c | 2 +- tools/testing/selftests/cgroup/test_pids.c | 2 +- tools/testing/selftests/cgroup/test_zswap.c | 2 +- tools/testing/selftests/clone3/clone3.c | 2 +- .../testing/selftests/clone3/clone3_cap_checkpoint_restore.c | 2 +- tools/testing/selftests/clone3/clone3_clear_sighand.c | 2 +- tools/testing/selftests/clone3/clone3_selftests.h | 2 +- tools/testing/selftests/clone3/clone3_set_tid.c | 2 +- tools/testing/selftests/connector/proc_filter.c | 2 +- tools/testing/selftests/core/close_range_test.c | 2 +- tools/testing/selftests/core/unshare_test.c | 2 +- tools/testing/selftests/coredump/stackdump_test.c | 2 +- tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c | 2 +- tools/testing/selftests/drivers/dma-buf/udmabuf.c | 2 +- tools/testing/selftests/drivers/ntsync/ntsync.c | 2 +- .../testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c | 2 +- tools/testing/selftests/exec/check-exec.c | 2 +- tools/testing/selftests/exec/execveat.c | 2 +- tools/testing/selftests/exec/load_address.c | 2 +- tools/testing/selftests/exec/non-regular.c | 2 +- tools/testing/selftests/exec/null-argv.c | 2 +- tools/testing/selftests/exec/recursion-depth.c | 2 +- tools/testing/selftests/fchmodat2/fchmodat2_test.c | 2 +- tools/testing/selftests/filelock/ofdlocks.c | 2 +- tools/testing/selftests/filesystems/anon_inode_test.c | 2 +- tools/testing/selftests/filesystems/binderfs/binderfs_test.c | 2 +- tools/testing/selftests/filesystems/devpts_pts.c | 2 +- tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c | 2 +- tools/testing/selftests/filesystems/eventfd/eventfd_test.c | 2 +- tools/testing/selftests/filesystems/fclog.c | 2 +- tools/testing/selftests/filesystems/file_stressor.c | 2 +- tools/testing/selftests/filesystems/fuse/fusectl_test.c | 2 +- tools/testing/selftests/filesystems/kernfs_test.c | 2 +- .../selftests/filesystems/mount-notify/mount-notify_test.c | 2 +- .../selftests/filesystems/mount-notify/mount-notify_test_ns.c | 2 +- tools/testing/selftests/filesystems/nsfs/iterate_mntns.c | 2 +- tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c | 2 +- .../selftests/filesystems/overlayfs/set_layers_via_fds.c | 2 +- .../testing/selftests/filesystems/statmount/listmount_test.c | 2 +- .../testing/selftests/filesystems/statmount/statmount_test.c | 2 +- .../selftests/filesystems/statmount/statmount_test_ns.c | 2 +- tools/testing/selftests/filesystems/utils.c | 2 +- tools/testing/selftests/futex/functional/futex_numa_mpol.c | 2 +- tools/testing/selftests/futex/functional/futex_priv_hash.c | 2 +- tools/testing/selftests/futex/functional/futex_requeue.c | 2 +- tools/testing/selftests/futex/functional/futex_requeue_pi.c | 2 +- .../futex/functional/futex_requeue_pi_mismatched_ops.c | 2 +- .../futex/functional/futex_requeue_pi_signal_restart.c | 2 +- tools/testing/selftests/futex/functional/futex_wait.c | 2 +- .../futex/functional/futex_wait_private_mapped_file.c | 2 +- tools/testing/selftests/futex/functional/futex_wait_timeout.c | 2 +- .../futex/functional/futex_wait_uninitialized_heap.c | 2 +- .../selftests/futex/functional/futex_wait_wouldblock.c | 2 +- tools/testing/selftests/futex/functional/futex_waitv.c | 2 +- tools/testing/selftests/hid/hid_common.h | 2 +- tools/testing/selftests/intel_pstate/aperf.c | 2 +- tools/testing/selftests/iommu/iommufd_utils.h | 2 +- tools/testing/selftests/ipc/msgque.c | 2 +- tools/testing/selftests/ir/ir_loopback.c | 2 +- tools/testing/selftests/kcmp/kcmp_test.c | 2 +- tools/testing/selftests/kselftest_harness.h | 2 +- tools/testing/selftests/kselftest_harness/harness-selftest.c | 2 +- tools/testing/selftests/landlock/audit.h | 2 +- tools/testing/selftests/landlock/common.h | 2 +- tools/testing/selftests/lib.mk | 3 +++ tools/testing/selftests/lsm/lsm_get_self_attr_test.c | 2 +- tools/testing/selftests/lsm/lsm_list_modules_test.c | 2 +- tools/testing/selftests/lsm/lsm_set_self_attr_test.c | 2 +- tools/testing/selftests/media_tests/media_device_open.c | 2 +- tools/testing/selftests/media_tests/media_device_test.c | 2 +- tools/testing/selftests/membarrier/membarrier_test_impl.h | 2 +- tools/testing/selftests/mincore/mincore_selftest.c | 4 ++-- tools/testing/selftests/mm/compaction_test.c | 2 +- tools/testing/selftests/mm/cow.c | 2 +- tools/testing/selftests/mm/droppable.c | 2 +- tools/testing/selftests/mm/guard-regions.c | 2 +- tools/testing/selftests/mm/gup_longterm.c | 2 +- tools/testing/selftests/mm/gup_test.c | 2 +- tools/testing/selftests/mm/hmm-tests.c | 2 +- tools/testing/selftests/mm/hugepage-mmap.c | 2 +- tools/testing/selftests/mm/hugepage-mremap.c | 2 +- tools/testing/selftests/mm/hugetlb-madvise.c | 2 +- tools/testing/selftests/mm/hugetlb-read-hwpoison.c | 2 +- tools/testing/selftests/mm/hugetlb-soft-offline.c | 2 +- tools/testing/selftests/mm/hugetlb_dio.c | 2 +- tools/testing/selftests/mm/hugetlb_fault_after_madv.c | 2 +- tools/testing/selftests/mm/hugetlb_madv_vs_map.c | 2 +- tools/testing/selftests/mm/ksm_functional_tests.c | 2 +- tools/testing/selftests/mm/ksm_tests.c | 2 +- tools/testing/selftests/mm/madv_populate.c | 2 +- tools/testing/selftests/mm/map_fixed_noreplace.c | 2 +- tools/testing/selftests/mm/map_hugetlb.c | 2 +- tools/testing/selftests/mm/map_populate.c | 2 +- tools/testing/selftests/mm/mdwe_test.c | 2 +- tools/testing/selftests/mm/memfd_secret.c | 2 +- tools/testing/selftests/mm/merge.c | 2 +- tools/testing/selftests/mm/migration.c | 2 +- tools/testing/selftests/mm/mkdirty.c | 2 +- tools/testing/selftests/mm/mlock-random-test.c | 2 +- tools/testing/selftests/mm/mlock2-tests.c | 2 +- tools/testing/selftests/mm/mrelease_test.c | 2 +- tools/testing/selftests/mm/mremap_dontunmap.c | 2 +- tools/testing/selftests/mm/mremap_test.c | 2 +- tools/testing/selftests/mm/mseal_test.c | 2 +- tools/testing/selftests/mm/on-fault-limit.c | 2 +- tools/testing/selftests/mm/pagemap_ioctl.c | 2 +- tools/testing/selftests/mm/pfnmap.c | 2 +- tools/testing/selftests/mm/pkey-helpers.h | 2 +- tools/testing/selftests/mm/prctl_thp_disable.c | 2 +- tools/testing/selftests/mm/process_madv.c | 2 +- tools/testing/selftests/mm/rmap.c | 2 +- tools/testing/selftests/mm/soft-dirty.c | 2 +- tools/testing/selftests/mm/split_huge_page_test.c | 2 +- tools/testing/selftests/mm/thuge-gen.c | 2 +- tools/testing/selftests/mm/transhuge-stress.c | 2 +- tools/testing/selftests/mm/uffd-common.h | 2 +- tools/testing/selftests/mm/uffd-wp-mremap.c | 2 +- tools/testing/selftests/mm/va_high_addr_switch.c | 2 +- tools/testing/selftests/mm/virtual_address_range.c | 2 +- tools/testing/selftests/mm/vm_util.c | 2 +- tools/testing/selftests/mm/vm_util.h | 2 +- tools/testing/selftests/mount_setattr/mount_setattr_test.c | 2 +- .../move_mount_set_group/move_mount_set_group_test.c | 2 +- tools/testing/selftests/mqueue/mq_open_tests.c | 2 +- tools/testing/selftests/mqueue/mq_perf_tests.c | 2 +- .../selftests/mseal_system_mappings/sysmap_is_sealed.c | 4 ++-- tools/testing/selftests/namespaces/file_handle_test.c | 2 +- tools/testing/selftests/namespaces/init_ino_test.c | 2 +- tools/testing/selftests/namespaces/nsid_test.c | 2 +- tools/testing/selftests/nci/nci_dev.c | 2 +- tools/testing/selftests/net/af_unix/diag_uid.c | 2 +- tools/testing/selftests/net/af_unix/msg_oob.c | 2 +- tools/testing/selftests/net/af_unix/scm_inq.c | 2 +- tools/testing/selftests/net/af_unix/scm_pidfd.c | 2 +- tools/testing/selftests/net/af_unix/scm_rights.c | 2 +- tools/testing/selftests/net/af_unix/unix_connect.c | 2 +- tools/testing/selftests/net/bind_timewait.c | 2 +- tools/testing/selftests/net/bind_wildcard.c | 2 +- tools/testing/selftests/net/can/test_raw_filter.c | 2 +- tools/testing/selftests/net/cmsg_sender.c | 2 +- tools/testing/selftests/net/epoll_busy_poll.c | 2 +- tools/testing/selftests/net/gro.c | 2 +- tools/testing/selftests/net/ip_local_port_range.c | 2 +- tools/testing/selftests/net/ipsec.c | 2 +- tools/testing/selftests/net/ipv6_fragmentation.c | 2 +- tools/testing/selftests/net/netfilter/conntrack_dump_flush.c | 2 +- tools/testing/selftests/net/netlink-dumps.c | 2 +- tools/testing/selftests/net/ovpn/ovpn-cli.c | 2 +- tools/testing/selftests/net/proc_net_pktgen.c | 2 +- tools/testing/selftests/net/psock_fanout.c | 2 +- tools/testing/selftests/net/psock_tpacket.c | 2 +- tools/testing/selftests/net/reuseaddr_ports_exhausted.c | 2 +- tools/testing/selftests/net/reuseport_bpf.c | 2 +- tools/testing/selftests/net/reuseport_bpf_numa.c | 2 +- tools/testing/selftests/net/rxtimestamp.c | 2 +- tools/testing/selftests/net/sk_so_peek_off.c | 2 +- tools/testing/selftests/net/so_incoming_cpu.c | 2 +- tools/testing/selftests/net/socket.c | 2 +- tools/testing/selftests/net/tap.c | 2 +- tools/testing/selftests/net/tcp_ao/lib/setup.c | 2 +- tools/testing/selftests/net/tcp_fastopen_backup_key.c | 2 +- tools/testing/selftests/net/tcp_port_share.c | 2 +- tools/testing/selftests/net/tls.c | 2 +- tools/testing/selftests/net/toeplitz.c | 2 +- tools/testing/selftests/net/tun.c | 2 +- tools/testing/selftests/net/udpgso_bench_tx.c | 2 +- tools/testing/selftests/openat2/helpers.h | 2 +- tools/testing/selftests/openat2/openat2_test.c | 2 +- tools/testing/selftests/openat2/rename_attack_test.c | 2 +- tools/testing/selftests/openat2/resolve_test.c | 2 +- tools/testing/selftests/pci_endpoint/pci_endpoint_test.c | 2 +- tools/testing/selftests/perf_events/mmap.c | 2 +- tools/testing/selftests/perf_events/remove_on_exec.c | 2 +- tools/testing/selftests/perf_events/sigtrap_threads.c | 2 +- tools/testing/selftests/perf_events/watermark_signal.c | 2 +- tools/testing/selftests/pid_namespace/pid_max.c | 2 +- tools/testing/selftests/pid_namespace/regression_enomem.c | 2 +- tools/testing/selftests/pidfd/pidfd.h | 2 +- tools/testing/selftests/pidfd/pidfd_bind_mount.c | 2 +- tools/testing/selftests/pidfd/pidfd_fdinfo_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_file_handle_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_getfd_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_info_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_open_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_poll_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_setattr_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_setns_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_wait.c | 2 +- tools/testing/selftests/pidfd/pidfd_xattr_test.c | 2 +- tools/testing/selftests/prctl/set-anon-vma-name-test.c | 2 +- tools/testing/selftests/prctl/set-process-name.c | 2 +- tools/testing/selftests/proc/proc-maps-race.c | 2 +- tools/testing/selftests/proc/proc-pid-vm.c | 2 +- tools/testing/selftests/proc/proc-pidns.c | 2 +- tools/testing/selftests/ptrace/get_set_sud.c | 2 +- tools/testing/selftests/ptrace/get_syscall_info.c | 2 +- tools/testing/selftests/ptrace/set_syscall_info.c | 2 +- tools/testing/selftests/ptrace/vmaccess.c | 2 +- tools/testing/selftests/resctrl/resctrl.h | 2 +- tools/testing/selftests/ring-buffer/map_test.c | 2 +- tools/testing/selftests/riscv/abi/pointer_masking.c | 2 +- tools/testing/selftests/riscv/hwprobe/cbo.c | 2 +- tools/testing/selftests/riscv/hwprobe/hwprobe.c | 2 +- tools/testing/selftests/riscv/hwprobe/which-cpus.c | 2 +- tools/testing/selftests/riscv/mm/mmap_bottomup.c | 2 +- tools/testing/selftests/riscv/mm/mmap_default.c | 2 +- tools/testing/selftests/riscv/mm/mmap_test.h | 2 +- tools/testing/selftests/riscv/sigreturn/sigreturn.c | 2 +- tools/testing/selftests/riscv/vector/v_initval.c | 2 +- tools/testing/selftests/riscv/vector/vstate_prctl.c | 2 +- tools/testing/selftests/rseq/basic_percpu_ops_test.c | 2 +- tools/testing/selftests/rseq/rseq.c | 2 +- tools/testing/selftests/rtc/rtctest.c | 2 +- tools/testing/selftests/seccomp/seccomp_benchmark.c | 2 +- tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +- tools/testing/selftests/sgx/main.c | 2 +- tools/testing/selftests/signal/mangle_uc_sigmask.c | 2 +- tools/testing/selftests/signal/sas.c | 2 +- tools/testing/selftests/sparc64/drivers/adi-test.c | 2 +- tools/testing/selftests/sync/sync_test.c | 2 +- tools/testing/selftests/syscall_user_dispatch/sud_test.c | 2 +- tools/testing/selftests/tdx/tdx_guest_test.c | 2 +- tools/testing/selftests/timens/timens.h | 2 +- tools/testing/selftests/timers/adjtick.c | 2 +- tools/testing/selftests/timers/alarmtimer-suspend.c | 2 +- tools/testing/selftests/timers/change_skew.c | 2 +- tools/testing/selftests/timers/clocksource-switch.c | 2 +- tools/testing/selftests/timers/freq-step.c | 2 +- tools/testing/selftests/timers/inconsistency-check.c | 2 +- tools/testing/selftests/timers/leap-a-day.c | 2 +- tools/testing/selftests/timers/leapcrash.c | 2 +- tools/testing/selftests/timers/mqueue-lat.c | 2 +- tools/testing/selftests/timers/nanosleep.c | 2 +- tools/testing/selftests/timers/nsleep-lat.c | 2 +- tools/testing/selftests/timers/posix_timers.c | 2 +- tools/testing/selftests/timers/raw_skew.c | 2 +- tools/testing/selftests/timers/rtcpie.c | 2 +- tools/testing/selftests/timers/set-2038.c | 2 +- tools/testing/selftests/timers/set-tai.c | 2 +- tools/testing/selftests/timers/set-timer-lat.c | 2 +- tools/testing/selftests/timers/set-tz.c | 2 +- tools/testing/selftests/timers/skew_consistency.c | 2 +- tools/testing/selftests/timers/threadtest.c | 2 +- tools/testing/selftests/timers/valid-adjtimex.c | 2 +- tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c | 2 +- tools/testing/selftests/tty/tty_tstamp_update.c | 2 +- tools/testing/selftests/uevent/uevent_filtering.c | 2 +- tools/testing/selftests/user_events/abi_test.c | 2 +- tools/testing/selftests/user_events/dyn_test.c | 2 +- tools/testing/selftests/user_events/ftrace_test.c | 2 +- tools/testing/selftests/user_events/perf_test.c | 2 +- tools/testing/selftests/user_events/user_events_selftests.h | 2 +- tools/testing/selftests/vDSO/vdso_test_abi.c | 2 +- tools/testing/selftests/vDSO/vdso_test_chacha.c | 2 +- tools/testing/selftests/vDSO/vdso_test_correctness.c | 2 +- tools/testing/selftests/vDSO/vdso_test_getcpu.c | 2 +- tools/testing/selftests/vDSO/vdso_test_getrandom.c | 2 +- tools/testing/selftests/vDSO/vdso_test_gettimeofday.c | 2 +- tools/testing/selftests/vfio/lib/include/vfio_util.h | 2 +- tools/testing/selftests/vfio/lib/vfio_pci_device.c | 2 +- tools/testing/selftests/vfio/lib/vfio_pci_driver.c | 2 +- tools/testing/selftests/vfio/vfio_dma_mapping_test.c | 2 +- tools/testing/selftests/vfio/vfio_iommufd_setup_test.c | 2 +- tools/testing/selftests/vfio/vfio_pci_device_test.c | 2 +- tools/testing/selftests/vfio/vfio_pci_driver_test.c | 2 +- tools/testing/selftests/x86/corrupt_xstate_header.c | 2 +- tools/testing/selftests/x86/helpers.h | 2 +- tools/testing/selftests/x86/lam.c | 2 +- tools/testing/selftests/x86/syscall_numbering.c | 2 +- tools/testing/selftests/x86/test_mremap_vdso.c | 2 +- tools/testing/selftests/x86/test_vsyscall.c | 2 +- tools/testing/selftests/x86/xstate.h | 2 +- 307 files changed, 311 insertions(+), 307 deletions(-) diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile index 6554b73a75c8..9256ca5d762b 100644 --- a/samples/vfs/Makefile +++ b/samples/vfs/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only userprogs-always-y += test-fsmount test-statx mountinfo test-list-all-mounts +userccflags += -I $(srctree)/tools/testing/selftests/ userccflags += -I usr/include diff --git a/tools/testing/selftests/acct/acct_syscall.c b/tools/testing/selftests/acct/acct_syscall.c index 87c044fb9293..421adbdc299d 100644 --- a/tools/testing/selftests/acct/acct_syscall.c +++ b/tools/testing/selftests/acct/acct_syscall.c @@ -9,7 +9,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" int main(void) { diff --git a/tools/testing/selftests/alsa/conf.c b/tools/testing/selftests/alsa/conf.c index e2b3a5810f47..5b7c83fe87b3 100644 --- a/tools/testing/selftests/alsa/conf.c +++ b/tools/testing/selftests/alsa/conf.c @@ -14,7 +14,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "alsa-local.h" #define SYSFS_ROOT "/sys" diff --git a/tools/testing/selftests/alsa/mixer-test.c b/tools/testing/selftests/alsa/mixer-test.c index e113dafa5c24..d4f845c32804 100644 --- a/tools/testing/selftests/alsa/mixer-test.c +++ b/tools/testing/selftests/alsa/mixer-test.c @@ -25,7 +25,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "alsa-local.h" #define TESTS_PER_CONTROL 7 diff --git a/tools/testing/selftests/alsa/pcm-test.c b/tools/testing/selftests/alsa/pcm-test.c index ce92548670c8..ee04ccef7d7c 100644 --- a/tools/testing/selftests/alsa/pcm-test.c +++ b/tools/testing/selftests/alsa/pcm-test.c @@ -17,7 +17,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "alsa-local.h" typedef struct timespec timestamp_t; diff --git a/tools/testing/selftests/alsa/test-pcmtest-driver.c b/tools/testing/selftests/alsa/test-pcmtest-driver.c index ca81afa4ee90..95065ef3b441 100644 --- a/tools/testing/selftests/alsa/test-pcmtest-driver.c +++ b/tools/testing/selftests/alsa/test-pcmtest-driver.c @@ -7,7 +7,7 @@ */ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define CH_NUM 4 diff --git a/tools/testing/selftests/alsa/utimer-test.c b/tools/testing/selftests/alsa/utimer-test.c index 37964f311a33..c45cb226bd8f 100644 --- a/tools/testing/selftests/alsa/utimer-test.c +++ b/tools/testing/selftests/alsa/utimer-test.c @@ -6,7 +6,7 @@ * * Author: Ivan Orlov */ -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include #include diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index 3b96d090c5eb..c41640f18e4e 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -19,7 +19,7 @@ #include -#include "../../kselftest.h" +#include "kselftest.h" #define TESTS_PER_HWCAP 3 diff --git a/tools/testing/selftests/arm64/abi/ptrace.c b/tools/testing/selftests/arm64/abi/ptrace.c index b51d21f78cf9..0e46ac21c81d 100644 --- a/tools/testing/selftests/arm64/abi/ptrace.c +++ b/tools/testing/selftests/arm64/abi/ptrace.c @@ -18,7 +18,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" #define EXPECTED_TESTS 11 diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c index 5ec9a18ec802..b67e3e26fa6d 100644 --- a/tools/testing/selftests/arm64/abi/syscall-abi.c +++ b/tools/testing/selftests/arm64/abi/syscall-abi.c @@ -16,7 +16,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" #include "syscall-abi.h" diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index a85c19e9524e..d366934e3d5f 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -27,7 +27,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" #include "fp-ptrace.h" diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index 9349aa630c84..65e01aba96ff 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -24,7 +24,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" #define MAX_VLS 16 diff --git a/tools/testing/selftests/arm64/fp/sve-probe-vls.c b/tools/testing/selftests/arm64/fp/sve-probe-vls.c index a24eca7a4ecb..df0c1b6eb114 100644 --- a/tools/testing/selftests/arm64/fp/sve-probe-vls.c +++ b/tools/testing/selftests/arm64/fp/sve-probe-vls.c @@ -12,7 +12,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" #include "rdvl.h" int main(int argc, char **argv) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index e0fc3a001e28..645c62de0f90 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -19,7 +19,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" /* and don't like each other, so: */ #ifndef NT_ARM_SVE diff --git a/tools/testing/selftests/arm64/fp/vec-syscfg.c b/tools/testing/selftests/arm64/fp/vec-syscfg.c index 2d75d342eeb9..8dd932fdcdc4 100644 --- a/tools/testing/selftests/arm64/fp/vec-syscfg.c +++ b/tools/testing/selftests/arm64/fp/vec-syscfg.c @@ -19,7 +19,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" #include "rdvl.h" #define ARCH_MIN_VL SVE_VL_MIN diff --git a/tools/testing/selftests/arm64/fp/za-ptrace.c b/tools/testing/selftests/arm64/fp/za-ptrace.c index 08c777f87ea2..787eed22d059 100644 --- a/tools/testing/selftests/arm64/fp/za-ptrace.c +++ b/tools/testing/selftests/arm64/fp/za-ptrace.c @@ -18,7 +18,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" /* and don't like each other, so: */ #ifndef NT_ARM_ZA diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c index a7f34040fbf1..f3fa49fd0fbd 100644 --- a/tools/testing/selftests/arm64/fp/zt-ptrace.c +++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c @@ -18,7 +18,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" /* and don't like each other, so: */ #ifndef NT_ARM_ZA diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c index cf316d78ea97..86d8cd42aee7 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-stress.c +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -24,7 +24,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" struct child_data { char *name, *output; diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c index 6d21b2fc758d..67d138057707 100644 --- a/tools/testing/selftests/arm64/pauth/pac.c +++ b/tools/testing/selftests/arm64/pauth/pac.c @@ -10,7 +10,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #include "helper.h" #define PAC_COLLISION_ATTEMPTS 1000 diff --git a/tools/testing/selftests/arm64/tags/tags_test.c b/tools/testing/selftests/arm64/tags/tags_test.c index 8ae26e496c89..375ab47f0edb 100644 --- a/tools/testing/selftests/arm64/tags/tags_test.c +++ b/tools/testing/selftests/arm64/tags/tags_test.c @@ -6,7 +6,7 @@ #include #include #include -#include "../../kselftest.h" +#include "kselftest.h" #define SHIFT_TAG(tag) ((uint64_t)(tag) << 56) #define SET_TAG(ptr, tag) (((uint64_t)(ptr) & ~SHIFT_TAG(0xff)) | \ diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 352adc8df2d1..03ce85870035 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -104,7 +104,7 @@ #include "xskxceiver.h" #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "xsk_xdp_common.h" #include diff --git a/tools/testing/selftests/breakpoints/breakpoint_test.c b/tools/testing/selftests/breakpoints/breakpoint_test.c index d46962a24724..1159d81890c2 100644 --- a/tools/testing/selftests/breakpoints/breakpoint_test.c +++ b/tools/testing/selftests/breakpoints/breakpoint_test.c @@ -18,7 +18,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define COUNT_ISN_BPS 4 #define COUNT_WPS 4 diff --git a/tools/testing/selftests/breakpoints/breakpoint_test_arm64.c b/tools/testing/selftests/breakpoints/breakpoint_test_arm64.c index e7041816085a..5fc0f37f3fd4 100644 --- a/tools/testing/selftests/breakpoints/breakpoint_test_arm64.c +++ b/tools/testing/selftests/breakpoints/breakpoint_test_arm64.c @@ -26,7 +26,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" static volatile uint8_t var[96] __attribute__((__aligned__(32))); diff --git a/tools/testing/selftests/breakpoints/step_after_suspend_test.c b/tools/testing/selftests/breakpoints/step_after_suspend_test.c index 8d233ac95696..ca2aaab9e4ca 100644 --- a/tools/testing/selftests/breakpoints/step_after_suspend_test.c +++ b/tools/testing/selftests/breakpoints/step_after_suspend_test.c @@ -19,7 +19,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" void child(int cpu) { diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c index ab838bcb9ec5..542cd09cb443 100644 --- a/tools/testing/selftests/cachestat/test_cachestat.c +++ b/tools/testing/selftests/cachestat/test_cachestat.c @@ -16,7 +16,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define NR_TESTS 9 diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c index 47bad7ddc5bc..46fc8d46b6e6 100644 --- a/tools/testing/selftests/capabilities/test_execve.c +++ b/tools/testing/selftests/capabilities/test_execve.c @@ -18,7 +18,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" static int nerrs; static pid_t mpid; /* main() pid is used to avoid duplicate test counts */ diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c index 65f2a1c89239..cef1d9937b9f 100644 --- a/tools/testing/selftests/capabilities/validate_cap.c +++ b/tools/testing/selftests/capabilities/validate_cap.c @@ -7,7 +7,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 19) # define HAVE_GETAUXVAL diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index a360e2eb2eef..5e5b8c4b8c0e 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -17,7 +17,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" static bool nsdelegate; diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index d54e2317efff..7d77d3d43c8e 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -11,7 +11,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" enum hog_clock_type { diff --git a/tools/testing/selftests/cgroup/test_cpuset.c b/tools/testing/selftests/cgroup/test_cpuset.c index 4034d14ba69a..8094091a5857 100644 --- a/tools/testing/selftests/cgroup/test_cpuset.c +++ b/tools/testing/selftests/cgroup/test_cpuset.c @@ -3,7 +3,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" static int idle_process_fn(const char *cgroup, void *arg) diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index dfb763819581..714c963aa3f5 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -11,7 +11,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" #define DEBUG diff --git a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c index 856f9508ea56..f451aa449be6 100644 --- a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c +++ b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c @@ -7,7 +7,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" #define ADDR ((void *)(0x0UL)) diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c index 0e5bb6c7307a..a4dd326ced79 100644 --- a/tools/testing/selftests/cgroup/test_kill.c +++ b/tools/testing/selftests/cgroup/test_kill.c @@ -9,7 +9,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "../pidfd/pidfd.h" #include "cgroup_util.h" diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 63b3c9aad399..005a142f3492 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -14,7 +14,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index a680f773f2d5..2e9d78ab641c 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -18,7 +18,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" static bool has_localevents; diff --git a/tools/testing/selftests/cgroup/test_pids.c b/tools/testing/selftests/cgroup/test_pids.c index d8a1d1cd5007..9a387c815d2c 100644 --- a/tools/testing/selftests/cgroup/test_pids.c +++ b/tools/testing/selftests/cgroup/test_pids.c @@ -9,7 +9,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" static int run_success(const char *cgroup, void *arg) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index e1f578ca2841..ab865d900791 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -10,7 +10,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" static int read_int(const char *path, size_t *value) diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index e61f07973ce5..289e0c7c1f09 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -18,7 +18,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "clone3_selftests.h" enum test_mode { diff --git a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c index 3c196fa86c99..e82281efa273 100644 --- a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c +++ b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c @@ -24,7 +24,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "clone3_selftests.h" static void child_exit(int ret) diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c index ce0426786828..de0c9d62015d 100644 --- a/tools/testing/selftests/clone3/clone3_clear_sighand.c +++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c @@ -13,7 +13,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "clone3_selftests.h" static void nop_handler(int signo) diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h index eeca8005723f..a0593e8950f0 100644 --- a/tools/testing/selftests/clone3/clone3_selftests.h +++ b/tools/testing/selftests/clone3/clone3_selftests.h @@ -11,7 +11,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c index bfb0da2b4fdd..5c944aee6b41 100644 --- a/tools/testing/selftests/clone3/clone3_set_tid.c +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -20,7 +20,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "clone3_selftests.h" #define MAX_PID_NS_LEVEL 32 diff --git a/tools/testing/selftests/connector/proc_filter.c b/tools/testing/selftests/connector/proc_filter.c index 4a825b997666..36c11467a8f1 100644 --- a/tools/testing/selftests/connector/proc_filter.c +++ b/tools/testing/selftests/connector/proc_filter.c @@ -16,7 +16,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define NL_MESSAGE_SIZE (sizeof(struct nlmsghdr) + sizeof(struct cn_msg) + \ sizeof(struct proc_input)) diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index e0d9851fe1c9..f14eca63f20c 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -14,7 +14,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "../clone3/clone3_selftests.h" diff --git a/tools/testing/selftests/core/unshare_test.c b/tools/testing/selftests/core/unshare_test.c index 7fec9dfb1b0e..ffce75a6c228 100644 --- a/tools/testing/selftests/core/unshare_test.c +++ b/tools/testing/selftests/core/unshare_test.c @@ -14,7 +14,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "../clone3/clone3_selftests.h" TEST(unshare_EMFILE) diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index a4ac80bb1003..68339f7e6b56 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -19,7 +19,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "../filesystems/wrappers.h" #include "../pidfd/pidfd.h" diff --git a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c index 5d0a809dc2df..fc9694fc4e89 100644 --- a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c +++ b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c @@ -15,7 +15,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define DEVPATH "/dev/dma_heap" diff --git a/tools/testing/selftests/drivers/dma-buf/udmabuf.c b/tools/testing/selftests/drivers/dma-buf/udmabuf.c index 77aa2897e79f..d78aec662586 100644 --- a/tools/testing/selftests/drivers/dma-buf/udmabuf.c +++ b/tools/testing/selftests/drivers/dma-buf/udmabuf.c @@ -16,7 +16,7 @@ #include #include #include -#include "../../kselftest.h" +#include "kselftest.h" #define TEST_PREFIX "drivers/dma-buf/udmabuf" #define NUM_PAGES 4 diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index 3aad311574c4..e6a37214aa46 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -12,7 +12,7 @@ #include #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" static int read_sem_state(int sem, __u32 *count, __u32 *max) { diff --git a/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c b/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c index 7ee7492138c6..14df9aa07308 100644 --- a/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c +++ b/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c @@ -14,7 +14,7 @@ #include -#include "../../../kselftest_harness.h" +#include "kselftest_harness.h" #define UV_PATH "/dev/uv" #define BUFFER_SIZE 0x200 diff --git a/tools/testing/selftests/exec/check-exec.c b/tools/testing/selftests/exec/check-exec.c index 55bce47e56b7..f2397e75aa7c 100644 --- a/tools/testing/selftests/exec/check-exec.c +++ b/tools/testing/selftests/exec/check-exec.c @@ -30,7 +30,7 @@ #define _ASM_GENERIC_FCNTL_H #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static int sys_execveat(int dirfd, const char *pathname, char *const argv[], char *const envp[], int flags) diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c index 8fb7395fd35b..d37c068ed5fe 100644 --- a/tools/testing/selftests/exec/execveat.c +++ b/tools/testing/selftests/exec/execveat.c @@ -21,7 +21,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define TESTS_EXPECTED 54 #define TEST_NAME_LEN (PATH_MAX * 4) diff --git a/tools/testing/selftests/exec/load_address.c b/tools/testing/selftests/exec/load_address.c index 8257fddba8c8..55fd3732f029 100644 --- a/tools/testing/selftests/exec/load_address.c +++ b/tools/testing/selftests/exec/load_address.c @@ -6,7 +6,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" struct Statistics { unsigned long long load_address; diff --git a/tools/testing/selftests/exec/non-regular.c b/tools/testing/selftests/exec/non-regular.c index cd3a34aca93e..14ac36487df5 100644 --- a/tools/testing/selftests/exec/non-regular.c +++ b/tools/testing/selftests/exec/non-regular.c @@ -9,7 +9,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" /* Remove a file, ignoring the result if it didn't exist. */ void rm(struct __test_metadata *_metadata, const char *pathname, diff --git a/tools/testing/selftests/exec/null-argv.c b/tools/testing/selftests/exec/null-argv.c index c19726e710d1..4940aee5bb38 100644 --- a/tools/testing/selftests/exec/null-argv.c +++ b/tools/testing/selftests/exec/null-argv.c @@ -5,7 +5,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define FORK(exec) \ do { \ diff --git a/tools/testing/selftests/exec/recursion-depth.c b/tools/testing/selftests/exec/recursion-depth.c index 438c8ff2fd26..7b5c4f6d1928 100644 --- a/tools/testing/selftests/exec/recursion-depth.c +++ b/tools/testing/selftests/exec/recursion-depth.c @@ -23,7 +23,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" int main(void) { diff --git a/tools/testing/selftests/fchmodat2/fchmodat2_test.c b/tools/testing/selftests/fchmodat2/fchmodat2_test.c index e0319417124d..e397339495f6 100644 --- a/tools/testing/selftests/fchmodat2/fchmodat2_test.c +++ b/tools/testing/selftests/fchmodat2/fchmodat2_test.c @@ -7,7 +7,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" int sys_fchmodat2(int dfd, const char *filename, mode_t mode, int flags) { diff --git a/tools/testing/selftests/filelock/ofdlocks.c b/tools/testing/selftests/filelock/ofdlocks.c index a55b79810ab2..ff8d47fc373a 100644 --- a/tools/testing/selftests/filelock/ofdlocks.c +++ b/tools/testing/selftests/filelock/ofdlocks.c @@ -6,7 +6,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" static int lock_set(int fd, struct flock *fl) { diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c index 73e0a4d4fb2f..94c6c81c2301 100644 --- a/tools/testing/selftests/filesystems/anon_inode_test.c +++ b/tools/testing/selftests/filesystems/anon_inode_test.c @@ -6,7 +6,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "wrappers.h" TEST(anon_inode_no_chown) diff --git a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c index 39a68078a79b..a1a79a6fef17 100644 --- a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c +++ b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c @@ -21,7 +21,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define DEFAULT_THREADS 4 diff --git a/tools/testing/selftests/filesystems/devpts_pts.c b/tools/testing/selftests/filesystems/devpts_pts.c index b1fc9b916ace..54fea349204e 100644 --- a/tools/testing/selftests/filesystems/devpts_pts.c +++ b/tools/testing/selftests/filesystems/devpts_pts.c @@ -11,7 +11,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" static bool terminal_dup2(int duplicate, int original) { diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c index 65ede506305c..8bc57a2ef966 100644 --- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c +++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c @@ -11,7 +11,7 @@ #include #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" struct epoll_mtcontext { diff --git a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c index 72d51ad0ee0e..1b48f267157d 100644 --- a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c +++ b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c @@ -11,7 +11,7 @@ #include #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define EVENTFD_TEST_ITERATIONS 100000UL diff --git a/tools/testing/selftests/filesystems/fclog.c b/tools/testing/selftests/filesystems/fclog.c index 912a8b755c3b..551c4a0f395a 100644 --- a/tools/testing/selftests/filesystems/fclog.c +++ b/tools/testing/selftests/filesystems/fclog.c @@ -13,7 +13,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define ASSERT_ERRNO(expected, _t, seen) \ __EXPECT(expected, #expected, \ diff --git a/tools/testing/selftests/filesystems/file_stressor.c b/tools/testing/selftests/filesystems/file_stressor.c index 01dd89f8e52f..141badd671a9 100644 --- a/tools/testing/selftests/filesystems/file_stressor.c +++ b/tools/testing/selftests/filesystems/file_stressor.c @@ -12,7 +12,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include diff --git a/tools/testing/selftests/filesystems/fuse/fusectl_test.c b/tools/testing/selftests/filesystems/fuse/fusectl_test.c index 8d124d1cacb2..0d1d012c35ed 100644 --- a/tools/testing/selftests/filesystems/fuse/fusectl_test.c +++ b/tools/testing/selftests/filesystems/fuse/fusectl_test.c @@ -17,7 +17,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define FUSECTL_MOUNTPOINT "/sys/fs/fuse/connections" #define FUSE_MOUNTPOINT "/tmp/fuse_mnt_XXXXXX" diff --git a/tools/testing/selftests/filesystems/kernfs_test.c b/tools/testing/selftests/filesystems/kernfs_test.c index 16538b3b318e..84c2b910a60d 100644 --- a/tools/testing/selftests/filesystems/kernfs_test.c +++ b/tools/testing/selftests/filesystems/kernfs_test.c @@ -7,7 +7,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "wrappers.h" TEST(kernfs_listxattr) diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c index e4b7c2b457ee..6381af6a40e3 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c @@ -19,7 +19,7 @@ typedef struct { #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #include "../statmount/statmount.h" #include "../utils.h" diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c index 9f57ca46e3af..320ee25dc8a5 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c @@ -19,7 +19,7 @@ typedef struct { #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #include "../statmount/statmount.h" #include "../utils.h" diff --git a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c index a3d8015897e9..61e55dfbf121 100644 --- a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c +++ b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c @@ -12,7 +12,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define MNT_NS_COUNT 11 #define MNT_NS_LAST_INDEX 10 diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c index 31db54b00e64..8924cea6aa4b 100644 --- a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c +++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c @@ -15,7 +15,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" #include "log.h" #include "../wrappers.h" diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c index dc0449fa628f..3c0b93183348 100644 --- a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c +++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c @@ -12,7 +12,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #include "../../pidfd/pidfd.h" #include "log.h" #include "../utils.h" diff --git a/tools/testing/selftests/filesystems/statmount/listmount_test.c b/tools/testing/selftests/filesystems/statmount/listmount_test.c index 15f0834f7557..8bc82f38c42f 100644 --- a/tools/testing/selftests/filesystems/statmount/listmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/listmount_test.c @@ -11,7 +11,7 @@ #include #include "statmount.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #ifndef LISTMOUNT_REVERSE #define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */ diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index f048042e53e9..6e53430423d2 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -13,7 +13,7 @@ #include #include "statmount.h" -#include "../../kselftest.h" +#include "kselftest.h" static const char *const known_fs[] = { "9p", "adfs", "affs", "afs", "aio", "anon_inodefs", "apparmorfs", diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c index 605a3fa16bf7..d56d4103182f 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c @@ -15,7 +15,7 @@ #include "statmount.h" #include "../utils.h" -#include "../../kselftest.h" +#include "kselftest.h" #define NSID_PASS 0 #define NSID_FAIL 1 diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c index c43a69dffd83..b61b7288ed42 100644 --- a/tools/testing/selftests/filesystems/utils.c +++ b/tools/testing/selftests/filesystems/utils.c @@ -20,7 +20,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "wrappers.h" #include "utils.h" diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index d037a3f10ee8..ab8555752137 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -18,7 +18,7 @@ #include "futextest.h" #include "futex2test.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define MAX_THREADS 64 diff --git a/tools/testing/selftests/futex/functional/futex_priv_hash.c b/tools/testing/selftests/futex/functional/futex_priv_hash.c index 3b7b5851f290..e8079d7c65e8 100644 --- a/tools/testing/selftests/futex/functional/futex_priv_hash.c +++ b/tools/testing/selftests/futex/functional/futex_priv_hash.c @@ -14,7 +14,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define MAX_THREADS 64 diff --git a/tools/testing/selftests/futex/functional/futex_requeue.c b/tools/testing/selftests/futex/functional/futex_requeue.c index 69e2555b6039..35d4be23db5d 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue.c +++ b/tools/testing/selftests/futex/functional/futex_requeue.c @@ -9,7 +9,7 @@ #include #include "futextest.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define timeout_ns 30000000 #define WAKE_WAIT_US 10000 diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi.c b/tools/testing/selftests/futex/functional/futex_requeue_pi.c index f299d75848cd..46d2858e15a8 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi.c @@ -29,7 +29,7 @@ #include "atomic.h" #include "futextest.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define MAX_WAKE_ITERS 1000 #define THREAD_MAX 10 diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c index 77135a22a583..f686e605359c 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c @@ -25,7 +25,7 @@ #include #include "futextest.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" futex_t f1 = FUTEX_INITIALIZER; futex_t f2 = FUTEX_INITIALIZER; diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c index e34ee0f9ebcc..a18ccae73eb1 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c @@ -27,7 +27,7 @@ #include "atomic.h" #include "futextest.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define DELAY_US 100 diff --git a/tools/testing/selftests/futex/functional/futex_wait.c b/tools/testing/selftests/futex/functional/futex_wait.c index 152ca4612886..0e69c53524c1 100644 --- a/tools/testing/selftests/futex/functional/futex_wait.c +++ b/tools/testing/selftests/futex/functional/futex_wait.c @@ -11,7 +11,7 @@ #include #include "futextest.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define timeout_ns 30000000 #define WAKE_WAIT_US 10000 diff --git a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c index 8952ebda14ab..2a749f9b14eb 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c +++ b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c @@ -28,7 +28,7 @@ #include #include "futextest.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define PAGE_SZ 4096 diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c index 0c8766aced2e..674dd13af421 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c +++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c @@ -19,7 +19,7 @@ #include "futextest.h" #include "futex2test.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" static long timeout_ns = 100000; /* 100us default timeout */ static futex_t futex_pi; diff --git a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c index ce2301500d83..b07d68a67f31 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c +++ b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c @@ -30,7 +30,7 @@ #include #include "futextest.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define WAIT_US 5000000 diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c index 36b7a54a4085..9ff936ecf164 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c @@ -24,7 +24,7 @@ #include "futextest.h" #include "futex2test.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define timeout_ns 100000 diff --git a/tools/testing/selftests/futex/functional/futex_waitv.c b/tools/testing/selftests/futex/functional/futex_waitv.c index c684b10eb76e..d60876164d4b 100644 --- a/tools/testing/selftests/futex/functional/futex_waitv.c +++ b/tools/testing/selftests/futex/functional/futex_waitv.c @@ -18,7 +18,7 @@ #include "futextest.h" #include "futex2test.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define WAKE_WAIT_US 10000 #define NR_FUTEXES 30 diff --git a/tools/testing/selftests/hid/hid_common.h b/tools/testing/selftests/hid/hid_common.h index 8085519c47cb..e3b267446fa0 100644 --- a/tools/testing/selftests/hid/hid_common.h +++ b/tools/testing/selftests/hid/hid_common.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2022-2024 Red Hat */ -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include diff --git a/tools/testing/selftests/intel_pstate/aperf.c b/tools/testing/selftests/intel_pstate/aperf.c index a8acf3996973..953b63e5aa6a 100644 --- a/tools/testing/selftests/intel_pstate/aperf.c +++ b/tools/testing/selftests/intel_pstate/aperf.c @@ -11,7 +11,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define MSEC_PER_SEC 1000L #define NSEC_PER_MSEC 1000000L diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 9f472c20c190..37876174511b 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -11,7 +11,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "../../../../drivers/iommu/iommufd/iommufd_test.h" /* Hack to make assertions more readable */ diff --git a/tools/testing/selftests/ipc/msgque.c b/tools/testing/selftests/ipc/msgque.c index 5e36aeeb9901..e107379d185c 100644 --- a/tools/testing/selftests/ipc/msgque.c +++ b/tools/testing/selftests/ipc/msgque.c @@ -7,7 +7,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define MAX_MSG_SIZE 32 diff --git a/tools/testing/selftests/ir/ir_loopback.c b/tools/testing/selftests/ir/ir_loopback.c index f4a15cbdd5ea..adfcf50b1264 100644 --- a/tools/testing/selftests/ir/ir_loopback.c +++ b/tools/testing/selftests/ir/ir_loopback.c @@ -23,7 +23,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define TEST_SCANCODES 10 #define SYSFS_PATH_MAX 256 diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c index d7a8e321bb16..79aa438b7479 100644 --- a/tools/testing/selftests/kcmp/kcmp_test.c +++ b/tools/testing/selftests/kcmp/kcmp_test.c @@ -18,7 +18,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" static long sys_kcmp(int pid1, int pid2, int type, unsigned long fd1, unsigned long fd2) { diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 3f66e862e83e..baae6b7ded41 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -14,7 +14,7 @@ * * .. code-block:: c * - * #include "../kselftest_harness.h" + * #include "kselftest_harness.h" * * TEST(standalone_test) { * do_some_stuff; diff --git a/tools/testing/selftests/kselftest_harness/harness-selftest.c b/tools/testing/selftests/kselftest_harness/harness-selftest.c index b555493bdb4d..7820bb5d0e6d 100644 --- a/tools/testing/selftests/kselftest_harness/harness-selftest.c +++ b/tools/testing/selftests/kselftest_harness/harness-selftest.c @@ -8,7 +8,7 @@ /* Avoid any inconsistencies */ #define TH_LOG_STREAM stdout -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static void test_helper(struct __test_metadata *_metadata) { diff --git a/tools/testing/selftests/landlock/audit.h b/tools/testing/selftests/landlock/audit.h index 02fd1393947a..44eb433e9666 100644 --- a/tools/testing/selftests/landlock/audit.h +++ b/tools/testing/selftests/landlock/audit.h @@ -20,7 +20,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #ifndef ARRAY_SIZE #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h index 9acecae36f51..230b75f6015b 100644 --- a/tools/testing/selftests/landlock/common.h +++ b/tools/testing/selftests/landlock/common.h @@ -17,7 +17,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "wrappers.h" #define TMP_DIR "tmp" diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index a448fae57831..f02cc8a2e4ae 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -199,6 +199,9 @@ clean: $(if $(TEST_GEN_MODS_DIR),clean_mods_dir) # Build with _GNU_SOURCE by default CFLAGS += -D_GNU_SOURCE= +# Additional include paths needed by kselftest.h and local headers +CFLAGS += -I${top_srcdir}/tools/testing/selftests + # Enables to extend CFLAGS and LDFLAGS from command line, e.g. # make USERCFLAGS=-Werror USERLDFLAGS=-static CFLAGS += $(USERCFLAGS) diff --git a/tools/testing/selftests/lsm/lsm_get_self_attr_test.c b/tools/testing/selftests/lsm/lsm_get_self_attr_test.c index df215e4aa63f..60caf8528f81 100644 --- a/tools/testing/selftests/lsm/lsm_get_self_attr_test.c +++ b/tools/testing/selftests/lsm/lsm_get_self_attr_test.c @@ -13,7 +13,7 @@ #include #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "common.h" static struct lsm_ctx *next_ctx(struct lsm_ctx *ctxp) diff --git a/tools/testing/selftests/lsm/lsm_list_modules_test.c b/tools/testing/selftests/lsm/lsm_list_modules_test.c index 1cc8a977c711..54d59044ace1 100644 --- a/tools/testing/selftests/lsm/lsm_list_modules_test.c +++ b/tools/testing/selftests/lsm/lsm_list_modules_test.c @@ -12,7 +12,7 @@ #include #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "common.h" TEST(size_null_lsm_list_modules) diff --git a/tools/testing/selftests/lsm/lsm_set_self_attr_test.c b/tools/testing/selftests/lsm/lsm_set_self_attr_test.c index 732e89fe99c0..dcb6f8aa772e 100644 --- a/tools/testing/selftests/lsm/lsm_set_self_attr_test.c +++ b/tools/testing/selftests/lsm/lsm_set_self_attr_test.c @@ -12,7 +12,7 @@ #include #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "common.h" TEST(ctx_null_lsm_set_self_attr) diff --git a/tools/testing/selftests/media_tests/media_device_open.c b/tools/testing/selftests/media_tests/media_device_open.c index 93183a37b133..4396bf2273a4 100644 --- a/tools/testing/selftests/media_tests/media_device_open.c +++ b/tools/testing/selftests/media_tests/media_device_open.c @@ -34,7 +34,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" int main(int argc, char **argv) { diff --git a/tools/testing/selftests/media_tests/media_device_test.c b/tools/testing/selftests/media_tests/media_device_test.c index 4b9953359e40..6e4a8090a0eb 100644 --- a/tools/testing/selftests/media_tests/media_device_test.c +++ b/tools/testing/selftests/media_tests/media_device_test.c @@ -39,7 +39,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" int main(int argc, char **argv) { diff --git a/tools/testing/selftests/membarrier/membarrier_test_impl.h b/tools/testing/selftests/membarrier/membarrier_test_impl.h index af89855adb7b..f6d7c44b2288 100644 --- a/tools/testing/selftests/membarrier/membarrier_test_impl.h +++ b/tools/testing/selftests/membarrier/membarrier_test_impl.h @@ -7,7 +7,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" static int registrations; diff --git a/tools/testing/selftests/mincore/mincore_selftest.c b/tools/testing/selftests/mincore/mincore_selftest.c index 17ed3e9917ca..cdd022c1c497 100644 --- a/tools/testing/selftests/mincore/mincore_selftest.c +++ b/tools/testing/selftests/mincore/mincore_selftest.c @@ -15,8 +15,8 @@ #include #include -#include "../kselftest.h" -#include "../kselftest_harness.h" +#include "kselftest.h" +#include "kselftest_harness.h" /* Default test file size: 4MB */ #define MB (1UL << 20) diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 9bc4591c7b16..30209c40b697 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -16,7 +16,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define MAP_SIZE_MB 100 #define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 6560c26f47d1..accfd198dbda 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -27,7 +27,7 @@ #endif /* LOCAL_CONFIG_HAVE_LIBURING */ #include "../../../../mm/gup_test.h" -#include "../kselftest.h" +#include "kselftest.h" #include "vm_util.h" #include "thp_settings.h" diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c index f3d9ecf96890..44940f75c461 100644 --- a/tools/testing/selftests/mm/droppable.c +++ b/tools/testing/selftests/mm/droppable.c @@ -13,7 +13,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" int main(int argc, char *argv[]) { diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index 8dd81c0a4a5a..5d65ef054d4e 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #define _GNU_SOURCE -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include /* Force the import of the tools version. */ #include #include diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index 268dadb8ce43..6279893a0adc 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -27,7 +27,7 @@ #endif /* LOCAL_CONFIG_HAVE_LIBURING */ #include "../../../../mm/gup_test.h" -#include "../kselftest.h" +#include "kselftest.h" #include "vm_util.h" static size_t pagesize; diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index 8900b840c17a..68eba95b23ff 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -12,7 +12,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "vm_util.h" #define MB (1UL << 20) diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 15aadaf24a66..ae5a58ff0888 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -10,7 +10,7 @@ * bugs. */ -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include diff --git a/tools/testing/selftests/mm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c index 3b1b532f1cbb..d543419de040 100644 --- a/tools/testing/selftests/mm/hugepage-mmap.c +++ b/tools/testing/selftests/mm/hugepage-mmap.c @@ -15,7 +15,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c index 2bd1dac75c3f..b8f7d92e5a35 100644 --- a/tools/testing/selftests/mm/hugepage-mremap.c +++ b/tools/testing/selftests/mm/hugepage-mremap.c @@ -24,7 +24,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "vm_util.h" #define DEFAULT_LENGTH_MB 10UL diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index c5940c0595be..05d9d2805ae4 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -19,7 +19,7 @@ #include #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" #define MIN_FREE_PAGES 20 #define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ diff --git a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c index ba6cc6f9cabc..46230462ad48 100644 --- a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c +++ b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c @@ -11,7 +11,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define PREFIX " ... " #define ERROR_PREFIX " !!! " diff --git a/tools/testing/selftests/mm/hugetlb-soft-offline.c b/tools/testing/selftests/mm/hugetlb-soft-offline.c index f086f0e04756..a8bc02688085 100644 --- a/tools/testing/selftests/mm/hugetlb-soft-offline.c +++ b/tools/testing/selftests/mm/hugetlb-soft-offline.c @@ -24,7 +24,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #ifndef MADV_SOFT_OFFLINE #define MADV_SOFT_OFFLINE 101 diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c index db63abe5ee5e..9ac62eb4c97d 100644 --- a/tools/testing/selftests/mm/hugetlb_dio.c +++ b/tools/testing/selftests/mm/hugetlb_dio.c @@ -18,7 +18,7 @@ #include #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) { diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c index e2640529dbb2..b4b257775b74 100644 --- a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c +++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c @@ -9,7 +9,7 @@ #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" #define INLOOP_ITER 100 diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c index 8f122a0f0828..efd774b41389 100644 --- a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -25,7 +25,7 @@ #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" #define INLOOP_ITER 100 diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index ac136f04b8d6..f1fcd9e2e91d 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -21,7 +21,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "vm_util.h" #define KiB 1024u diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c index b77462b5c240..a0b48b839d54 100644 --- a/tools/testing/selftests/mm/ksm_tests.c +++ b/tools/testing/selftests/mm/ksm_tests.c @@ -12,7 +12,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include #include "vm_util.h" #include "thp_settings.h" diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c index d8d11bc67ddc..88050e0f829a 100644 --- a/tools/testing/selftests/mm/madv_populate.c +++ b/tools/testing/selftests/mm/madv_populate.c @@ -17,7 +17,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "vm_util.h" /* diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c index 1e9980b8993c..11241edde7fe 100644 --- a/tools/testing/selftests/mm/map_fixed_noreplace.c +++ b/tools/testing/selftests/mm/map_fixed_noreplace.c @@ -12,7 +12,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" static void dump_maps(void) { diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c index b47399feab53..aa409107611b 100644 --- a/tools/testing/selftests/mm/map_hugetlb.c +++ b/tools/testing/selftests/mm/map_hugetlb.c @@ -11,7 +11,7 @@ #include #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c index 9df2636c829b..712327f4e932 100644 --- a/tools/testing/selftests/mm/map_populate.c +++ b/tools/testing/selftests/mm/map_populate.c @@ -16,7 +16,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "vm_util.h" diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c index 200bedcdc32e..647779653da0 100644 --- a/tools/testing/selftests/mm/mdwe_test.c +++ b/tools/testing/selftests/mm/mdwe_test.c @@ -14,7 +14,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #ifndef __aarch64__ # define PROT_BTI 0 diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c index 9a0597310a76..aac4f795c327 100644 --- a/tools/testing/selftests/mm/memfd_secret.c +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -22,7 +22,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__) #define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c index cc4253f47f10..363c1033cc7d 100644 --- a/tools/testing/selftests/mm/merge.c +++ b/tools/testing/selftests/mm/merge.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #define _GNU_SOURCE -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include #include diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index ea945eebec2f..ee24b88c2b24 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -4,7 +4,7 @@ * paths in the kernel. */ -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "thp_settings.h" #include diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c index 09feeb453646..68dd447a5454 100644 --- a/tools/testing/selftests/mm/mkdirty.c +++ b/tools/testing/selftests/mm/mkdirty.c @@ -22,7 +22,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "vm_util.h" static size_t pagesize; diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c index b8d7e966f44c..9d349c151360 100644 --- a/tools/testing/selftests/mm/mlock-random-test.c +++ b/tools/testing/selftests/mm/mlock-random-test.c @@ -13,7 +13,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "mlock2.h" #define CHUNK_UNIT (128 * 1024) diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c index 3e90ff37e336..b474f2b20def 100644 --- a/tools/testing/selftests/mm/mlock2-tests.c +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -7,7 +7,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "mlock2.h" struct vm_boundaries { diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c index 100370a7111d..64e8d00ae944 100644 --- a/tools/testing/selftests/mm/mrelease_test.c +++ b/tools/testing/selftests/mm/mrelease_test.c @@ -12,7 +12,7 @@ #include #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" #define MB(x) (x << 20) #define MAX_SIZE_MB 1024 diff --git a/tools/testing/selftests/mm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c index 1d75084b9ca5..a4f75d836733 100644 --- a/tools/testing/selftests/mm/mremap_dontunmap.c +++ b/tools/testing/selftests/mm/mremap_dontunmap.c @@ -14,7 +14,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" unsigned long page_size; char *page_buffer; diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index bf2863b102e3..a95c0663a011 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -16,7 +16,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define EXPECT_SUCCESS 0 #define EXPECT_FAILURE 1 diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index 005f29c86484..faad4833366a 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -8,7 +8,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #include #include #include diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c index 431c1277d83a..fc4117453c84 100644 --- a/tools/testing/selftests/mm/on-fault-limit.c +++ b/tools/testing/selftests/mm/on-fault-limit.c @@ -5,7 +5,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" static void test_limit(void) { diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 4fc8e578ec7c..2cb5441f29c7 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -8,7 +8,7 @@ #include #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" #include #include #include diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c index 88659f0a90ea..f546dfb10cae 100644 --- a/tools/testing/selftests/mm/pfnmap.c +++ b/tools/testing/selftests/mm/pfnmap.c @@ -22,7 +22,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "vm_util.h" static sigjmp_buf sigjmp_buf_env; diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index fa15f006fa68..7c29f075e40b 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -16,7 +16,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" /* Define some kernel-like types */ typedef __u8 u8; diff --git a/tools/testing/selftests/mm/prctl_thp_disable.c b/tools/testing/selftests/mm/prctl_thp_disable.c index 84b4a4b345af..ca27200596a4 100644 --- a/tools/testing/selftests/mm/prctl_thp_disable.c +++ b/tools/testing/selftests/mm/prctl_thp_disable.c @@ -13,7 +13,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "thp_settings.h" #include "vm_util.h" diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c index 471cae8427f1..cd4610baf5d7 100644 --- a/tools/testing/selftests/mm/process_madv.c +++ b/tools/testing/selftests/mm/process_madv.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #define _GNU_SOURCE -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include #include diff --git a/tools/testing/selftests/mm/rmap.c b/tools/testing/selftests/mm/rmap.c index 13f7bccfd0a9..53f2058b0ef2 100644 --- a/tools/testing/selftests/mm/rmap.c +++ b/tools/testing/selftests/mm/rmap.c @@ -5,7 +5,7 @@ * Author(s): Wei Yang */ -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include #include diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index 4ee4db3750c1..cca857219162 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -7,7 +7,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "vm_util.h" #include "thp_settings.h" diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 743af3c05190..40799f3f0213 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -20,7 +20,7 @@ #include #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" uint64_t pagesize; unsigned int pageshift; diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 4f5e290ff1a6..77813d34dcc2 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -27,7 +27,7 @@ #include #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" #if !defined(MAP_HUGETLB) #define MAP_HUGETLB 0x40000 diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c index 68201192e37c..bcad47c09518 100644 --- a/tools/testing/selftests/mm/transhuge-stress.c +++ b/tools/testing/selftests/mm/transhuge-stress.c @@ -16,7 +16,7 @@ #include #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" int backing_fd = -1; int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 37d3ca55905f..844a85ab31eb 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -35,7 +35,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "vm_util.h" #define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) diff --git a/tools/testing/selftests/mm/uffd-wp-mremap.c b/tools/testing/selftests/mm/uffd-wp-mremap.c index 4e4a591cf527..17186d4a4147 100644 --- a/tools/testing/selftests/mm/uffd-wp-mremap.c +++ b/tools/testing/selftests/mm/uffd-wp-mremap.c @@ -7,7 +7,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "thp_settings.h" #include "uffd-common.h" diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index 306eba825107..02f290a69132 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -10,7 +10,7 @@ #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" /* * The hint addr value is used to allocate addresses diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index 81b33d8f78f4..4f0923825ed7 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -16,7 +16,7 @@ #include #include "vm_util.h" -#include "../kselftest.h" +#include "kselftest.h" /* * Maximum address range mapped with a single mmap() diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index e33cda301dad..3869e68bb8b5 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -9,7 +9,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "vm_util.h" #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 26c30fdc0241..c787db419fda 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -6,7 +6,7 @@ #include #include /* ffsl() */ #include /* _SC_PAGESIZE */ -#include "../kselftest.h" +#include "kselftest.h" #include #define BIT_ULL(nr) (1ULL << (nr)) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index a688871a98eb..7aec3ae82a44 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -21,7 +21,7 @@ #include #include "../filesystems/wrappers.h" -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 diff --git a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c index bcf51d785a37..12434415ec36 100644 --- a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c +++ b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c @@ -15,7 +15,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 diff --git a/tools/testing/selftests/mqueue/mq_open_tests.c b/tools/testing/selftests/mqueue/mq_open_tests.c index 9403ac01ba11..b16029c40c0f 100644 --- a/tools/testing/selftests/mqueue/mq_open_tests.c +++ b/tools/testing/selftests/mqueue/mq_open_tests.c @@ -33,7 +33,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" static char *usage = "Usage:\n" diff --git a/tools/testing/selftests/mqueue/mq_perf_tests.c b/tools/testing/selftests/mqueue/mq_perf_tests.c index fb898850867c..303c46eebd94 100644 --- a/tools/testing/selftests/mqueue/mq_perf_tests.c +++ b/tools/testing/selftests/mqueue/mq_perf_tests.c @@ -40,7 +40,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" static char *usage = "Usage:\n" diff --git a/tools/testing/selftests/mseal_system_mappings/sysmap_is_sealed.c b/tools/testing/selftests/mseal_system_mappings/sysmap_is_sealed.c index 0d2af30c3bf5..cb0ca6ed7ebe 100644 --- a/tools/testing/selftests/mseal_system_mappings/sysmap_is_sealed.c +++ b/tools/testing/selftests/mseal_system_mappings/sysmap_is_sealed.c @@ -11,8 +11,8 @@ #include #include -#include "../kselftest.h" -#include "../kselftest_harness.h" +#include "kselftest.h" +#include "kselftest_harness.h" #define VMFLAGS "VmFlags:" #define MSEAL_FLAGS "sl" diff --git a/tools/testing/selftests/namespaces/file_handle_test.c b/tools/testing/selftests/namespaces/file_handle_test.c index f1bc5773f552..064b41ad96b2 100644 --- a/tools/testing/selftests/namespaces/file_handle_test.c +++ b/tools/testing/selftests/namespaces/file_handle_test.c @@ -14,7 +14,7 @@ #include #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #ifndef FD_NSFS_ROOT #define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ diff --git a/tools/testing/selftests/namespaces/init_ino_test.c b/tools/testing/selftests/namespaces/init_ino_test.c index 5b6993c3740b..e4394a2fa0a9 100644 --- a/tools/testing/selftests/namespaces/init_ino_test.c +++ b/tools/testing/selftests/namespaces/init_ino_test.c @@ -11,7 +11,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" struct ns_info { const char *name; diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c index e28accd74a57..bf02e68e4600 100644 --- a/tools/testing/selftests/namespaces/nsid_test.c +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -18,7 +18,7 @@ #include #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" TEST(nsid_mntns_basic) { diff --git a/tools/testing/selftests/nci/nci_dev.c b/tools/testing/selftests/nci/nci_dev.c index 6dec59d64083..312f84ee0444 100644 --- a/tools/testing/selftests/nci/nci_dev.c +++ b/tools/testing/selftests/nci/nci_dev.c @@ -16,7 +16,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) #define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) diff --git a/tools/testing/selftests/net/af_unix/diag_uid.c b/tools/testing/selftests/net/af_unix/diag_uid.c index 79a3dd75590e..da7d50cedee6 100644 --- a/tools/testing/selftests/net/af_unix/diag_uid.c +++ b/tools/testing/selftests/net/af_unix/diag_uid.c @@ -14,7 +14,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" FIXTURE(diag_uid) { diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index b5f474969917..1b499d56656c 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -11,7 +11,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define BUF_SZ 32 diff --git a/tools/testing/selftests/net/af_unix/scm_inq.c b/tools/testing/selftests/net/af_unix/scm_inq.c index fc467714387e..3a86be9bda17 100644 --- a/tools/testing/selftests/net/af_unix/scm_inq.c +++ b/tools/testing/selftests/net/af_unix/scm_inq.c @@ -6,7 +6,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define NR_CHUNKS 100 #define MSG_LEN 256 diff --git a/tools/testing/selftests/net/af_unix/scm_pidfd.c b/tools/testing/selftests/net/af_unix/scm_pidfd.c index ef2921988e5f..2c18b92a2603 100644 --- a/tools/testing/selftests/net/af_unix/scm_pidfd.c +++ b/tools/testing/selftests/net/af_unix/scm_pidfd.c @@ -16,7 +16,7 @@ #include #include "../../pidfd/pidfd.h" -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define clean_errno() (errno == 0 ? "None" : strerror(errno)) #define log_err(MSG, ...) \ diff --git a/tools/testing/selftests/net/af_unix/scm_rights.c b/tools/testing/selftests/net/af_unix/scm_rights.c index 914f99d153ce..d82a79c21c17 100644 --- a/tools/testing/selftests/net/af_unix/scm_rights.c +++ b/tools/testing/selftests/net/af_unix/scm_rights.c @@ -10,7 +10,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" FIXTURE(scm_rights) { diff --git a/tools/testing/selftests/net/af_unix/unix_connect.c b/tools/testing/selftests/net/af_unix/unix_connect.c index d799fd8f5c7c..870ca96fa8ea 100644 --- a/tools/testing/selftests/net/af_unix/unix_connect.c +++ b/tools/testing/selftests/net/af_unix/unix_connect.c @@ -10,7 +10,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" FIXTURE(unix_connect) { diff --git a/tools/testing/selftests/net/bind_timewait.c b/tools/testing/selftests/net/bind_timewait.c index cb9fdf51ea59..40126f9b901e 100644 --- a/tools/testing/selftests/net/bind_timewait.c +++ b/tools/testing/selftests/net/bind_timewait.c @@ -4,7 +4,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" FIXTURE(bind_timewait) { diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index b7b54d646b93..7d11548b2c61 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -4,7 +4,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static const __u32 in4addr_any = INADDR_ANY; static const __u32 in4addr_loopback = INADDR_LOOPBACK; diff --git a/tools/testing/selftests/net/can/test_raw_filter.c b/tools/testing/selftests/net/can/test_raw_filter.c index 4101c36390fd..bb8ae8854273 100644 --- a/tools/testing/selftests/net/can/test_raw_filter.c +++ b/tools/testing/selftests/net/can/test_raw_filter.c @@ -19,7 +19,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define ID 0x123 diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index ded9b925865e..67a72b1a2f3d 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -16,7 +16,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" enum { ERN_SUCCESS = 0, diff --git a/tools/testing/selftests/net/epoll_busy_poll.c b/tools/testing/selftests/net/epoll_busy_poll.c index 16e457c2f877..adf8dd0b5e0b 100644 --- a/tools/testing/selftests/net/epoll_busy_poll.c +++ b/tools/testing/selftests/net/epoll_busy_poll.c @@ -23,7 +23,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" /* if the headers haven't been updated, we need to define some things */ #if !defined(EPOLL_IOC_TYPE) diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index cfc39f70635d..76aa75469a8c 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -57,7 +57,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define DPORT 8000 #define SPORT 1500 diff --git a/tools/testing/selftests/net/ip_local_port_range.c b/tools/testing/selftests/net/ip_local_port_range.c index 29451d2244b7..e6834a6cfc8f 100644 --- a/tools/testing/selftests/net/ip_local_port_range.c +++ b/tools/testing/selftests/net/ip_local_port_range.c @@ -10,7 +10,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #ifndef IP_LOCAL_PORT_RANGE #define IP_LOCAL_PORT_RANGE 51 diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c index 9b44a091802c..0ccf484b1d9d 100644 --- a/tools/testing/selftests/net/ipsec.c +++ b/tools/testing/selftests/net/ipsec.c @@ -34,7 +34,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define printk(fmt, ...) \ ksft_print_msg("%d[%u] " fmt "\n", getpid(), __LINE__, ##__VA_ARGS__) diff --git a/tools/testing/selftests/net/ipv6_fragmentation.c b/tools/testing/selftests/net/ipv6_fragmentation.c index 267ef62b5c72..672c9fe086a7 100644 --- a/tools/testing/selftests/net/ipv6_fragmentation.c +++ b/tools/testing/selftests/net/ipv6_fragmentation.c @@ -34,7 +34,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define MTU 1500 #define LARGER_THAN_MTU 8192 diff --git a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c index 5f827e10717d..5cecb8a1bc94 100644 --- a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c +++ b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c @@ -10,7 +10,7 @@ #include #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define TEST_ZONE_ID 123 #define NF_CT_DEFAULT_ZONE_ID 0 diff --git a/tools/testing/selftests/net/netlink-dumps.c b/tools/testing/selftests/net/netlink-dumps.c index 7618ebe528a4..faa4455815f8 100644 --- a/tools/testing/selftests/net/netlink-dumps.c +++ b/tools/testing/selftests/net/netlink-dumps.c @@ -18,7 +18,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include diff --git a/tools/testing/selftests/net/ovpn/ovpn-cli.c b/tools/testing/selftests/net/ovpn/ovpn-cli.c index 0a5226196a2e..0f3babf19fd0 100644 --- a/tools/testing/selftests/net/ovpn/ovpn-cli.c +++ b/tools/testing/selftests/net/ovpn/ovpn-cli.c @@ -32,7 +32,7 @@ #include -#include "../../kselftest.h" +#include "kselftest.h" /* defines to make checkpatch happy */ #define strscpy strncpy diff --git a/tools/testing/selftests/net/proc_net_pktgen.c b/tools/testing/selftests/net/proc_net_pktgen.c index 69444fb29577..fab3b5c2e25d 100644 --- a/tools/testing/selftests/net/proc_net_pktgen.c +++ b/tools/testing/selftests/net/proc_net_pktgen.c @@ -10,7 +10,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static const char ctrl_cmd_stop[] = "stop"; static const char ctrl_cmd_start[] = "start"; diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index 84c524357075..ab8d8b7e6cb0 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -54,7 +54,7 @@ #include #include "psock_lib.h" -#include "../kselftest.h" +#include "kselftest.h" #define RING_NUM_FRAMES 20 diff --git a/tools/testing/selftests/net/psock_tpacket.c b/tools/testing/selftests/net/psock_tpacket.c index 2938045c5cf9..7caf3135448d 100644 --- a/tools/testing/selftests/net/psock_tpacket.c +++ b/tools/testing/selftests/net/psock_tpacket.c @@ -46,7 +46,7 @@ #include "psock_lib.h" -#include "../kselftest.h" +#include "kselftest.h" #ifndef bug_on # define bug_on(cond) assert(!(cond)) diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c index 7b9bf8a7bbe1..5aad27a0d13a 100644 --- a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c +++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c @@ -22,7 +22,7 @@ #include #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" struct reuse_opts { int reuseaddr[2]; diff --git a/tools/testing/selftests/net/reuseport_bpf.c b/tools/testing/selftests/net/reuseport_bpf.c index 65aea27d761c..b6634d6da3d6 100644 --- a/tools/testing/selftests/net/reuseport_bpf.c +++ b/tools/testing/selftests/net/reuseport_bpf.c @@ -24,7 +24,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" struct test_params { int recv_family; diff --git a/tools/testing/selftests/net/reuseport_bpf_numa.c b/tools/testing/selftests/net/reuseport_bpf_numa.c index c9ba36aa688e..2ffd957ffb15 100644 --- a/tools/testing/selftests/net/reuseport_bpf_numa.c +++ b/tools/testing/selftests/net/reuseport_bpf_numa.c @@ -23,7 +23,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" static const int PORT = 8888; diff --git a/tools/testing/selftests/net/rxtimestamp.c b/tools/testing/selftests/net/rxtimestamp.c index 16ac4df55fdb..b81ed0352d6c 100644 --- a/tools/testing/selftests/net/rxtimestamp.c +++ b/tools/testing/selftests/net/rxtimestamp.c @@ -18,7 +18,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" struct options { int so_timestamp; diff --git a/tools/testing/selftests/net/sk_so_peek_off.c b/tools/testing/selftests/net/sk_so_peek_off.c index d87dd8d8d491..2a3f5c604f52 100644 --- a/tools/testing/selftests/net/sk_so_peek_off.c +++ b/tools/testing/selftests/net/sk_so_peek_off.c @@ -8,7 +8,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" static char *afstr(int af, int proto) { diff --git a/tools/testing/selftests/net/so_incoming_cpu.c b/tools/testing/selftests/net/so_incoming_cpu.c index e9fa14e10732..4740701f1a9a 100644 --- a/tools/testing/selftests/net/so_incoming_cpu.c +++ b/tools/testing/selftests/net/so_incoming_cpu.c @@ -9,7 +9,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" FIXTURE(so_incoming_cpu) { diff --git a/tools/testing/selftests/net/socket.c b/tools/testing/selftests/net/socket.c index be1080003c61..9e270548dad8 100644 --- a/tools/testing/selftests/net/socket.c +++ b/tools/testing/selftests/net/socket.c @@ -7,7 +7,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" struct socket_testcase { int domain; diff --git a/tools/testing/selftests/net/tap.c b/tools/testing/selftests/net/tap.c index 247c3b3ac1c9..9ec1c9b50e77 100644 --- a/tools/testing/selftests/net/tap.c +++ b/tools/testing/selftests/net/tap.c @@ -17,7 +17,7 @@ #include #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static const char param_dev_tap_name[] = "xmacvtap0"; static const char param_dev_dummy_name[] = "xdummy0"; diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index a27cc03c9fbd..49aec2922a31 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -9,7 +9,7 @@ * Can't be included in the header: it defines static variables which * will be unique to every object. Let's include it only once here. */ -#include "../../../kselftest.h" +#include "kselftest.h" /* Prevent overriding of one thread's output by another */ static pthread_mutex_t ksft_print_lock = PTHREAD_MUTEX_INITIALIZER; diff --git a/tools/testing/selftests/net/tcp_fastopen_backup_key.c b/tools/testing/selftests/net/tcp_fastopen_backup_key.c index c1cb0c75156a..4b3f9b5e50fe 100644 --- a/tools/testing/selftests/net/tcp_fastopen_backup_key.c +++ b/tools/testing/selftests/net/tcp_fastopen_backup_key.c @@ -26,7 +26,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #ifndef TCP_FASTOPEN_KEY #define TCP_FASTOPEN_KEY 33 diff --git a/tools/testing/selftests/net/tcp_port_share.c b/tools/testing/selftests/net/tcp_port_share.c index 4c39d599dfce..6146b62610df 100644 --- a/tools/testing/selftests/net/tcp_port_share.c +++ b/tools/testing/selftests/net/tcp_port_share.c @@ -10,7 +10,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define DST_PORT 30000 #define SRC_PORT 40000 diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 5c6d8215021c..2b6590c0e13c 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -21,7 +21,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define TLS_PAYLOAD_MAX_LEN 16384 #define SOL_TLS 282 diff --git a/tools/testing/selftests/net/toeplitz.c b/tools/testing/selftests/net/toeplitz.c index 9ba03164d73a..4b58152d5a49 100644 --- a/tools/testing/selftests/net/toeplitz.c +++ b/tools/testing/selftests/net/toeplitz.c @@ -52,7 +52,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define TOEPLITZ_KEY_MIN_LEN 40 #define TOEPLITZ_KEY_MAX_LEN 60 diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index fa83918b62d1..0efc67b0357a 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -15,7 +15,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static int tun_attach(int fd, char *dev) { diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c index 477392715a9a..86d80cce55b4 100644 --- a/tools/testing/selftests/net/udpgso_bench_tx.c +++ b/tools/testing/selftests/net/udpgso_bench_tx.c @@ -25,7 +25,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #ifndef ETH_MAX_MTU #define ETH_MAX_MTU 0xFFFFU diff --git a/tools/testing/selftests/openat2/helpers.h b/tools/testing/selftests/openat2/helpers.h index 7056340b9339..510e60602511 100644 --- a/tools/testing/selftests/openat2/helpers.h +++ b/tools/testing/selftests/openat2/helpers.h @@ -12,7 +12,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define ARRAY_LEN(X) (sizeof (X) / sizeof (*(X))) #define BUILD_BUG_ON(e) ((void)(sizeof(struct { int:(-!!(e)); }))) diff --git a/tools/testing/selftests/openat2/openat2_test.c b/tools/testing/selftests/openat2/openat2_test.c index 5790ab446527..0e161ef9e9e4 100644 --- a/tools/testing/selftests/openat2/openat2_test.c +++ b/tools/testing/selftests/openat2/openat2_test.c @@ -15,7 +15,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "helpers.h" /* diff --git a/tools/testing/selftests/openat2/rename_attack_test.c b/tools/testing/selftests/openat2/rename_attack_test.c index 0a770728b436..aa5699e45729 100644 --- a/tools/testing/selftests/openat2/rename_attack_test.c +++ b/tools/testing/selftests/openat2/rename_attack_test.c @@ -22,7 +22,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "helpers.h" /* Construct a test directory with the following structure: diff --git a/tools/testing/selftests/openat2/resolve_test.c b/tools/testing/selftests/openat2/resolve_test.c index bbafad440893..a76ef15ceb90 100644 --- a/tools/testing/selftests/openat2/resolve_test.c +++ b/tools/testing/selftests/openat2/resolve_test.c @@ -14,7 +14,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "helpers.h" /* diff --git a/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c b/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c index cd9075444c32..23aac6f97061 100644 --- a/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c +++ b/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c @@ -20,7 +20,7 @@ #include "../../../../include/uapi/linux/pcitest.h" -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define pci_ep_ioctl(cmd, arg) \ ({ \ diff --git a/tools/testing/selftests/perf_events/mmap.c b/tools/testing/selftests/perf_events/mmap.c index ea0427aac1f9..d1fa8ec58987 100644 --- a/tools/testing/selftests/perf_events/mmap.c +++ b/tools/testing/selftests/perf_events/mmap.c @@ -14,7 +14,7 @@ #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define RB_SIZE 0x3000 #define AUX_SIZE 0x10000 diff --git a/tools/testing/selftests/perf_events/remove_on_exec.c b/tools/testing/selftests/perf_events/remove_on_exec.c index 5814611a1dc7..89e7b06835df 100644 --- a/tools/testing/selftests/perf_events/remove_on_exec.c +++ b/tools/testing/selftests/perf_events/remove_on_exec.c @@ -30,7 +30,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static volatile int signal_count; diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c b/tools/testing/selftests/perf_events/sigtrap_threads.c index d1d8483ac628..b5cf8355345d 100644 --- a/tools/testing/selftests/perf_events/sigtrap_threads.c +++ b/tools/testing/selftests/perf_events/sigtrap_threads.c @@ -31,7 +31,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define NUM_THREADS 5 diff --git a/tools/testing/selftests/perf_events/watermark_signal.c b/tools/testing/selftests/perf_events/watermark_signal.c index b3a72f0ac522..0f64b9b17081 100644 --- a/tools/testing/selftests/perf_events/watermark_signal.c +++ b/tools/testing/selftests/perf_events/watermark_signal.c @@ -15,7 +15,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static int sigio_count; diff --git a/tools/testing/selftests/pid_namespace/pid_max.c b/tools/testing/selftests/pid_namespace/pid_max.c index 96f274f0582b..c9519e7385b6 100644 --- a/tools/testing/selftests/pid_namespace/pid_max.c +++ b/tools/testing/selftests/pid_namespace/pid_max.c @@ -13,7 +13,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "../pidfd/pidfd.h" #define __STACK_SIZE (8 * 1024 * 1024) diff --git a/tools/testing/selftests/pid_namespace/regression_enomem.c b/tools/testing/selftests/pid_namespace/regression_enomem.c index 7d84097ad45c..059e7ec5b4fd 100644 --- a/tools/testing/selftests/pid_namespace/regression_enomem.c +++ b/tools/testing/selftests/pid_namespace/regression_enomem.c @@ -11,7 +11,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "../pidfd/pidfd.h" /* diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index f87993def738..db80015ab00b 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -25,7 +25,7 @@ #undef SCHED_FLAG_KEEP_ALL #undef SCHED_FLAG_UTIL_CLAMP -#include "../kselftest.h" +#include "kselftest.h" #include "../clone3/clone3_selftests.h" #ifndef FD_PIDFS_ROOT diff --git a/tools/testing/selftests/pidfd/pidfd_bind_mount.c b/tools/testing/selftests/pidfd/pidfd_bind_mount.c index c094aeb1c620..1fdf49939524 100644 --- a/tools/testing/selftests/pidfd/pidfd_bind_mount.c +++ b/tools/testing/selftests/pidfd/pidfd_bind_mount.c @@ -14,7 +14,7 @@ #include #include "pidfd.h" -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "../filesystems/wrappers.h" FIXTURE(pidfd_bind_mount) { diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c index f718aac75068..9935e9471c77 100644 --- a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c +++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c @@ -16,7 +16,7 @@ #include #include "pidfd.h" -#include "../kselftest.h" +#include "kselftest.h" struct error { int code; diff --git a/tools/testing/selftests/pidfd/pidfd_file_handle_test.c b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c index 6bd2e9c9565b..68918734dcf3 100644 --- a/tools/testing/selftests/pidfd/pidfd_file_handle_test.c +++ b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c @@ -20,7 +20,7 @@ #include #include "pidfd.h" -#include "../kselftest_harness.h" +#include "kselftest_harness.h" FIXTURE(file_handle) { diff --git a/tools/testing/selftests/pidfd/pidfd_getfd_test.c b/tools/testing/selftests/pidfd/pidfd_getfd_test.c index cd51d547b751..ea45b37001b0 100644 --- a/tools/testing/selftests/pidfd/pidfd_getfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_getfd_test.c @@ -19,7 +19,7 @@ #include #include "pidfd.h" -#include "../kselftest_harness.h" +#include "kselftest_harness.h" /* * UNKNOWN_FD is an fd number that should never exist in the child, as it is diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c index a0eb6e81eaa2..91b216baff07 100644 --- a/tools/testing/selftests/pidfd/pidfd_info_test.c +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -21,7 +21,7 @@ #include #include "pidfd.h" -#include "../kselftest_harness.h" +#include "kselftest_harness.h" FIXTURE(pidfd_info) { diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c index cd3de40e4977..318e6f09c8e0 100644 --- a/tools/testing/selftests/pidfd/pidfd_open_test.c +++ b/tools/testing/selftests/pidfd/pidfd_open_test.c @@ -20,7 +20,7 @@ #include #include "pidfd.h" -#include "../kselftest.h" +#include "kselftest.h" static int safe_int(const char *numstr, int *converted) { diff --git a/tools/testing/selftests/pidfd/pidfd_poll_test.c b/tools/testing/selftests/pidfd/pidfd_poll_test.c index 55d74a50358f..232304f818c7 100644 --- a/tools/testing/selftests/pidfd/pidfd_poll_test.c +++ b/tools/testing/selftests/pidfd/pidfd_poll_test.c @@ -14,7 +14,7 @@ #include #include "pidfd.h" -#include "../kselftest.h" +#include "kselftest.h" static bool timeout; diff --git a/tools/testing/selftests/pidfd/pidfd_setattr_test.c b/tools/testing/selftests/pidfd/pidfd_setattr_test.c index d7de05edc4b3..e8562a2992f3 100644 --- a/tools/testing/selftests/pidfd/pidfd_setattr_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setattr_test.c @@ -22,7 +22,7 @@ #include #include "pidfd.h" -#include "../kselftest_harness.h" +#include "kselftest_harness.h" FIXTURE(pidfs_setattr) { diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index e6a079b3d5e2..107edecff224 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -18,7 +18,7 @@ #include #include "pidfd.h" -#include "../kselftest_harness.h" +#include "kselftest_harness.h" enum { PIDFD_NS_USER, diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index fcd85cad9f18..932cbd8caa77 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -20,7 +20,7 @@ #include #include "pidfd.h" -#include "../kselftest.h" +#include "kselftest.h" #define str(s) _str(s) #define _str(s) #s diff --git a/tools/testing/selftests/pidfd/pidfd_wait.c b/tools/testing/selftests/pidfd/pidfd_wait.c index 1e2d49751cde..4bf702d62c1c 100644 --- a/tools/testing/selftests/pidfd/pidfd_wait.c +++ b/tools/testing/selftests/pidfd/pidfd_wait.c @@ -17,7 +17,7 @@ #include #include "pidfd.h" -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) diff --git a/tools/testing/selftests/pidfd/pidfd_xattr_test.c b/tools/testing/selftests/pidfd/pidfd_xattr_test.c index 5cf7bb0e4bf2..fd57511af7e4 100644 --- a/tools/testing/selftests/pidfd/pidfd_xattr_test.c +++ b/tools/testing/selftests/pidfd/pidfd_xattr_test.c @@ -22,7 +22,7 @@ #include #include "pidfd.h" -#include "../kselftest_harness.h" +#include "kselftest_harness.h" FIXTURE(pidfs_xattr) { diff --git a/tools/testing/selftests/prctl/set-anon-vma-name-test.c b/tools/testing/selftests/prctl/set-anon-vma-name-test.c index 4275cb256dce..ac6721b184a6 100644 --- a/tools/testing/selftests/prctl/set-anon-vma-name-test.c +++ b/tools/testing/selftests/prctl/set-anon-vma-name-test.c @@ -10,7 +10,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define AREA_SIZE 1024 diff --git a/tools/testing/selftests/prctl/set-process-name.c b/tools/testing/selftests/prctl/set-process-name.c index 562f707ba771..3f7b146d36df 100644 --- a/tools/testing/selftests/prctl/set-process-name.c +++ b/tools/testing/selftests/prctl/set-process-name.c @@ -7,7 +7,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define CHANGE_NAME "changename" #define EMPTY_NAME "" diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c index a546475db550..a734553718da 100644 --- a/tools/testing/selftests/proc/proc-maps-race.c +++ b/tools/testing/selftests/proc/proc-maps-race.c @@ -23,7 +23,7 @@ * */ #define _GNU_SOURCE -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include #include diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index 978cbcb3eb11..4e6a3e53f975 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -51,7 +51,7 @@ #define __maybe_unused __attribute__((__unused__)) #endif -#include "../kselftest.h" +#include "kselftest.h" static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags) { diff --git a/tools/testing/selftests/proc/proc-pidns.c b/tools/testing/selftests/proc/proc-pidns.c index 52500597f951..25b9a2933c45 100644 --- a/tools/testing/selftests/proc/proc-pidns.c +++ b/tools/testing/selftests/proc/proc-pidns.c @@ -16,7 +16,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define ASSERT_ERRNO(expected, _t, seen) \ __EXPECT(expected, #expected, \ diff --git a/tools/testing/selftests/ptrace/get_set_sud.c b/tools/testing/selftests/ptrace/get_set_sud.c index 5297b10d25c3..2e619c7599bb 100644 --- a/tools/testing/selftests/ptrace/get_set_sud.c +++ b/tools/testing/selftests/ptrace/get_set_sud.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #define _GNU_SOURCE -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include #include diff --git a/tools/testing/selftests/ptrace/get_syscall_info.c b/tools/testing/selftests/ptrace/get_syscall_info.c index 5bcd1c7b5be6..3f5c3a9fdaba 100644 --- a/tools/testing/selftests/ptrace/get_syscall_info.c +++ b/tools/testing/selftests/ptrace/get_syscall_info.c @@ -7,7 +7,7 @@ * matches userspace expectations. */ -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include #include diff --git a/tools/testing/selftests/ptrace/set_syscall_info.c b/tools/testing/selftests/ptrace/set_syscall_info.c index 4198248ef874..1cc411a41cd6 100644 --- a/tools/testing/selftests/ptrace/set_syscall_info.c +++ b/tools/testing/selftests/ptrace/set_syscall_info.c @@ -7,7 +7,7 @@ * matches userspace expectations. */ -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include #include diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c index 4db327b44586..3801b5831527 100644 --- a/tools/testing/selftests/ptrace/vmaccess.c +++ b/tools/testing/selftests/ptrace/vmaccess.c @@ -7,7 +7,7 @@ * when de_thread is blocked with ->cred_guard_mutex held. */ -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include #include #include diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index cd3adfc14969..3c51bdac2dfa 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -23,7 +23,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define MB (1024 * 1024) #define RESCTRL_PATH "/sys/fs/resctrl" diff --git a/tools/testing/selftests/ring-buffer/map_test.c b/tools/testing/selftests/ring-buffer/map_test.c index a58f520f2f41..f24677737066 100644 --- a/tools/testing/selftests/ring-buffer/map_test.c +++ b/tools/testing/selftests/ring-buffer/map_test.c @@ -17,7 +17,7 @@ #include #include "../user_events/user_events_selftests.h" /* share tracefs setup */ -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define TRACEFS_ROOT "/sys/kernel/tracing" diff --git a/tools/testing/selftests/riscv/abi/pointer_masking.c b/tools/testing/selftests/riscv/abi/pointer_masking.c index 059d2e87eb1f..2d540af7b558 100644 --- a/tools/testing/selftests/riscv/abi/pointer_masking.c +++ b/tools/testing/selftests/riscv/abi/pointer_masking.c @@ -9,7 +9,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" #ifndef PR_PMLEN_SHIFT #define PR_PMLEN_SHIFT 24 diff --git a/tools/testing/selftests/riscv/hwprobe/cbo.c b/tools/testing/selftests/riscv/hwprobe/cbo.c index 5e96ef785d0d..f5208ca80905 100644 --- a/tools/testing/selftests/riscv/hwprobe/cbo.c +++ b/tools/testing/selftests/riscv/hwprobe/cbo.c @@ -17,7 +17,7 @@ #include #include "hwprobe.h" -#include "../../kselftest.h" +#include "kselftest.h" #define MK_CBO(fn) le32_bswap((uint32_t)(fn) << 20 | 10 << 15 | 2 << 12 | 0 << 7 | 15) diff --git a/tools/testing/selftests/riscv/hwprobe/hwprobe.c b/tools/testing/selftests/riscv/hwprobe/hwprobe.c index fd73c87804f3..54c435af9923 100644 --- a/tools/testing/selftests/riscv/hwprobe/hwprobe.c +++ b/tools/testing/selftests/riscv/hwprobe/hwprobe.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include "hwprobe.h" -#include "../../kselftest.h" +#include "kselftest.h" int main(int argc, char **argv) { diff --git a/tools/testing/selftests/riscv/hwprobe/which-cpus.c b/tools/testing/selftests/riscv/hwprobe/which-cpus.c index 82c121412dfc..3ab53067e8dd 100644 --- a/tools/testing/selftests/riscv/hwprobe/which-cpus.c +++ b/tools/testing/selftests/riscv/hwprobe/which-cpus.c @@ -14,7 +14,7 @@ #include #include "hwprobe.h" -#include "../../kselftest.h" +#include "kselftest.h" static void help(void) { diff --git a/tools/testing/selftests/riscv/mm/mmap_bottomup.c b/tools/testing/selftests/riscv/mm/mmap_bottomup.c index f9ccae50349b..461a65c9be00 100644 --- a/tools/testing/selftests/riscv/mm/mmap_bottomup.c +++ b/tools/testing/selftests/riscv/mm/mmap_bottomup.c @@ -2,7 +2,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" TEST(infinite_rlimit) { diff --git a/tools/testing/selftests/riscv/mm/mmap_default.c b/tools/testing/selftests/riscv/mm/mmap_default.c index 3f53b6ecc326..58db7d172af2 100644 --- a/tools/testing/selftests/riscv/mm/mmap_default.c +++ b/tools/testing/selftests/riscv/mm/mmap_default.c @@ -2,7 +2,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" TEST(default_rlimit) { diff --git a/tools/testing/selftests/riscv/mm/mmap_test.h b/tools/testing/selftests/riscv/mm/mmap_test.h index 75918d15919f..266a6becdeba 100644 --- a/tools/testing/selftests/riscv/mm/mmap_test.h +++ b/tools/testing/selftests/riscv/mm/mmap_test.h @@ -5,7 +5,7 @@ #include #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define TOP_DOWN 0 #define BOTTOM_UP 1 diff --git a/tools/testing/selftests/riscv/sigreturn/sigreturn.c b/tools/testing/selftests/riscv/sigreturn/sigreturn.c index ed351a1cb917..e10873d95fed 100644 --- a/tools/testing/selftests/riscv/sigreturn/sigreturn.c +++ b/tools/testing/selftests/riscv/sigreturn/sigreturn.c @@ -4,7 +4,7 @@ #include #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #define RISCV_V_MAGIC 0x53465457 #define DEFAULT_VALUE 2 diff --git a/tools/testing/selftests/riscv/vector/v_initval.c b/tools/testing/selftests/riscv/vector/v_initval.c index be9e1d18ad29..5fd2382e15a2 100644 --- a/tools/testing/selftests/riscv/vector/v_initval.c +++ b/tools/testing/selftests/riscv/vector/v_initval.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #include "v_helpers.h" #define NEXT_PROGRAM "./v_exec_initval_nolibc" diff --git a/tools/testing/selftests/riscv/vector/vstate_prctl.c b/tools/testing/selftests/riscv/vector/vstate_prctl.c index 62fbb17a0556..d607af3900c1 100644 --- a/tools/testing/selftests/riscv/vector/vstate_prctl.c +++ b/tools/testing/selftests/riscv/vector/vstate_prctl.c @@ -6,7 +6,7 @@ #include #include -#include "../../kselftest_harness.h" +#include "kselftest_harness.h" #include "v_helpers.h" #define NEXT_PROGRAM "./vstate_exec_nolibc" diff --git a/tools/testing/selftests/rseq/basic_percpu_ops_test.c b/tools/testing/selftests/rseq/basic_percpu_ops_test.c index 2348d2c20d0a..1193612bf327 100644 --- a/tools/testing/selftests/rseq/basic_percpu_ops_test.c +++ b/tools/testing/selftests/rseq/basic_percpu_ops_test.c @@ -9,7 +9,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "rseq.h" #ifdef BUILDOPT_RSEQ_PERCPU_MM_CID diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index dcac5cbe7933..a736727b83c1 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -33,7 +33,7 @@ #include -#include "../kselftest.h" +#include "kselftest.h" #include "rseq.h" /* diff --git a/tools/testing/selftests/rtc/rtctest.c b/tools/testing/selftests/rtc/rtctest.c index be175c0e6ae3..8047d9879039 100644 --- a/tools/testing/selftests/rtc/rtctest.c +++ b/tools/testing/selftests/rtc/rtctest.c @@ -16,7 +16,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define NUM_UIE 3 #define ALARM_DELTA 3 diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index 5822e25e0217..ea4068cdefd6 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -20,7 +20,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" unsigned long long timing(clockid_t clk_id, unsigned long long samples) { diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 874f17763536..32e2d4df397b 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -54,7 +54,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "../clone3/clone3_selftests.h" /* Attempt to de-conflict with the selftests tree. */ diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 9820b3809c69..13b84e54ce38 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -18,7 +18,7 @@ #include #include #include "defines.h" -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "main.h" static const uint64_t MAGIC = 0x1122334455667788ULL; diff --git a/tools/testing/selftests/signal/mangle_uc_sigmask.c b/tools/testing/selftests/signal/mangle_uc_sigmask.c index b79ab92178a8..11dbc14bbc8e 100644 --- a/tools/testing/selftests/signal/mangle_uc_sigmask.c +++ b/tools/testing/selftests/signal/mangle_uc_sigmask.c @@ -39,7 +39,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" void handler_verify_ucontext(int signo, siginfo_t *info, void *uc) { diff --git a/tools/testing/selftests/signal/sas.c b/tools/testing/selftests/signal/sas.c index 07227fab1cc9..306b996ab365 100644 --- a/tools/testing/selftests/signal/sas.c +++ b/tools/testing/selftests/signal/sas.c @@ -19,7 +19,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "current_stack_pointer.h" #ifndef SS_AUTODISARM diff --git a/tools/testing/selftests/sparc64/drivers/adi-test.c b/tools/testing/selftests/sparc64/drivers/adi-test.c index 84e5d9fd20b0..b986714e7a52 100644 --- a/tools/testing/selftests/sparc64/drivers/adi-test.c +++ b/tools/testing/selftests/sparc64/drivers/adi-test.c @@ -16,7 +16,7 @@ #include #include -#include "../../kselftest.h" +#include "kselftest.h" #define DEBUG_LEVEL_1_BIT (0x0001) #define DEBUG_LEVEL_2_BIT (0x0002) diff --git a/tools/testing/selftests/sync/sync_test.c b/tools/testing/selftests/sync/sync_test.c index 93db5aa246a3..2b44e5d88b63 100644 --- a/tools/testing/selftests/sync/sync_test.c +++ b/tools/testing/selftests/sync/sync_test.c @@ -34,7 +34,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "synctest.h" static int run_test(int (*test)(void), char *name) diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_test.c b/tools/testing/selftests/syscall_user_dispatch/sud_test.c index 2eb2c06303f2..b855c6000287 100644 --- a/tools/testing/selftests/syscall_user_dispatch/sud_test.c +++ b/tools/testing/selftests/syscall_user_dispatch/sud_test.c @@ -14,7 +14,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #ifndef PR_SET_SYSCALL_USER_DISPATCH # define PR_SET_SYSCALL_USER_DISPATCH 59 diff --git a/tools/testing/selftests/tdx/tdx_guest_test.c b/tools/testing/selftests/tdx/tdx_guest_test.c index 81d8cb88ea1a..dfaefa685519 100644 --- a/tools/testing/selftests/tdx/tdx_guest_test.c +++ b/tools/testing/selftests/tdx/tdx_guest_test.c @@ -13,7 +13,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define TDX_GUEST_DEVNAME "/dev/tdx_guest" #define HEX_DUMP_SIZE 8 diff --git a/tools/testing/selftests/timens/timens.h b/tools/testing/selftests/timens/timens.h index d4fc52d47146..7ca4b46ca61d 100644 --- a/tools/testing/selftests/timens/timens.h +++ b/tools/testing/selftests/timens/timens.h @@ -7,7 +7,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #ifndef CLONE_NEWTIME # define CLONE_NEWTIME 0x00000080 diff --git a/tools/testing/selftests/timers/adjtick.c b/tools/testing/selftests/timers/adjtick.c index 777d9494b683..5b3ef708d6e9 100644 --- a/tools/testing/selftests/timers/adjtick.c +++ b/tools/testing/selftests/timers/adjtick.c @@ -24,7 +24,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define MILLION 1000000 diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c index a9ef76ea6051..aa66c805f6a4 100644 --- a/tools/testing/selftests/timers/alarmtimer-suspend.c +++ b/tools/testing/selftests/timers/alarmtimer-suspend.c @@ -30,7 +30,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define UNREASONABLE_LAT (NSEC_PER_SEC * 5) /* hopefully we resume in 5 secs */ diff --git a/tools/testing/selftests/timers/change_skew.c b/tools/testing/selftests/timers/change_skew.c index 18e794a46c23..387fda10fcd8 100644 --- a/tools/testing/selftests/timers/change_skew.c +++ b/tools/testing/selftests/timers/change_skew.c @@ -28,7 +28,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" int change_skew_test(int ppm) { diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c index 83faa4e354e3..db62a764c29e 100644 --- a/tools/testing/selftests/timers/clocksource-switch.c +++ b/tools/testing/selftests/timers/clocksource-switch.c @@ -34,7 +34,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" int get_clocksources(char list[][30]) diff --git a/tools/testing/selftests/timers/freq-step.c b/tools/testing/selftests/timers/freq-step.c index 73b636f89fdc..cfa46dafe3e8 100644 --- a/tools/testing/selftests/timers/freq-step.c +++ b/tools/testing/selftests/timers/freq-step.c @@ -15,7 +15,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define SAMPLES 100 #define SAMPLE_READINGS 10 diff --git a/tools/testing/selftests/timers/inconsistency-check.c b/tools/testing/selftests/timers/inconsistency-check.c index 9d1573769d55..e53e63e18683 100644 --- a/tools/testing/selftests/timers/inconsistency-check.c +++ b/tools/testing/selftests/timers/inconsistency-check.c @@ -29,7 +29,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" /* CLOCK_HWSPECIFIC == CLOCK_SGI_CYCLE (Deprecated) */ #define CLOCK_HWSPECIFIC 10 diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c index 04004a7c0934..3568cfb3e815 100644 --- a/tools/testing/selftests/timers/leap-a-day.c +++ b/tools/testing/selftests/timers/leap-a-day.c @@ -49,7 +49,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define CLOCK_TAI 11 diff --git a/tools/testing/selftests/timers/leapcrash.c b/tools/testing/selftests/timers/leapcrash.c index 8fd065eec904..c2d3bccb52f2 100644 --- a/tools/testing/selftests/timers/leapcrash.c +++ b/tools/testing/selftests/timers/leapcrash.c @@ -22,7 +22,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" /* clear NTP time_status & time_state */ int clear_time_state(void) diff --git a/tools/testing/selftests/timers/mqueue-lat.c b/tools/testing/selftests/timers/mqueue-lat.c index 63de2334a291..c0d9368e4fca 100644 --- a/tools/testing/selftests/timers/mqueue-lat.c +++ b/tools/testing/selftests/timers/mqueue-lat.c @@ -30,7 +30,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define TARGET_TIMEOUT 100000000 /* 100ms in nanoseconds */ diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c index 252c6308c569..7a29913aaf03 100644 --- a/tools/testing/selftests/timers/nanosleep.c +++ b/tools/testing/selftests/timers/nanosleep.c @@ -28,7 +28,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" /* CLOCK_HWSPECIFIC == CLOCK_SGI_CYCLE (Deprecated) */ #define CLOCK_HWSPECIFIC 10 diff --git a/tools/testing/selftests/timers/nsleep-lat.c b/tools/testing/selftests/timers/nsleep-lat.c index de23dc0c9f97..a7ba1eb1e21b 100644 --- a/tools/testing/selftests/timers/nsleep-lat.c +++ b/tools/testing/selftests/timers/nsleep-lat.c @@ -25,7 +25,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define UNRESONABLE_LATENCY 40000000 /* 40ms in nanosecs */ diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index f0eceb0faf34..5cbb3e81f35d 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -19,7 +19,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define DELAY 2 diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c index 957f7cd29cb1..a7bae7d80916 100644 --- a/tools/testing/selftests/timers/raw_skew.c +++ b/tools/testing/selftests/timers/raw_skew.c @@ -26,7 +26,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define shift_right(x, s) ({ \ __typeof__(x) __x = (x); \ diff --git a/tools/testing/selftests/timers/rtcpie.c b/tools/testing/selftests/timers/rtcpie.c index 7c07edd0d450..4ba42d198b7e 100644 --- a/tools/testing/selftests/timers/rtcpie.c +++ b/tools/testing/selftests/timers/rtcpie.c @@ -18,7 +18,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" /* * This expects the new RTC class driver framework, working with diff --git a/tools/testing/selftests/timers/set-2038.c b/tools/testing/selftests/timers/set-2038.c index ed244315e11c..ecc171de4728 100644 --- a/tools/testing/selftests/timers/set-2038.c +++ b/tools/testing/selftests/timers/set-2038.c @@ -28,7 +28,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define KTIME_MAX ((long long)~((unsigned long long)1 << 63)) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) diff --git a/tools/testing/selftests/timers/set-tai.c b/tools/testing/selftests/timers/set-tai.c index 5b67462efcd6..e283c04284af 100644 --- a/tools/testing/selftests/timers/set-tai.c +++ b/tools/testing/selftests/timers/set-tai.c @@ -23,7 +23,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" int set_tai(int offset) { diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c index 9d8437c13929..44d2e3614fa5 100644 --- a/tools/testing/selftests/timers/set-timer-lat.c +++ b/tools/testing/selftests/timers/set-timer-lat.c @@ -29,7 +29,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" /* CLOCK_HWSPECIFIC == CLOCK_SGI_CYCLE (Deprecated) */ #define CLOCK_HWSPECIFIC 10 diff --git a/tools/testing/selftests/timers/set-tz.c b/tools/testing/selftests/timers/set-tz.c index 20daaf1782b7..334f36c0eda6 100644 --- a/tools/testing/selftests/timers/set-tz.c +++ b/tools/testing/selftests/timers/set-tz.c @@ -23,7 +23,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" int set_tz(int min, int dst) { diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c index 46c391d7f45d..53ee5d710ff4 100644 --- a/tools/testing/selftests/timers/skew_consistency.c +++ b/tools/testing/selftests/timers/skew_consistency.c @@ -34,7 +34,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" int main(int argc, char **argv) { diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c index d5564bbf0e50..60b8b21bf782 100644 --- a/tools/testing/selftests/timers/threadtest.c +++ b/tools/testing/selftests/timers/threadtest.c @@ -21,7 +21,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" /* serializes shared list access */ pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER; diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index 6b7801055ad1..e1e56d3097d6 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -30,7 +30,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define ADJ_SETOFFSET 0x0100 diff --git a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c index 02ecfe687dc2..5cb4e404a2bd 100644 --- a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c +++ b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c @@ -23,7 +23,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" int main(void) { diff --git a/tools/testing/selftests/tty/tty_tstamp_update.c b/tools/testing/selftests/tty/tty_tstamp_update.c index 9e1a40f5db17..bc3291dcd18b 100644 --- a/tools/testing/selftests/tty/tty_tstamp_update.c +++ b/tools/testing/selftests/tty/tty_tstamp_update.c @@ -9,7 +9,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #define MIN_TTY_PATH_LEN 8 diff --git a/tools/testing/selftests/uevent/uevent_filtering.c b/tools/testing/selftests/uevent/uevent_filtering.c index dbe55f3a66f4..974b076f9235 100644 --- a/tools/testing/selftests/uevent/uevent_filtering.c +++ b/tools/testing/selftests/uevent/uevent_filtering.c @@ -19,7 +19,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #define __DEV_FULL "/sys/devices/virtual/mem/full/uevent" #define __UEVENT_BUFFER_SIZE (2048 * 2) diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c index 7288a05136ba..85892b3b719c 100644 --- a/tools/testing/selftests/user_events/abi_test.c +++ b/tools/testing/selftests/user_events/abi_test.c @@ -20,7 +20,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "user_events_selftests.h" const char *data_file = "/sys/kernel/tracing/user_events_data"; diff --git a/tools/testing/selftests/user_events/dyn_test.c b/tools/testing/selftests/user_events/dyn_test.c index 54c9412f8dee..78e3c33f4015 100644 --- a/tools/testing/selftests/user_events/dyn_test.c +++ b/tools/testing/selftests/user_events/dyn_test.c @@ -14,7 +14,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "user_events_selftests.h" const char *dyn_file = "/sys/kernel/tracing/dynamic_events"; diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index 0bb46793dcd4..decce06b9ba8 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -15,7 +15,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "user_events_selftests.h" const char *data_file = "/sys/kernel/tracing/user_events_data"; diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index 68625362add2..cafec0e52eb3 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -16,7 +16,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" #include "user_events_selftests.h" const char *data_file = "/sys/kernel/tracing/user_events_data"; diff --git a/tools/testing/selftests/user_events/user_events_selftests.h b/tools/testing/selftests/user_events/user_events_selftests.h index e1c3c063c031..3b5d37e46f8a 100644 --- a/tools/testing/selftests/user_events/user_events_selftests.h +++ b/tools/testing/selftests/user_events/user_events_selftests.h @@ -9,7 +9,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" static inline void tracefs_unmount(void) { diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index 238d609a457a..c620317eaeea 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -18,7 +18,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "vdso_config.h" #include "vdso_call.h" #include "parse_vdso.h" diff --git a/tools/testing/selftests/vDSO/vdso_test_chacha.c b/tools/testing/selftests/vDSO/vdso_test_chacha.c index 0aad682b12c8..9a5c9c05e09c 100644 --- a/tools/testing/selftests/vDSO/vdso_test_chacha.c +++ b/tools/testing/selftests/vDSO/vdso_test_chacha.c @@ -10,7 +10,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #if defined(__aarch64__) static bool cpu_has_capabilities(void) diff --git a/tools/testing/selftests/vDSO/vdso_test_correctness.c b/tools/testing/selftests/vDSO/vdso_test_correctness.c index da651cf53c6c..055af95aa552 100644 --- a/tools/testing/selftests/vDSO/vdso_test_correctness.c +++ b/tools/testing/selftests/vDSO/vdso_test_correctness.c @@ -21,7 +21,7 @@ #include "vdso_config.h" #include "vdso_call.h" -#include "../kselftest.h" +#include "kselftest.h" static const char **name; diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c index cdeaed45fb26..bea8ad54da11 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c +++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c @@ -11,7 +11,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "parse_vdso.h" #include "vdso_config.h" #include "vdso_call.h" diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c index dd1132508a0d..ef402001e898 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getrandom.c +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -23,7 +23,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #include "parse_vdso.h" #include "vdso_config.h" #include "vdso_call.h" diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index 9ce795b806f0..912edadad92c 100644 --- a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -16,7 +16,7 @@ #include #endif -#include "../kselftest.h" +#include "kselftest.h" #include "parse_vdso.h" #include "vdso_config.h" #include "vdso_call.h" diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 240409bf5f8a..fd9e86a73adc 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -8,7 +8,7 @@ #include #include -#include "../../../kselftest.h" +#include "kselftest.h" #define VFIO_LOG_AND_EXIT(...) do { \ fprintf(stderr, " " __VA_ARGS__); \ diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index a381fd253aa7..13e351f41c74 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -18,7 +18,7 @@ #include #include -#include "../../../kselftest.h" +#include "kselftest.h" #include #define PCI_SYSFS_PATH "/sys/bus/pci/devices" diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c index e5e8723ecb41..fb958d895c17 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include -#include "../../../kselftest.h" +#include "kselftest.h" #include #ifdef __x86_64__ diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 4f1ea79a200c..b82f215aa1c2 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -10,7 +10,7 @@ #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static const char *device_bdf; diff --git a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c index 3655106b912d..6854a9dedff3 100644 --- a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c +++ b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c @@ -11,7 +11,7 @@ #include #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static const char iommu_dev_path[] = "/dev/iommu"; static const char *cdev_path; diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index 7a270698e4d2..164c5f4b3903 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -12,7 +12,7 @@ #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static const char *device_bdf; diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index 2dbd70b7db62..d962fd77987a 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -7,7 +7,7 @@ #include -#include "../kselftest_harness.h" +#include "kselftest_harness.h" static const char *device_bdf; diff --git a/tools/testing/selftests/x86/corrupt_xstate_header.c b/tools/testing/selftests/x86/corrupt_xstate_header.c index 93a89a5997ca..f4d67b050275 100644 --- a/tools/testing/selftests/x86/corrupt_xstate_header.c +++ b/tools/testing/selftests/x86/corrupt_xstate_header.c @@ -17,7 +17,7 @@ #include #include -#include "../kselftest.h" /* For __cpuid_count() */ +#include "kselftest.h" /* For __cpuid_count() */ #include "helpers.h" static inline int xsave_enabled(void) diff --git a/tools/testing/selftests/x86/helpers.h b/tools/testing/selftests/x86/helpers.h index 6deaad035161..4c747a1278d9 100644 --- a/tools/testing/selftests/x86/helpers.h +++ b/tools/testing/selftests/x86/helpers.h @@ -7,7 +7,7 @@ #include -#include "../kselftest.h" +#include "kselftest.h" static inline unsigned long get_eflags(void) { diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 0873b0e5f48b..1919fa6daec0 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -18,7 +18,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" #ifndef __x86_64__ # error This test is 64-bit only diff --git a/tools/testing/selftests/x86/syscall_numbering.c b/tools/testing/selftests/x86/syscall_numbering.c index 41c42b7b54a6..ca0eca7b9dce 100644 --- a/tools/testing/selftests/x86/syscall_numbering.c +++ b/tools/testing/selftests/x86/syscall_numbering.c @@ -25,7 +25,7 @@ #include #include -#include "../kselftest.h" +#include "kselftest.h" /* Common system call numbers */ #define SYS_READ 0 diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c index 94bee6e0c813..a5edf6c5f17e 100644 --- a/tools/testing/selftests/x86/test_mremap_vdso.c +++ b/tools/testing/selftests/x86/test_mremap_vdso.c @@ -20,7 +20,7 @@ #include #include #include -#include "../kselftest.h" +#include "kselftest.h" #define PAGE_SIZE 4096 diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 05e1e6774fba..de55bb0992b2 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -21,7 +21,7 @@ #include #include "helpers.h" -#include "../kselftest.h" +#include "kselftest.h" #ifdef __x86_64__ #define TOTAL_TESTS 13 diff --git a/tools/testing/selftests/x86/xstate.h b/tools/testing/selftests/x86/xstate.h index e91e3092b5d2..6ee816e7625a 100644 --- a/tools/testing/selftests/x86/xstate.h +++ b/tools/testing/selftests/x86/xstate.h @@ -4,7 +4,7 @@ #include -#include "../kselftest.h" +#include "kselftest.h" #define XSAVE_HDR_OFFSET 512 #define XSAVE_HDR_SIZE 64 From 03d3963464a43654703938a66503cd686c5fc54e Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 1 Nov 2025 10:23:17 -0400 Subject: [PATCH 087/139] kho: make debugfs interface optional Patch series "liveupdate: Rework KHO for in-kernel users", v9. This series refactors the KHO framework to better support in-kernel users like the upcoming LUO. The current design, which relies on a notifier chain and debugfs for control, is too restrictive for direct programmatic use. The core of this rework is the removal of the notifier chain in favor of a direct registration API. This decouples clients from the shutdown-time finalization sequence, allowing them to manage their preserved state more flexibly and at any time. In support of this new model, this series also: - Makes the debugfs interface optional. - Introduces APIs to unpreserve memory and fixes a bug in the abort path where client state was being incorrectly discarded. Note that this is an interim step, as a more comprehensive fix is planned as part of the stateless KHO work [1]. - Moves all KHO code into a new kernel/liveupdate/ directory to consolidate live update components. This patch (of 9): Currently, KHO is controlled via debugfs interface, but once LUO is introduced, it can control KHO, and the debug interface becomes optional. Add a separate config CONFIG_KEXEC_HANDOVER_DEBUGFS that enables the debugfs interface, and allows to inspect the tree. Move all debugfs related code to a new file to keep the .c files clear of ifdefs. Link: https://lkml.kernel.org/r/20251101142325.1326536-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20251101142325.1326536-2-pasha.tatashin@soleen.com Link: https://lore.kernel.org/all/20251020100306.2709352-1-jasonmiu@google.com [1] Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Randy Dunlap Cc: Tejun Heo Cc: Changyuan Lyu Cc: Jason Gunthorpe Cc: Simon Horman Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- MAINTAINERS | 2 +- kernel/Kconfig.kexec | 12 +- kernel/Makefile | 1 + kernel/kexec_handover.c | 269 +++++--------------------- kernel/kexec_handover_debugfs.c | 216 +++++++++++++++++++++ kernel/kexec_handover_internal.h | 35 ++++ tools/testing/selftests/kho/vmtest.sh | 1 + 7 files changed, 314 insertions(+), 222 deletions(-) create mode 100644 kernel/kexec_handover_debugfs.c diff --git a/MAINTAINERS b/MAINTAINERS index bd22adb17846..a8a33db191bb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13799,7 +13799,7 @@ S: Maintained F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* F: include/linux/kexec_handover.h -F: kernel/kexec_handover.c +F: kernel/kexec_handover* F: lib/test_kho.c F: tools/testing/selftests/kho/ diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 54e581072617..cc6743137946 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -100,7 +100,6 @@ config KEXEC_HANDOVER depends on !DEFERRED_STRUCT_PAGE_INIT select MEMBLOCK_KHO_SCRATCH select KEXEC_FILE - select DEBUG_FS select LIBFDT select CMA help @@ -118,6 +117,17 @@ config KEXEC_HANDOVER_DEBUG scenarios and the extra code might be adding overhead it is only optionally enabled. +config KEXEC_HANDOVER_DEBUGFS + bool "kexec handover debugfs interface" + default KEXEC_HANDOVER + depends on KEXEC_HANDOVER + select DEBUG_FS + help + Allow to control kexec handover device tree via debugfs + interface, i.e. finalize the state or aborting the finalization. + Also, enables inspecting the KHO fdt trees with the debugfs binary + blobs. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index 9fe722305c9b..2cf7909a74e5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 03d12e27189f..befa6ceab574 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -30,6 +29,7 @@ */ #include "../mm/internal.h" #include "kexec_internal.h" +#include "kexec_handover_internal.h" #define KHO_FDT_COMPATIBLE "kho-v1" #define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" @@ -105,8 +105,6 @@ struct khoser_mem_chunk; struct kho_serialization { struct page *fdt; - struct list_head fdt_list; - struct dentry *sub_fdt_dir; struct kho_mem_track track; /* First chunk of serialized preserved memory map */ struct khoser_mem_chunk *preserved_mem_map; @@ -114,20 +112,16 @@ struct kho_serialization { struct kho_out { struct blocking_notifier_head chain_head; - - struct dentry *dir; - struct mutex lock; /* protects KHO FDT finalization */ - struct kho_serialization ser; bool finalized; + struct kho_debugfs dbg; }; static struct kho_out kho_out = { .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), .lock = __MUTEX_INITIALIZER(kho_out.lock), .ser = { - .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), .track = { .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), }, @@ -674,37 +668,6 @@ err_disable_kho: kho_enable = false; } -struct fdt_debugfs { - struct list_head list; - struct debugfs_blob_wrapper wrapper; - struct dentry *file; -}; - -static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, - const char *name, const void *fdt) -{ - struct fdt_debugfs *f; - struct dentry *file; - - f = kmalloc(sizeof(*f), GFP_KERNEL); - if (!f) - return -ENOMEM; - - f->wrapper.data = (void *)fdt; - f->wrapper.size = fdt_totalsize(fdt); - - file = debugfs_create_blob(name, 0400, dir, &f->wrapper); - if (IS_ERR(file)) { - kfree(f); - return PTR_ERR(file); - } - - f->file = file; - list_add(&f->list, list); - - return 0; -} - /** * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. * @ser: serialization control object passed by KHO notifiers. @@ -716,7 +679,8 @@ static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, * by KHO for the new kernel to retrieve it after kexec. * * A debugfs blob entry is also created at - * ``/sys/kernel/debug/kho/out/sub_fdts/@name``. + * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when kernel is configured with + * CONFIG_KEXEC_HANDOVER_DEBUGFS * * Return: 0 on success, error code on failure */ @@ -733,7 +697,7 @@ int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) if (err) return err; - return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt); + return kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false); } EXPORT_SYMBOL_GPL(kho_add_subtree); @@ -1065,30 +1029,7 @@ err_free_pages_array: } EXPORT_SYMBOL_GPL(kho_restore_vmalloc); -/* Handling for debug/kho/out */ - -static struct dentry *debugfs_root; - -static int kho_out_update_debugfs_fdt(void) -{ - int err = 0; - struct fdt_debugfs *ff, *tmp; - - if (kho_out.finalized) { - err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir, - "fdt", page_to_virt(kho_out.ser.fdt)); - } else { - list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) { - debugfs_remove(ff->file); - list_del(&ff->list); - kfree(ff); - } - } - - return err; -} - -static int kho_abort(void) +static int __kho_abort(void) { int err; unsigned long order; @@ -1121,7 +1062,28 @@ static int kho_abort(void) return err; } -static int kho_finalize(void) +int kho_abort(void) +{ + int ret = 0; + + if (!kho_enable) + return -EOPNOTSUPP; + + guard(mutex)(&kho_out.lock); + if (!kho_out.finalized) + return -ENOENT; + + ret = __kho_abort(); + if (ret) + return ret; + + kho_out.finalized = false; + kho_debugfs_cleanup(&kho_out.dbg); + + return 0; +} + +static int __kho_finalize(void) { int err = 0; u64 *preserved_mem_map; @@ -1164,118 +1126,46 @@ static int kho_finalize(void) abort: if (err) { pr_err("Failed to convert KHO state tree: %d\n", err); - kho_abort(); + __kho_abort(); } return err; } -static int kho_out_finalize_get(void *data, u64 *val) +int kho_finalize(void) { - mutex_lock(&kho_out.lock); - *val = kho_out.finalized; - mutex_unlock(&kho_out.lock); + int ret; - return 0; -} + if (!kho_enable) + return -EOPNOTSUPP; -static int kho_out_finalize_set(void *data, u64 _val) -{ - int ret = 0; - bool val = !!_val; - - mutex_lock(&kho_out.lock); - - if (val == kho_out.finalized) { - if (kho_out.finalized) - ret = -EEXIST; - else - ret = -ENOENT; - goto unlock; - } - - if (val) - ret = kho_finalize(); - else - ret = kho_abort(); + guard(mutex)(&kho_out.lock); + if (kho_out.finalized) + return -EEXIST; + ret = __kho_finalize(); if (ret) - goto unlock; + return ret; - kho_out.finalized = val; - ret = kho_out_update_debugfs_fdt(); + kho_out.finalized = true; -unlock: - mutex_unlock(&kho_out.lock); - return ret; + return kho_debugfs_fdt_add(&kho_out.dbg, "fdt", + page_to_virt(kho_out.ser.fdt), true); } -DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get, - kho_out_finalize_set, "%llu\n"); - -static int scratch_phys_show(struct seq_file *m, void *v) +bool kho_finalized(void) { - for (int i = 0; i < kho_scratch_cnt; i++) - seq_printf(m, "0x%llx\n", kho_scratch[i].addr); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(scratch_phys); - -static int scratch_len_show(struct seq_file *m, void *v) -{ - for (int i = 0; i < kho_scratch_cnt; i++) - seq_printf(m, "0x%llx\n", kho_scratch[i].size); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(scratch_len); - -static __init int kho_out_debugfs_init(void) -{ - struct dentry *dir, *f, *sub_fdt_dir; - - dir = debugfs_create_dir("out", debugfs_root); - if (IS_ERR(dir)) - return -ENOMEM; - - sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); - if (IS_ERR(sub_fdt_dir)) - goto err_rmdir; - - f = debugfs_create_file("scratch_phys", 0400, dir, NULL, - &scratch_phys_fops); - if (IS_ERR(f)) - goto err_rmdir; - - f = debugfs_create_file("scratch_len", 0400, dir, NULL, - &scratch_len_fops); - if (IS_ERR(f)) - goto err_rmdir; - - f = debugfs_create_file("finalize", 0600, dir, NULL, - &fops_kho_out_finalize); - if (IS_ERR(f)) - goto err_rmdir; - - kho_out.dir = dir; - kho_out.ser.sub_fdt_dir = sub_fdt_dir; - return 0; - -err_rmdir: - debugfs_remove_recursive(dir); - return -ENOENT; + guard(mutex)(&kho_out.lock); + return kho_out.finalized; } struct kho_in { - struct dentry *dir; phys_addr_t fdt_phys; phys_addr_t scratch_phys; - struct list_head fdt_list; + struct kho_debugfs dbg; }; static struct kho_in kho_in = { - .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list), }; static const void *kho_get_fdt(void) @@ -1339,56 +1229,6 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); -/* Handling for debugfs/kho/in */ - -static __init int kho_in_debugfs_init(const void *fdt) -{ - struct dentry *sub_fdt_dir; - int err, child; - - kho_in.dir = debugfs_create_dir("in", debugfs_root); - if (IS_ERR(kho_in.dir)) - return PTR_ERR(kho_in.dir); - - sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir); - if (IS_ERR(sub_fdt_dir)) { - err = PTR_ERR(sub_fdt_dir); - goto err_rmdir; - } - - err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt); - if (err) - goto err_rmdir; - - fdt_for_each_subnode(child, fdt, 0) { - int len = 0; - const char *name = fdt_get_name(fdt, child, NULL); - const u64 *fdt_phys; - - fdt_phys = fdt_getprop(fdt, child, "fdt", &len); - if (!fdt_phys) - continue; - if (len != sizeof(*fdt_phys)) { - pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n", - name, len); - continue; - } - err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name, - phys_to_virt(*fdt_phys)); - if (err) { - pr_warn("failed to add fdt `%s` to debugfs: %d\n", name, - err); - continue; - } - } - - return 0; - -err_rmdir: - debugfs_remove_recursive(kho_in.dir); - return err; -} - static __init int kho_init(void) { int err = 0; @@ -1403,27 +1243,16 @@ static __init int kho_init(void) goto err_free_scratch; } - debugfs_root = debugfs_create_dir("kho", NULL); - if (IS_ERR(debugfs_root)) { - err = -ENOENT; + err = kho_debugfs_init(); + if (err) goto err_free_fdt; - } - err = kho_out_debugfs_init(); + err = kho_out_debugfs_init(&kho_out.dbg); if (err) goto err_free_fdt; if (fdt) { - err = kho_in_debugfs_init(fdt); - /* - * Failure to create /sys/kernel/debug/kho/in does not prevent - * reviving state from KHO and setting up KHO for the next - * kexec. - */ - if (err) - pr_err("failed exposing handover FDT in debugfs: %d\n", - err); - + kho_in_debugfs_init(&kho_in.dbg, fdt); return 0; } diff --git a/kernel/kexec_handover_debugfs.c b/kernel/kexec_handover_debugfs.c new file mode 100644 index 000000000000..a91b279f1b23 --- /dev/null +++ b/kernel/kexec_handover_debugfs.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debugfs.c - kexec handover debugfs interfaces + * Copyright (C) 2023 Alexander Graf + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport + * Copyright (C) 2025 Google LLC, Changyuan Lyu + * Copyright (C) 2025 Google LLC, Pasha Tatashin + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include +#include +#include +#include +#include "kexec_handover_internal.h" + +static struct dentry *debugfs_root; + +struct fdt_debugfs { + struct list_head list; + struct debugfs_blob_wrapper wrapper; + struct dentry *file; +}; + +static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, + const char *name, const void *fdt) +{ + struct fdt_debugfs *f; + struct dentry *file; + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOMEM; + + f->wrapper.data = (void *)fdt; + f->wrapper.size = fdt_totalsize(fdt); + + file = debugfs_create_blob(name, 0400, dir, &f->wrapper); + if (IS_ERR(file)) { + kfree(f); + return PTR_ERR(file); + } + + f->file = file; + list_add(&f->list, list); + + return 0; +} + +int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) +{ + struct dentry *dir; + + if (root) + dir = dbg->dir; + else + dir = dbg->sub_fdt_dir; + + return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); +} + +void kho_debugfs_cleanup(struct kho_debugfs *dbg) +{ + struct fdt_debugfs *ff, *tmp; + + list_for_each_entry_safe(ff, tmp, &dbg->fdt_list, list) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + } +} + +static int kho_out_finalize_get(void *data, u64 *val) +{ + *val = kho_finalized(); + + return 0; +} + +static int kho_out_finalize_set(void *data, u64 val) +{ + if (val) + return kho_finalize(); + else + return kho_abort(); +} + +DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get, + kho_out_finalize_set, "%llu\n"); + +static int scratch_phys_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_phys); + +static int scratch_len_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].size); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_len); + +__init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) +{ + struct dentry *dir, *sub_fdt_dir; + int err, child; + + INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("in", debugfs_root); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto err_out; + } + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) { + err = PTR_ERR(sub_fdt_dir); + goto err_rmdir; + } + + err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt); + if (err) + goto err_rmdir; + + fdt_for_each_subnode(child, fdt, 0) { + int len = 0; + const char *name = fdt_get_name(fdt, child, NULL); + const u64 *fdt_phys; + + fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + if (!fdt_phys) + continue; + if (len != sizeof(*fdt_phys)) { + pr_warn("node %s prop fdt has invalid length: %d\n", + name, len); + continue; + } + err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name, + phys_to_virt(*fdt_phys)); + if (err) { + pr_warn("failed to add fdt %s to debugfs: %d\n", name, + err); + continue; + } + } + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + + return; +err_rmdir: + debugfs_remove_recursive(dir); +err_out: + /* + * Failure to create /sys/kernel/debug/kho/in does not prevent + * reviving state from KHO and setting up KHO for the next + * kexec. + */ + if (err) + pr_err("failed exposing handover FDT in debugfs: %d\n", err); +} + +__init int kho_out_debugfs_init(struct kho_debugfs *dbg) +{ + struct dentry *dir, *f, *sub_fdt_dir; + + INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("out", debugfs_root); + if (IS_ERR(dir)) + return -ENOMEM; + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) + goto err_rmdir; + + f = debugfs_create_file("scratch_phys", 0400, dir, NULL, + &scratch_phys_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("scratch_len", 0400, dir, NULL, + &scratch_len_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("finalize", 0600, dir, NULL, + &kho_out_finalize_fops); + if (IS_ERR(f)) + goto err_rmdir; + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + return 0; + +err_rmdir: + debugfs_remove_recursive(dir); + return -ENOENT; +} + +__init int kho_debugfs_init(void) +{ + debugfs_root = debugfs_create_dir("kho", NULL); + if (IS_ERR(debugfs_root)) + return -ENOENT; + return 0; +} diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h index 3c3c7148ceed..217b8b25a542 100644 --- a/kernel/kexec_handover_internal.h +++ b/kernel/kexec_handover_internal.h @@ -3,11 +3,46 @@ #define LINUX_KEXEC_HANDOVER_INTERNAL_H #include +#include #include +#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +#include + +struct kho_debugfs { + struct dentry *dir; + struct dentry *sub_fdt_dir; + struct list_head fdt_list; +}; + +#else +struct kho_debugfs {}; +#endif + extern struct kho_scratch *kho_scratch; extern unsigned int kho_scratch_cnt; +bool kho_finalized(void); +int kho_finalize(void); +int kho_abort(void); + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +int kho_debugfs_init(void); +void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); +int kho_out_debugfs_init(struct kho_debugfs *dbg); +int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root); +void kho_debugfs_cleanup(struct kho_debugfs *dbg); +#else +static inline int kho_debugfs_init(void) { return 0; } +static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, + const void *fdt) { } +static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } +static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) { return 0; } +static inline void kho_debugfs_cleanup(struct kho_debugfs *dbg) {} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */ + #ifdef CONFIG_KEXEC_HANDOVER_DEBUG bool kho_scratch_overlap(phys_addr_t phys, size_t size); #else diff --git a/tools/testing/selftests/kho/vmtest.sh b/tools/testing/selftests/kho/vmtest.sh index 3f6c17166846..49fdac8e8b15 100755 --- a/tools/testing/selftests/kho/vmtest.sh +++ b/tools/testing/selftests/kho/vmtest.sh @@ -59,6 +59,7 @@ function build_kernel() { tee "$kconfig" > "$kho_config" < Date: Sat, 1 Nov 2025 10:23:18 -0400 Subject: [PATCH 088/139] kho: drop notifiers The KHO framework uses a notifier chain as the mechanism for clients to participate in the finalization process. While this works for a single, central state machine, it is too restrictive for kernel-internal components like pstore/reserve_mem or IMA. These components need a simpler, direct way to register their state for preservation (e.g., during their initcall) without being part of a complex, shutdown-time notifier sequence. The notifier model forces all participants into a single finalization flow and makes direct preservation from an arbitrary context difficult. This patch refactors the client participation model by removing the notifier chain and introducing a direct API for managing FDT subtrees. The core kho_finalize() and kho_abort() state machine remains, but clients now register their data with KHO beforehand. Link: https://lkml.kernel.org/r/20251101142325.1326536-3-pasha.tatashin@soleen.com Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Pasha Tatashin Signed-off-by: Pasha Tatashin Cc: Alexander Graf Cc: Changyuan Lyu Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Simon Horman Cc: Tejun Heo Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 32 ++---- kernel/kexec_handover.c | 164 +++++++++++++++++-------------- kernel/kexec_handover_debugfs.c | 15 +-- kernel/kexec_handover_internal.h | 5 +- lib/test_kho.c | 35 +------ mm/memblock.c | 62 +++--------- 6 files changed, 125 insertions(+), 188 deletions(-) diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 25042c1d8d54..0d860d793b66 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -10,14 +10,7 @@ struct kho_scratch { phys_addr_t size; }; -/* KHO Notifier index */ -enum kho_event { - KEXEC_KHO_FINALIZE = 0, - KEXEC_KHO_ABORT = 1, -}; - struct folio; -struct notifier_block; struct page; #define DECLARE_KHOSER_PTR(name, type) \ @@ -37,8 +30,6 @@ struct page; (typeof((s).ptr))((s).phys ? phys_to_virt((s).phys) : NULL); \ }) -struct kho_serialization; - struct kho_vmalloc_chunk; struct kho_vmalloc { DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *); @@ -57,12 +48,10 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); -int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); +int kho_add_subtree(const char *name, void *fdt); +void kho_remove_subtree(void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); -int register_kho_notifier(struct notifier_block *nb); -int unregister_kho_notifier(struct notifier_block *nb); - void kho_memory_init(void); void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, @@ -110,27 +99,20 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) return NULL; } -static inline int kho_add_subtree(struct kho_serialization *ser, - const char *name, void *fdt) +static inline int kho_add_subtree(const char *name, void *fdt) { return -EOPNOTSUPP; } +static inline void kho_remove_subtree(void *fdt) +{ +} + static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) { return -EOPNOTSUPP; } -static inline int register_kho_notifier(struct notifier_block *nb) -{ - return -EOPNOTSUPP; -} - -static inline int unregister_kho_notifier(struct notifier_block *nb) -{ - return -EOPNOTSUPP; -} - static inline void kho_memory_init(void) { } diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index befa6ceab574..3dd917bfedcc 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -103,29 +102,34 @@ struct kho_mem_track { struct khoser_mem_chunk; -struct kho_serialization { - struct page *fdt; - struct kho_mem_track track; - /* First chunk of serialized preserved memory map */ - struct khoser_mem_chunk *preserved_mem_map; +struct kho_sub_fdt { + struct list_head l; + const char *name; + void *fdt; }; struct kho_out { - struct blocking_notifier_head chain_head; - struct mutex lock; /* protects KHO FDT finalization */ - struct kho_serialization ser; + void *fdt; bool finalized; + struct mutex lock; /* protects KHO FDT finalization */ + + struct list_head sub_fdts; + struct mutex fdts_lock; + + struct kho_mem_track track; + /* First chunk of serialized preserved memory map */ + struct khoser_mem_chunk *preserved_mem_map; + struct kho_debugfs dbg; }; static struct kho_out kho_out = { - .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), .lock = __MUTEX_INITIALIZER(kho_out.lock), - .ser = { - .track = { - .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), - }, + .track = { + .orders = XARRAY_INIT(kho_out.track.orders, 0), }, + .sub_fdts = LIST_HEAD_INIT(kho_out.sub_fdts), + .fdts_lock = __MUTEX_INITIALIZER(kho_out.fdts_lock), .finalized = false, }; @@ -369,7 +373,7 @@ static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) } } -static int kho_mem_serialize(struct kho_serialization *ser) +static int kho_mem_serialize(struct kho_out *kho_out) { struct khoser_mem_chunk *first_chunk = NULL; struct khoser_mem_chunk *chunk = NULL; @@ -377,7 +381,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) unsigned long order; int err = -ENOMEM; - xa_for_each(&ser->track.orders, order, physxa) { + xa_for_each(&kho_out->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys; @@ -409,7 +413,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) } } - ser->preserved_mem_map = first_chunk; + kho_out->preserved_mem_map = first_chunk; return 0; @@ -670,7 +674,6 @@ err_disable_kho: /** * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. - * @ser: serialization control object passed by KHO notifiers. * @name: name of the sub tree. * @fdt: the sub tree blob. * @@ -684,34 +687,41 @@ err_disable_kho: * * Return: 0 on success, error code on failure */ -int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) +int kho_add_subtree(const char *name, void *fdt) { - int err = 0; - u64 phys = (u64)virt_to_phys(fdt); - void *root = page_to_virt(ser->fdt); + struct kho_sub_fdt *sub_fdt; - err |= fdt_begin_node(root, name); - err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); - err |= fdt_end_node(root); + sub_fdt = kmalloc(sizeof(*sub_fdt), GFP_KERNEL); + if (!sub_fdt) + return -ENOMEM; - if (err) - return err; + INIT_LIST_HEAD(&sub_fdt->l); + sub_fdt->name = name; + sub_fdt->fdt = fdt; - return kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false); + guard(mutex)(&kho_out.fdts_lock); + list_add_tail(&sub_fdt->l, &kho_out.sub_fdts); + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false)); + + return 0; } EXPORT_SYMBOL_GPL(kho_add_subtree); -int register_kho_notifier(struct notifier_block *nb) +void kho_remove_subtree(void *fdt) { - return blocking_notifier_chain_register(&kho_out.chain_head, nb); -} -EXPORT_SYMBOL_GPL(register_kho_notifier); + struct kho_sub_fdt *sub_fdt; -int unregister_kho_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&kho_out.chain_head, nb); + guard(mutex)(&kho_out.fdts_lock); + list_for_each_entry(sub_fdt, &kho_out.sub_fdts, l) { + if (sub_fdt->fdt == fdt) { + list_del(&sub_fdt->l); + kfree(sub_fdt); + kho_debugfs_fdt_remove(&kho_out.dbg, fdt); + break; + } + } } -EXPORT_SYMBOL_GPL(unregister_kho_notifier); +EXPORT_SYMBOL_GPL(kho_remove_subtree); /** * kho_preserve_folio - preserve a folio across kexec. @@ -726,7 +736,7 @@ int kho_preserve_folio(struct folio *folio) { const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) return -EINVAL; @@ -747,7 +757,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio); */ int kho_preserve_pages(struct page *page, unsigned int nr_pages) { - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn = start_pfn; @@ -849,7 +859,7 @@ err_free: static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, unsigned short order) { - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); __kho_unpreserve(track, pfn, pfn + 1); @@ -1031,11 +1041,11 @@ EXPORT_SYMBOL_GPL(kho_restore_vmalloc); static int __kho_abort(void) { - int err; + int err = 0; unsigned long order; struct kho_mem_phys *physxa; - xa_for_each(&kho_out.ser.track.orders, order, physxa) { + xa_for_each(&kho_out.track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys; @@ -1045,17 +1055,13 @@ static int __kho_abort(void) xa_destroy(&physxa->phys_bits); kfree(physxa); } - xa_destroy(&kho_out.ser.track.orders); + xa_destroy(&kho_out.track.orders); - if (kho_out.ser.preserved_mem_map) { - kho_mem_ser_free(kho_out.ser.preserved_mem_map); - kho_out.ser.preserved_mem_map = NULL; + if (kho_out.preserved_mem_map) { + kho_mem_ser_free(kho_out.preserved_mem_map); + kho_out.preserved_mem_map = NULL; } - err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT, - NULL); - err = notifier_to_errno(err); - if (err) pr_err("Failed to abort KHO finalization: %d\n", err); @@ -1078,7 +1084,8 @@ int kho_abort(void) return ret; kho_out.finalized = false; - kho_debugfs_cleanup(&kho_out.dbg); + + kho_debugfs_fdt_remove(&kho_out.dbg, kho_out.fdt); return 0; } @@ -1087,41 +1094,46 @@ static int __kho_finalize(void) { int err = 0; u64 *preserved_mem_map; - void *fdt = page_to_virt(kho_out.ser.fdt); + void *root = kho_out.fdt; + struct kho_sub_fdt *fdt; - err |= fdt_create(fdt, PAGE_SIZE); - err |= fdt_finish_reservemap(fdt); - err |= fdt_begin_node(fdt, ""); - err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE); + err |= fdt_create(root, PAGE_SIZE); + err |= fdt_finish_reservemap(root); + err |= fdt_begin_node(root, ""); + err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); /** * Reserve the preserved-memory-map property in the root FDT, so * that all property definitions will precede subnodes created by * KHO callers. */ - err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP, + err |= fdt_property_placeholder(root, PROP_PRESERVED_MEMORY_MAP, sizeof(*preserved_mem_map), (void **)&preserved_mem_map); if (err) goto abort; - err = kho_preserve_folio(page_folio(kho_out.ser.fdt)); + err = kho_preserve_folio(virt_to_folio(kho_out.fdt)); if (err) goto abort; - err = blocking_notifier_call_chain(&kho_out.chain_head, - KEXEC_KHO_FINALIZE, &kho_out.ser); - err = notifier_to_errno(err); + err = kho_mem_serialize(&kho_out); if (err) goto abort; - err = kho_mem_serialize(&kho_out.ser); - if (err) - goto abort; + *preserved_mem_map = (u64)virt_to_phys(kho_out.preserved_mem_map); - *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map); + mutex_lock(&kho_out.fdts_lock); + list_for_each_entry(fdt, &kho_out.sub_fdts, l) { + phys_addr_t phys = virt_to_phys(fdt->fdt); - err |= fdt_end_node(fdt); - err |= fdt_finish(fdt); + err |= fdt_begin_node(root, fdt->name); + err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); + err |= fdt_end_node(root); + } + mutex_unlock(&kho_out.fdts_lock); + + err |= fdt_end_node(root); + err |= fdt_finish(root); abort: if (err) { @@ -1149,8 +1161,10 @@ int kho_finalize(void) kho_out.finalized = true; - return kho_debugfs_fdt_add(&kho_out.dbg, "fdt", - page_to_virt(kho_out.ser.fdt), true); + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", + kho_out.fdt, true)); + + return 0; } bool kho_finalized(void) @@ -1233,15 +1247,17 @@ static __init int kho_init(void) { int err = 0; const void *fdt = kho_get_fdt(); + struct page *fdt_page; if (!kho_enable) return 0; - kho_out.ser.fdt = alloc_page(GFP_KERNEL); - if (!kho_out.ser.fdt) { + fdt_page = alloc_page(GFP_KERNEL); + if (!fdt_page) { err = -ENOMEM; goto err_free_scratch; } + kho_out.fdt = page_to_virt(fdt_page); err = kho_debugfs_init(); if (err) @@ -1269,8 +1285,8 @@ static __init int kho_init(void) return 0; err_free_fdt: - put_page(kho_out.ser.fdt); - kho_out.ser.fdt = NULL; + put_page(fdt_page); + kho_out.fdt = NULL; err_free_scratch: for (int i = 0; i < kho_scratch_cnt; i++) { void *start = __va(kho_scratch[i].addr); @@ -1281,7 +1297,7 @@ err_free_scratch: kho_enable = false; return err; } -late_initcall(kho_init); +fs_initcall(kho_init); static void __init kho_release_scratch(void) { @@ -1417,7 +1433,7 @@ int kho_fill_kimage(struct kimage *image) if (!kho_out.finalized) return 0; - image->kho.fdt = page_to_phys(kho_out.ser.fdt); + image->kho.fdt = virt_to_phys(kho_out.fdt); scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; scratch = (struct kexec_buf){ diff --git a/kernel/kexec_handover_debugfs.c b/kernel/kexec_handover_debugfs.c index a91b279f1b23..46e9e6c0791f 100644 --- a/kernel/kexec_handover_debugfs.c +++ b/kernel/kexec_handover_debugfs.c @@ -61,14 +61,17 @@ int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); } -void kho_debugfs_cleanup(struct kho_debugfs *dbg) +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) { - struct fdt_debugfs *ff, *tmp; + struct fdt_debugfs *ff; - list_for_each_entry_safe(ff, tmp, &dbg->fdt_list, list) { - debugfs_remove(ff->file); - list_del(&ff->list); - kfree(ff); + list_for_each_entry(ff, &dbg->fdt_list, list) { + if (ff->wrapper.data == fdt) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + break; + } } } diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h index 217b8b25a542..52ed73659fe6 100644 --- a/kernel/kexec_handover_internal.h +++ b/kernel/kexec_handover_internal.h @@ -32,7 +32,7 @@ void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); int kho_out_debugfs_init(struct kho_debugfs *dbg); int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, const void *fdt, bool root); -void kho_debugfs_cleanup(struct kho_debugfs *dbg); +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt); #else static inline int kho_debugfs_init(void) { return 0; } static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, @@ -40,7 +40,8 @@ static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, const void *fdt, bool root) { return 0; } -static inline void kho_debugfs_cleanup(struct kho_debugfs *dbg) {} +static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, + void *fdt) { } #endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */ #ifdef CONFIG_KEXEC_HANDOVER_DEBUG diff --git a/lib/test_kho.c b/lib/test_kho.c index fff018e5548d..27618c5b4796 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -39,33 +39,6 @@ struct kho_test_state { static struct kho_test_state kho_test_state; -static int kho_test_notifier(struct notifier_block *self, unsigned long cmd, - void *v) -{ - struct kho_test_state *state = &kho_test_state; - struct kho_serialization *ser = v; - int err = 0; - - switch (cmd) { - case KEXEC_KHO_ABORT: - return NOTIFY_DONE; - case KEXEC_KHO_FINALIZE: - /* Handled below */ - break; - default: - return NOTIFY_BAD; - } - - err |= kho_preserve_folio(state->fdt); - err |= kho_add_subtree(ser, KHO_TEST_FDT, folio_address(state->fdt)); - - return err ? NOTIFY_BAD : NOTIFY_DONE; -} - -static struct notifier_block kho_test_nb = { - .notifier_call = kho_test_notifier, -}; - static int kho_test_save_data(struct kho_test_state *state, void *fdt) { phys_addr_t *folios_info __free(kvfree) = NULL; @@ -120,6 +93,7 @@ static int kho_test_prepare_fdt(struct kho_test_state *state) fdt = folio_address(state->fdt); + err |= kho_preserve_folio(state->fdt); err |= fdt_create(fdt, fdt_size); err |= fdt_finish_reservemap(fdt); @@ -131,6 +105,7 @@ static int kho_test_prepare_fdt(struct kho_test_state *state) err |= fdt_finish(fdt); + err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt)); if (err) folio_put(state->fdt); @@ -203,10 +178,6 @@ static int kho_test_save(void) if (err) goto err_free_folios; - err = register_kho_notifier(&kho_test_nb); - if (err) - goto err_free_fdt; - return 0; err_free_fdt: @@ -329,7 +300,7 @@ static void kho_test_cleanup(void) static void __exit kho_test_exit(void) { - unregister_kho_notifier(&kho_test_nb); + kho_remove_subtree(folio_address(kho_test_state.fdt)); kho_test_cleanup(); } module_exit(kho_test_exit); diff --git a/mm/memblock.c b/mm/memblock.c index e23e16618e9b..e3bef9b35d63 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2444,53 +2444,18 @@ int reserve_mem_release_by_name(const char *name) #define MEMBLOCK_KHO_FDT "memblock" #define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1" #define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" -static struct page *kho_fdt; - -static int reserve_mem_kho_finalize(struct kho_serialization *ser) -{ - int err = 0, i; - - for (i = 0; i < reserved_mem_count; i++) { - struct reserve_mem_table *map = &reserved_mem_table[i]; - struct page *page = phys_to_page(map->start); - unsigned int nr_pages = map->size >> PAGE_SHIFT; - - err |= kho_preserve_pages(page, nr_pages); - } - - err |= kho_preserve_folio(page_folio(kho_fdt)); - err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt)); - - return notifier_from_errno(err); -} - -static int reserve_mem_kho_notifier(struct notifier_block *self, - unsigned long cmd, void *v) -{ - switch (cmd) { - case KEXEC_KHO_FINALIZE: - return reserve_mem_kho_finalize((struct kho_serialization *)v); - case KEXEC_KHO_ABORT: - return NOTIFY_DONE; - default: - return NOTIFY_BAD; - } -} - -static struct notifier_block reserve_mem_kho_nb = { - .notifier_call = reserve_mem_kho_notifier, -}; static int __init prepare_kho_fdt(void) { int err = 0, i; + struct page *fdt_page; void *fdt; - kho_fdt = alloc_page(GFP_KERNEL); - if (!kho_fdt) + fdt_page = alloc_page(GFP_KERNEL); + if (!fdt_page) return -ENOMEM; - fdt = page_to_virt(kho_fdt); + fdt = page_to_virt(fdt_page); err |= fdt_create(fdt, PAGE_SIZE); err |= fdt_finish_reservemap(fdt); @@ -2499,7 +2464,10 @@ static int __init prepare_kho_fdt(void) err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); for (i = 0; i < reserved_mem_count; i++) { struct reserve_mem_table *map = &reserved_mem_table[i]; + struct page *page = phys_to_page(map->start); + unsigned int nr_pages = map->size >> PAGE_SHIFT; + err |= kho_preserve_pages(page, nr_pages); err |= fdt_begin_node(fdt, map->name); err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); err |= fdt_property(fdt, "start", &map->start, sizeof(map->start)); @@ -2507,13 +2475,16 @@ static int __init prepare_kho_fdt(void) err |= fdt_end_node(fdt); } err |= fdt_end_node(fdt); - err |= fdt_finish(fdt); + err |= kho_preserve_folio(page_folio(fdt_page)); + + if (!err) + err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); + if (err) { pr_err("failed to prepare memblock FDT for KHO: %d\n", err); - put_page(kho_fdt); - kho_fdt = NULL; + put_page(fdt_page); } return err; @@ -2529,13 +2500,6 @@ static int __init reserve_mem_init(void) err = prepare_kho_fdt(); if (err) return err; - - err = register_kho_notifier(&reserve_mem_kho_nb); - if (err) { - put_page(kho_fdt); - kho_fdt = NULL; - } - return err; } late_initcall(reserve_mem_init); From 36f8f7ef7fd2f238922e9d217e86c69838319d8c Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 1 Nov 2025 10:23:19 -0400 Subject: [PATCH 089/139] kho: add interfaces to unpreserve folios, page ranges, and vmalloc Allow users of KHO to cancel the previous preservation by adding the necessary interfaces to unpreserve folio, pages, and vmallocs. Link: https://lkml.kernel.org/r/20251101142325.1326536-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Changyuan Lyu Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Randy Dunlap Cc: Simon Horman Cc: Tejun Heo Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 18 ++++++ kernel/kexec_handover.c | 106 ++++++++++++++++++++++++++++----- 2 files changed, 110 insertions(+), 14 deletions(-) diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 0d860d793b66..80ece4232617 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -43,8 +43,11 @@ bool kho_is_enabled(void); bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); +int kho_unpreserve_folio(struct folio *folio); int kho_preserve_pages(struct page *page, unsigned int nr_pages); +int kho_unpreserve_pages(struct page *page, unsigned int nr_pages); int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); +int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); @@ -72,17 +75,32 @@ static inline int kho_preserve_folio(struct folio *folio) return -EOPNOTSUPP; } +static inline int kho_unpreserve_folio(struct folio *folio) +{ + return -EOPNOTSUPP; +} + static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages) { return -EOPNOTSUPP; } +static inline int kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +{ + return -EOPNOTSUPP; +} + static inline int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) { return -EOPNOTSUPP; } +static inline int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) +{ + return -EOPNOTSUPP; +} + static inline struct folio *kho_restore_folio(phys_addr_t phys) { return NULL; diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 3dd917bfedcc..4e033f96637d 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -157,26 +157,33 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) return no_free_ptr(elm); } -static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, - unsigned long end_pfn) +static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) { struct kho_mem_phys_bits *bits; struct kho_mem_phys *physxa; + const unsigned long pfn_high = pfn >> order; + + physxa = xa_load(&track->orders, order); + if (WARN_ON_ONCE(!physxa)) + return; + + bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (WARN_ON_ONCE(!bits)) + return; + + clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); +} + +static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, + unsigned long end_pfn) +{ + unsigned int order; while (pfn < end_pfn) { - const unsigned int order = - min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - const unsigned long pfn_high = pfn >> order; + order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - physxa = xa_load(&track->orders, order); - if (WARN_ON_ONCE(!physxa)) - return; - - bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (WARN_ON_ONCE(!bits)) - return; - - clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); + __kho_unpreserve_order(track, pfn, order); pfn += 1 << order; } @@ -745,6 +752,30 @@ int kho_preserve_folio(struct folio *folio) } EXPORT_SYMBOL_GPL(kho_preserve_folio); +/** + * kho_unpreserve_folio - unpreserve a folio. + * @folio: folio to unpreserve. + * + * Instructs KHO to unpreserve a folio that was preserved by + * kho_preserve_folio() before. The provided @folio (pfn and order) + * must exactly match a previously preserved folio. + * + * Return: 0 on success, error code on failure + */ +int kho_unpreserve_folio(struct folio *folio) +{ + const unsigned long pfn = folio_pfn(folio); + const unsigned int order = folio_order(folio); + struct kho_mem_track *track = &kho_out.track; + + if (kho_out.finalized) + return -EBUSY; + + __kho_unpreserve_order(track, pfn, order); + return 0; +} +EXPORT_SYMBOL_GPL(kho_unpreserve_folio); + /** * kho_preserve_pages - preserve contiguous pages across kexec * @page: first page in the list. @@ -789,6 +820,33 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) } EXPORT_SYMBOL_GPL(kho_preserve_pages); +/** + * kho_unpreserve_pages - unpreserve contiguous pages. + * @page: first page in the list. + * @nr_pages: number of pages. + * + * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page. + * This must be called with the same @page and @nr_pages as the corresponding + * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger + * preserved blocks is not supported. + * + * Return: 0 on success, error code on failure + */ +int kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +{ + struct kho_mem_track *track = &kho_out.track; + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + + if (kho_out.finalized) + return -EBUSY; + + __kho_unpreserve(track, start_pfn, end_pfn); + + return 0; +} +EXPORT_SYMBOL_GPL(kho_unpreserve_pages); + struct kho_vmalloc_hdr { DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); }; @@ -950,6 +1008,26 @@ err_free: } EXPORT_SYMBOL_GPL(kho_preserve_vmalloc); +/** + * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc() + * @preservation: preservation metadata returned by kho_preserve_vmalloc() + * + * Instructs KHO to unpreserve the area in vmalloc address space that was + * previously preserved with kho_preserve_vmalloc(). + * + * Return: 0 on success, error code on failure + */ +int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) +{ + if (kho_out.finalized) + return -EBUSY; + + kho_vmalloc_free_chunks(preservation); + + return 0; +} +EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc); + /** * kho_restore_vmalloc - recreates and populates an area in vmalloc address * space from the preserved memory. From f5bfd4793a933c7074b0d3748068b255d232bfa0 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 1 Nov 2025 10:23:20 -0400 Subject: [PATCH 090/139] memblock: unpreserve memory in case of error If there is an error half way through KHO memory preservation, we should rollback and unpreserve everything that is partially preserved. [akpm@linux-foundation.org: s/err_no_fdt_page/err_report/ in prepare_kho_fdt(), per Mike] Link: https://lkml.kernel.org/r/20251101142325.1326536-5-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Suggested-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Changyuan Lyu Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Randy Dunlap Cc: Simon Horman Cc: Tejun Heo Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- mm/memblock.c | 91 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 24 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index e3bef9b35d63..c7869860e659 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2445,29 +2445,60 @@ int reserve_mem_release_by_name(const char *name) #define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1" #define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" -static int __init prepare_kho_fdt(void) +static int __init reserved_mem_preserve(void) { - int err = 0, i; - struct page *fdt_page; - void *fdt; + unsigned int nr_preserved = 0; + int err; - fdt_page = alloc_page(GFP_KERNEL); - if (!fdt_page) - return -ENOMEM; - - fdt = page_to_virt(fdt_page); - - err |= fdt_create(fdt, PAGE_SIZE); - err |= fdt_finish_reservemap(fdt); - - err |= fdt_begin_node(fdt, ""); - err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); - for (i = 0; i < reserved_mem_count; i++) { + for (unsigned int i = 0; i < reserved_mem_count; i++, nr_preserved++) { struct reserve_mem_table *map = &reserved_mem_table[i]; struct page *page = phys_to_page(map->start); unsigned int nr_pages = map->size >> PAGE_SHIFT; - err |= kho_preserve_pages(page, nr_pages); + err = kho_preserve_pages(page, nr_pages); + if (err) + goto err_unpreserve; + } + + return 0; + +err_unpreserve: + for (unsigned int i = 0; i < nr_preserved; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + struct page *page = phys_to_page(map->start); + unsigned int nr_pages = map->size >> PAGE_SHIFT; + + kho_unpreserve_pages(page, nr_pages); + } + + return err; +} + +static int __init prepare_kho_fdt(void) +{ + struct page *fdt_page; + void *fdt; + int err; + + fdt_page = alloc_page(GFP_KERNEL); + if (!fdt_page) { + err = -ENOMEM; + goto err_report; + } + + fdt = page_to_virt(fdt_page); + err = kho_preserve_pages(fdt_page, 1); + if (err) + goto err_free_fdt; + + err |= fdt_create(fdt, PAGE_SIZE); + err |= fdt_finish_reservemap(fdt); + err |= fdt_begin_node(fdt, ""); + err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); + + for (unsigned int i = 0; !err && i < reserved_mem_count; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + err |= fdt_begin_node(fdt, map->name); err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); err |= fdt_property(fdt, "start", &map->start, sizeof(map->start)); @@ -2477,15 +2508,27 @@ static int __init prepare_kho_fdt(void) err |= fdt_end_node(fdt); err |= fdt_finish(fdt); - err |= kho_preserve_folio(page_folio(fdt_page)); + if (err) + goto err_unpreserve_fdt; - if (!err) - err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); + err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); + if (err) + goto err_unpreserve_fdt; - if (err) { - pr_err("failed to prepare memblock FDT for KHO: %d\n", err); - put_page(fdt_page); - } + err = reserved_mem_preserve(); + if (err) + goto err_remove_subtree; + + return 0; + +err_remove_subtree: + kho_remove_subtree(fdt); +err_unpreserve_fdt: + kho_unpreserve_pages(fdt_page, 1); +err_free_fdt: + put_page(fdt_page); +err_report: + pr_err("failed to prepare memblock FDT for KHO: %d\n", err); return err; } From ce405ed5102018946f20b69c1e7ae49697dcf616 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 1 Nov 2025 10:23:21 -0400 Subject: [PATCH 091/139] test_kho: unpreserve memory in case of error If there is an error half way through KHO memory preservation, we should rollback and unpreserve everything that is partially preserved. Link: https://lkml.kernel.org/r/20251101142325.1326536-6-pasha.tatashin@soleen.com Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Pasha Tatashin Cc: Alexander Graf Cc: Changyuan Lyu Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Simon Horman Cc: Tejun Heo Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- lib/test_kho.c | 103 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 30 deletions(-) diff --git a/lib/test_kho.c b/lib/test_kho.c index 27618c5b4796..85b60d87a50a 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -33,17 +33,28 @@ struct kho_test_state { unsigned int nr_folios; struct folio **folios; phys_addr_t *folios_info; + struct kho_vmalloc folios_info_phys; + int nr_folios_preserved; struct folio *fdt; __wsum csum; }; static struct kho_test_state kho_test_state; -static int kho_test_save_data(struct kho_test_state *state, void *fdt) +static void kho_test_unpreserve_data(struct kho_test_state *state) +{ + for (int i = 0; i < state->nr_folios_preserved; i++) + kho_unpreserve_folio(state->folios[i]); + + kho_unpreserve_vmalloc(&state->folios_info_phys); + vfree(state->folios_info); +} + +static int kho_test_preserve_data(struct kho_test_state *state) { - phys_addr_t *folios_info __free(kvfree) = NULL; struct kho_vmalloc folios_info_phys; - int err = 0; + phys_addr_t *folios_info; + int err; folios_info = vmalloc_array(state->nr_folios, sizeof(*folios_info)); if (!folios_info) @@ -51,64 +62,98 @@ static int kho_test_save_data(struct kho_test_state *state, void *fdt) err = kho_preserve_vmalloc(folios_info, &folios_info_phys); if (err) - return err; + goto err_free_info; + + state->folios_info_phys = folios_info_phys; + state->folios_info = folios_info; for (int i = 0; i < state->nr_folios; i++) { struct folio *folio = state->folios[i]; unsigned int order = folio_order(folio); folios_info[i] = virt_to_phys(folio_address(folio)) | order; - err = kho_preserve_folio(folio); if (err) - break; + goto err_unpreserve; + state->nr_folios_preserved++; } + return 0; + +err_unpreserve: + /* + * kho_test_unpreserve_data frees folio_info, bail out immediately to + * avoid double free + */ + kho_test_unpreserve_data(state); + return err; + +err_free_info: + vfree(folios_info); + return err; +} + +static int kho_test_prepare_fdt(struct kho_test_state *state, ssize_t fdt_size) +{ + const char compatible[] = KHO_TEST_COMPAT; + unsigned int magic = KHO_TEST_MAGIC; + void *fdt = folio_address(state->fdt); + int err; + + err = fdt_create(fdt, fdt_size); + err |= fdt_finish_reservemap(fdt); + err |= fdt_begin_node(fdt, ""); + err |= fdt_property(fdt, "compatible", compatible, sizeof(compatible)); + err |= fdt_property(fdt, "magic", &magic, sizeof(magic)); + err |= fdt_begin_node(fdt, "data"); err |= fdt_property(fdt, "nr_folios", &state->nr_folios, sizeof(state->nr_folios)); - err |= fdt_property(fdt, "folios_info", &folios_info_phys, - sizeof(folios_info_phys)); + err |= fdt_property(fdt, "folios_info", &state->folios_info_phys, + sizeof(state->folios_info_phys)); err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum)); err |= fdt_end_node(fdt); - if (!err) - state->folios_info = no_free_ptr(folios_info); + err |= fdt_end_node(fdt); + err |= fdt_finish(fdt); return err; } -static int kho_test_prepare_fdt(struct kho_test_state *state) +static int kho_test_preserve(struct kho_test_state *state) { - const char compatible[] = KHO_TEST_COMPAT; - unsigned int magic = KHO_TEST_MAGIC; ssize_t fdt_size; - int err = 0; - void *fdt; + int err; fdt_size = state->nr_folios * sizeof(phys_addr_t) + PAGE_SIZE; state->fdt = folio_alloc(GFP_KERNEL, get_order(fdt_size)); if (!state->fdt) return -ENOMEM; - fdt = folio_address(state->fdt); + err = kho_preserve_folio(state->fdt); + if (err) + goto err_free_fdt; - err |= kho_preserve_folio(state->fdt); - err |= fdt_create(fdt, fdt_size); - err |= fdt_finish_reservemap(fdt); + err = kho_test_preserve_data(state); + if (err) + goto err_unpreserve_fdt; - err |= fdt_begin_node(fdt, ""); - err |= fdt_property(fdt, "compatible", compatible, sizeof(compatible)); - err |= fdt_property(fdt, "magic", &magic, sizeof(magic)); - err |= kho_test_save_data(state, fdt); - err |= fdt_end_node(fdt); - - err |= fdt_finish(fdt); + err = kho_test_prepare_fdt(state, fdt_size); + if (err) + goto err_unpreserve_data; err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt)); if (err) - folio_put(state->fdt); + goto err_unpreserve_data; + return 0; + +err_unpreserve_data: + kho_test_unpreserve_data(state); +err_unpreserve_fdt: + kho_unpreserve_folio(state->fdt); +err_free_fdt: + folio_put(state->fdt); return err; } @@ -174,14 +219,12 @@ static int kho_test_save(void) if (err) goto err_free_folios; - err = kho_test_prepare_fdt(state); + err = kho_test_preserve(state); if (err) goto err_free_folios; return 0; -err_free_fdt: - folio_put(state->fdt); err_free_folios: kvfree(folios); return err; From 99cd2ffac697be7f027344d77beee0dc04dcfdbd Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 1 Nov 2025 10:23:22 -0400 Subject: [PATCH 092/139] kho: don't unpreserve memory during abort KHO allows clients to preserve memory regions at any point before the KHO state is finalized. The finalization process itself involves KHO performing its own actions, such as serializing the overall preserved memory map. If this finalization process is aborted, the current implementation destroys KHO's internal memory tracking structures (`kho_out.ser.track.orders`). This behavior effectively unpreserves all memory from KHO's perspective, regardless of whether those preservations were made by clients before the finalization attempt or by KHO itself during finalization. This premature unpreservation is incorrect. An abort of the finalization process should only undo actions taken by KHO as part of that specific finalization attempt. Individual memory regions preserved by clients prior to finalization should remain preserved, as their lifecycle is managed by the clients themselves. These clients might still need to call kho_unpreserve_folio() or kho_unpreserve_phys() based on their own logic, even after a KHO finalization attempt is aborted. Link: https://lkml.kernel.org/r/20251101142325.1326536-7-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Changyuan Lyu Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: "Mike Rapoport (Microsoft)" Cc: Randy Dunlap Cc: Simon Horman Cc: Tejun Heo Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 4e033f96637d..0a4a058fbf0c 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -1119,31 +1119,12 @@ EXPORT_SYMBOL_GPL(kho_restore_vmalloc); static int __kho_abort(void) { - int err = 0; - unsigned long order; - struct kho_mem_phys *physxa; - - xa_for_each(&kho_out.track.orders, order, physxa) { - struct kho_mem_phys_bits *bits; - unsigned long phys; - - xa_for_each(&physxa->phys_bits, phys, bits) - kfree(bits); - - xa_destroy(&physxa->phys_bits); - kfree(physxa); - } - xa_destroy(&kho_out.track.orders); - if (kho_out.preserved_mem_map) { kho_mem_ser_free(kho_out.preserved_mem_map); kho_out.preserved_mem_map = NULL; } - if (err) - pr_err("Failed to abort KHO finalization: %d\n", err); - - return err; + return 0; } int kho_abort(void) From 48a1b2321d763b5edeaf20bd4576d8c4b5df772b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 1 Nov 2025 10:23:23 -0400 Subject: [PATCH 093/139] liveupdate: kho: move to kernel/liveupdate Move KHO to kernel/liveupdate/ in preparation of placing all Live Update core kernel related files to the same place. [pasha.tatashin@soleen.com: disable the menu when DEFERRED_STRUCT_PAGE_INIT] Link: https://lkml.kernel.org/r/CA+CK2bAvh9Oa2SLfsbJ8zztpEjrgr_hr-uGgF1coy8yoibT39A@mail.gmail.com Link: https://lkml.kernel.org/r/20251101142325.1326536-8-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Jason Gunthorpe Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Changyuan Lyu Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Simon Horman Cc: Tejun Heo Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- Documentation/core-api/kho/concepts.rst | 2 +- MAINTAINERS | 2 +- init/Kconfig | 2 + kernel/Kconfig.kexec | 34 ---------------- kernel/Makefile | 4 +- kernel/liveupdate/Kconfig | 40 +++++++++++++++++++ kernel/liveupdate/Makefile | 5 +++ kernel/{ => liveupdate}/kexec_handover.c | 4 +- .../{ => liveupdate}/kexec_handover_debug.c | 0 .../{ => liveupdate}/kexec_handover_debugfs.c | 0 .../kexec_handover_internal.h | 0 11 files changed, 52 insertions(+), 41 deletions(-) create mode 100644 kernel/liveupdate/Kconfig create mode 100644 kernel/liveupdate/Makefile rename kernel/{ => liveupdate}/kexec_handover.c (99%) rename kernel/{ => liveupdate}/kexec_handover_debug.c (100%) rename kernel/{ => liveupdate}/kexec_handover_debugfs.c (100%) rename kernel/{ => liveupdate}/kexec_handover_internal.h (100%) diff --git a/Documentation/core-api/kho/concepts.rst b/Documentation/core-api/kho/concepts.rst index 36d5c05cfb30..d626d1dbd678 100644 --- a/Documentation/core-api/kho/concepts.rst +++ b/Documentation/core-api/kho/concepts.rst @@ -70,5 +70,5 @@ in the FDT. That state is called the KHO finalization phase. Public API ========== -.. kernel-doc:: kernel/kexec_handover.c +.. kernel-doc:: kernel/liveupdate/kexec_handover.c :export: diff --git a/MAINTAINERS b/MAINTAINERS index a8a33db191bb..99fccc12c1f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13799,7 +13799,7 @@ S: Maintained F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* F: include/linux/kexec_handover.h -F: kernel/kexec_handover* +F: kernel/liveupdate/kexec_handover* F: lib/test_kho.c F: tools/testing/selftests/kho/ diff --git a/init/Kconfig b/init/Kconfig index 56a5dec1fdfc..5ec572cd075d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2156,6 +2156,8 @@ config TRACEPOINTS source "kernel/Kconfig.kexec" +source "kernel/liveupdate/Kconfig" + endmenu # General setup source "arch/Kconfig" diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index cc6743137946..15632358bcf7 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -94,40 +94,6 @@ config KEXEC_JUMP Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC -config KEXEC_HANDOVER - bool "kexec handover" - depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE - depends on !DEFERRED_STRUCT_PAGE_INIT - select MEMBLOCK_KHO_SCRATCH - select KEXEC_FILE - select LIBFDT - select CMA - help - Allow kexec to hand over state across kernels by generating and - passing additional metadata to the target kernel. This is useful - to keep data or state alive across the kexec. For this to work, - both source and target kernels need to have this option enabled. - -config KEXEC_HANDOVER_DEBUG - bool "Enable Kexec Handover debug checks" - depends on KEXEC_HANDOVER - help - This option enables extra sanity checks for the Kexec Handover - subsystem. Since, KHO performance is crucial in live update - scenarios and the extra code might be adding overhead it is - only optionally enabled. - -config KEXEC_HANDOVER_DEBUGFS - bool "kexec handover debugfs interface" - default KEXEC_HANDOVER - depends on KEXEC_HANDOVER - select DEBUG_FS - help - Allow to control kexec handover device tree via debugfs - interface, i.e. finalize the state or aborting the finalization. - Also, enables inspecting the KHO fdt trees with the debugfs binary - blobs. - config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index 2cf7909a74e5..e83669841b8c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -52,6 +52,7 @@ obj-y += printk/ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ +obj-y += liveupdate/ obj-y += dma/ obj-y += entry/ obj-y += unwind/ @@ -82,9 +83,6 @@ obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o -obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o -obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o -obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig new file mode 100644 index 000000000000..eae428309332 --- /dev/null +++ b/kernel/liveupdate/Kconfig @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Live Update and Kexec HandOver" + depends on !DEFERRED_STRUCT_PAGE_INIT + +config KEXEC_HANDOVER + bool "kexec handover" + depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + depends on !DEFERRED_STRUCT_PAGE_INIT + select MEMBLOCK_KHO_SCRATCH + select KEXEC_FILE + select LIBFDT + select CMA + help + Allow kexec to hand over state across kernels by generating and + passing additional metadata to the target kernel. This is useful + to keep data or state alive across the kexec. For this to work, + both source and target kernels need to have this option enabled. + +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. + +config KEXEC_HANDOVER_DEBUGFS + bool "kexec handover debugfs interface" + default KEXEC_HANDOVER + depends on KEXEC_HANDOVER + select DEBUG_FS + help + Allow to control kexec handover device tree via debugfs + interface, i.e. finalize the state or aborting the finalization. + Also, enables inspecting the KHO fdt trees with the debugfs binary + blobs. + +endmenu diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile new file mode 100644 index 000000000000..f52ce1ebcf86 --- /dev/null +++ b/kernel/liveupdate/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o diff --git a/kernel/kexec_handover.c b/kernel/liveupdate/kexec_handover.c similarity index 99% rename from kernel/kexec_handover.c rename to kernel/liveupdate/kexec_handover.c index 0a4a058fbf0c..52cd4dc23e2a 100644 --- a/kernel/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -26,8 +26,8 @@ * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. */ -#include "../mm/internal.h" -#include "kexec_internal.h" +#include "../../mm/internal.h" +#include "../kexec_internal.h" #include "kexec_handover_internal.h" #define KHO_FDT_COMPATIBLE "kho-v1" diff --git a/kernel/kexec_handover_debug.c b/kernel/liveupdate/kexec_handover_debug.c similarity index 100% rename from kernel/kexec_handover_debug.c rename to kernel/liveupdate/kexec_handover_debug.c diff --git a/kernel/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c similarity index 100% rename from kernel/kexec_handover_debugfs.c rename to kernel/liveupdate/kexec_handover_debugfs.c diff --git a/kernel/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h similarity index 100% rename from kernel/kexec_handover_internal.h rename to kernel/liveupdate/kexec_handover_internal.h From c332ebd9c0c4d409599b6d4a49705d449393c77b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 1 Nov 2025 10:23:24 -0400 Subject: [PATCH 094/139] MAINTAINERS: update KHO maintainers Changyuan does not have cycles to commit to the upstream work of KHO. Remove from KHO maintainers. Link: https://lkml.kernel.org/r/20251101142325.1326536-9-pasha.tatashin@soleen.com Signed-off-by: Changyuan Lyu Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Simon Horman Cc: Tejun Heo Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 99fccc12c1f6..b46425e3b4d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13791,7 +13791,7 @@ F: kernel/kexec* KEXEC HANDOVER (KHO) M: Alexander Graf M: Mike Rapoport -M: Changyuan Lyu +M: Pasha Tatashin R: Pratyush Yadav L: kexec@lists.infradead.org L: linux-mm@kvack.org From 8db839caeed9efc1b80f20db2a128a3dc093426f Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sat, 1 Nov 2025 10:23:25 -0400 Subject: [PATCH 095/139] liveupdate: kho: use %pe format specifier for error pointer printing Make pr_xxx() call to use the %pe format specifier instead of %d. The %pe specifier prints a symbolic error string (e.g., -ENOMEM, -EINVAL) when given an error pointer created with ERR_PTR(err). This change enhances the clarity and diagnostic value of the error message by showing a descriptive error name rather than a numeric error code. Note, that some err are still printed by value, as those errors might come from libfdt and not regular errnos. Link: https://lkml.kernel.org/r/20251101142325.1326536-10-pasha.tatashin@soleen.com Signed-off-by: Zhu Yanjun Co-developed-by: Pasha Tatashin Signed-off-by: Pasha Tatashin Reviewed-by: Simon Horman Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Changyuan Lyu Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: "Mike Rapoport (Microsoft)" Cc: Randy Dunlap Cc: Tejun Heo Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 4 ++-- kernel/liveupdate/kexec_handover_debugfs.c | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 52cd4dc23e2a..9f0913e101be 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1449,8 +1449,8 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, memblock_add(area->addr, size); err = memblock_mark_kho_scratch(area->addr, size); if (WARN_ON(err)) { - pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d", - &area->addr, &size, err); + pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe", + &area->addr, &size, ERR_PTR(err)); goto out; } pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index 46e9e6c0791f..ac739d25094d 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -150,8 +150,8 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name, phys_to_virt(*fdt_phys)); if (err) { - pr_warn("failed to add fdt %s to debugfs: %d\n", name, - err); + pr_warn("failed to add fdt %s to debugfs: %pe\n", name, + ERR_PTR(err)); continue; } } @@ -168,8 +168,10 @@ err_out: * reviving state from KHO and setting up KHO for the next * kexec. */ - if (err) - pr_err("failed exposing handover FDT in debugfs: %d\n", err); + if (err) { + pr_err("failed exposing handover FDT in debugfs: %pe\n", + ERR_PTR(err)); + } } __init int kho_out_debugfs_init(struct kho_debugfs *dbg) From 077a4851b0024e3025e57e428767ce0c7b0359bf Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 13:59:50 -0500 Subject: [PATCH 096/139] kho: fix misleading log message in kho_populate() Patch series "kho: simplify state machine and enable dynamic updates", v2. This patch series refactors the Kexec Handover subsystem to transition from a rigid, state-locked model to a dynamic, re-entrant architecture. It also introduces usability improvements. Motivation Currently, KHO relies on a strict state machine where memory preservation is locked upon finalization. If a change is required, the user must explicitly "abort" to reset the state. Additionally, the kexec image cannot be loaded until KHO is finalized, and the FDT is rebuilt from scratch on every finalization. This series simplifies this workflow to support "load early, finalize late" scenarios. Key Changes State Machine Simplification: - Removed kho_abort(). kho_finalize() is now re-entrant; calling it a second time automatically flushes the previous serialized state and generates a fresh one. - Removed kho_out.finalized checks from preservation APIs, allowing drivers to add/remove pages even after an initial finalization. - Decoupled kexec_file_load from KHO finalization. The KHO FDT physical address is now stable from boot, allowing the kexec image to be loaded before the handover metadata is finalized. FDT Management: - The FDT is now updated in-place dynamically when subtrees are added or removed, removing the need for complex reconstruction logic. - The output FDT is always exposed in debugfs (initialized and zeroed at boot), improving visibility and debugging capabilities throughout the system lifecycle. - Removed the redundant global preserved_mem_map pointer, establishing the FDT property as the single source of truth. New Features & API Enhancements: - High-Level Allocators: Introduced kho_alloc_preserve() and friends to reduce boilerplate for drivers that need to allocate, preserve, and eventually restore simple memory buffers. - Configuration: Added CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT to allow KHO to be active by default without requiring the kho=on command line parameter. Fixes: - Fixed potential alignment faults when accessing 64-bit FDT properties. - Fixed the lifecycle of the FDT folio preservation (now preserved once at init). This patch (of 13): The log message in kho_populate() currently states "Will skip init for some devices". This implies that Kexec Handover always involves skipping device initialization. However, KHO is a generic mechanism used to preserve kernel memory across reboot for various purposes, such as memfd, telemetry, or reserve_mem. Skipping device initialization is a specific property of live update drivers using KHO, not a property of the mechanism itself. Remove the misleading suffix to accurately reflect the generic nature of KHO discovery. Link: https://lkml.kernel.org/r/20251114190002.3311679-2-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 9f0913e101be..6ad45e12f53b 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1470,7 +1470,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, kho_in.fdt_phys = fdt_phys; kho_in.scratch_phys = scratch_phys; kho_scratch_cnt = scratch_cnt; - pr_info("found kexec handover data. Will skip init for some devices\n"); + pr_info("found kexec handover data.\n"); out: if (fdt) From 8c3819f627b74d44f928977413cfe55292eb809b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 13:59:51 -0500 Subject: [PATCH 097/139] kho: convert __kho_abort() to return void The internal helper __kho_abort() always returns 0 and has no failure paths. Its return value is ignored by __kho_finalize and checked needlessly by kho_abort. Change the return type to void to reflect that this function cannot fail, and simplify kho_abort by removing dead error handling code. Link: https://lkml.kernel.org/r/20251114190002.3311679-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 6ad45e12f53b..bc7f046a1313 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1117,20 +1117,16 @@ err_free_pages_array: } EXPORT_SYMBOL_GPL(kho_restore_vmalloc); -static int __kho_abort(void) +static void __kho_abort(void) { if (kho_out.preserved_mem_map) { kho_mem_ser_free(kho_out.preserved_mem_map); kho_out.preserved_mem_map = NULL; } - - return 0; } int kho_abort(void) { - int ret = 0; - if (!kho_enable) return -EOPNOTSUPP; @@ -1138,10 +1134,7 @@ int kho_abort(void) if (!kho_out.finalized) return -ENOENT; - ret = __kho_abort(); - if (ret) - return ret; - + __kho_abort(); kho_out.finalized = false; kho_debugfs_fdt_remove(&kho_out.dbg, kho_out.fdt); From 4c205677af2726bd3b51c02ab6a5a2b411efed09 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 13:59:52 -0500 Subject: [PATCH 098/139] kho: introduce high-level memory allocation API Currently, clients of KHO must manually allocate memory (e.g., via alloc_pages), calculate the page order, and explicitly call kho_preserve_folio(). Similarly, cleanup requires separate calls to unpreserve and free the memory. Introduce a high-level API to streamline this common pattern: - kho_alloc_preserve(size): Allocates physically contiguous, zeroed memory and immediately marks it for preservation. - kho_unpreserve_free(ptr): Unpreserves and frees the memory in the current kernel. - kho_restore_free(ptr): Restores the struct page state of preserved memory in the new kernel and immediately frees it to the page allocator. [pasha.tatashin@soleen.com: build fixes] Link: https://lkml.kernel.org/r/CA+CK2bBgXDhrHwTVgxrw7YTQ-0=LgW0t66CwPCgG=C85ftz4zw@mail.gmail.com Link: https://lkml.kernel.org/r/20251114190002.3311679-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 22 +++++--- kernel/liveupdate/kexec_handover.c | 87 ++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 7 deletions(-) diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 80ece4232617..dde952227b88 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -2,8 +2,9 @@ #ifndef LINUX_KEXEC_HANDOVER_H #define LINUX_KEXEC_HANDOVER_H -#include +#include #include +#include struct kho_scratch { phys_addr_t addr; @@ -48,6 +49,9 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages); int kho_unpreserve_pages(struct page *page, unsigned int nr_pages); int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation); +void *kho_alloc_preserve(size_t size); +void kho_unpreserve_free(void *mem); +void kho_restore_free(void *mem); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); @@ -101,6 +105,14 @@ static inline int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) return -EOPNOTSUPP; } +static inline void *kho_alloc_preserve(size_t size) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void kho_unpreserve_free(void *mem) { } +static inline void kho_restore_free(void *mem) { } + static inline struct folio *kho_restore_folio(phys_addr_t phys) { return NULL; @@ -122,18 +134,14 @@ static inline int kho_add_subtree(const char *name, void *fdt) return -EOPNOTSUPP; } -static inline void kho_remove_subtree(void *fdt) -{ -} +static inline void kho_remove_subtree(void *fdt) { } static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) { return -EOPNOTSUPP; } -static inline void kho_memory_init(void) -{ -} +static inline void kho_memory_init(void) { } static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index bc7f046a1313..5c5c9c46fe92 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -4,6 +4,7 @@ * Copyright (C) 2023 Alexander Graf * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport * Copyright (C) 2025 Google LLC, Changyuan Lyu + * Copyright (C) 2025 Pasha Tatashin */ #define pr_fmt(fmt) "KHO: " fmt @@ -1117,6 +1118,92 @@ err_free_pages_array: } EXPORT_SYMBOL_GPL(kho_restore_vmalloc); +/** + * kho_alloc_preserve - Allocate, zero, and preserve memory. + * @size: The number of bytes to allocate. + * + * Allocates a physically contiguous block of zeroed pages that is large + * enough to hold @size bytes. The allocated memory is then registered with + * KHO for preservation across a kexec. + * + * Note: The actual allocated size will be rounded up to the nearest + * power-of-two page boundary. + * + * @return A virtual pointer to the allocated and preserved memory on success, + * or an ERR_PTR() encoded error on failure. + */ +void *kho_alloc_preserve(size_t size) +{ + struct folio *folio; + int order, ret; + + if (!size) + return ERR_PTR(-EINVAL); + + order = get_order(size); + if (order > MAX_PAGE_ORDER) + return ERR_PTR(-E2BIG); + + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order); + if (!folio) + return ERR_PTR(-ENOMEM); + + ret = kho_preserve_folio(folio); + if (ret) { + folio_put(folio); + return ERR_PTR(ret); + } + + return folio_address(folio); +} +EXPORT_SYMBOL_GPL(kho_alloc_preserve); + +/** + * kho_unpreserve_free - Unpreserve and free memory. + * @mem: Pointer to the memory allocated by kho_alloc_preserve(). + * + * Unregisters the memory from KHO preservation and frees the underlying + * pages back to the system. This function should be called to clean up + * memory allocated with kho_alloc_preserve(). + */ +void kho_unpreserve_free(void *mem) +{ + struct folio *folio; + + if (!mem) + return; + + folio = virt_to_folio(mem); + WARN_ON_ONCE(kho_unpreserve_folio(folio)); + folio_put(folio); +} +EXPORT_SYMBOL_GPL(kho_unpreserve_free); + +/** + * kho_restore_free - Restore and free memory after kexec. + * @mem: Pointer to the memory (in the new kernel's address space) + * that was allocated by the old kernel. + * + * This function is intended to be called in the new kernel (post-kexec) + * to take ownership of and free a memory region that was preserved by the + * old kernel using kho_alloc_preserve(). + * + * It first restores the pages from KHO (using their physical address) + * and then frees the pages back to the new kernel's page allocator. + */ +void kho_restore_free(void *mem) +{ + struct folio *folio; + + if (!mem) + return; + + folio = kho_restore_folio(__pa(mem)); + if (!WARN_ON(!folio)) + folio_put(folio); +} +EXPORT_SYMBOL_GPL(kho_restore_free); + static void __kho_abort(void) { if (kho_out.preserved_mem_map) { From 85de0090bd8256a94812f3be797b55bdbdcf78f5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 13:59:53 -0500 Subject: [PATCH 099/139] kho: preserve FDT folio only once during initialization Currently, the FDT folio is preserved inside __kho_finalize(). If the user performs multiple finalize/abort cycles, kho_preserve_folio() is called repeatedly for the same FDT folio. Since the FDT folio is allocated once during kho_init(), it should be marked for preservation at the same time. Move the preservation call to kho_init() to align the preservation state with the object's lifecycle and simplify the finalize path. Also, pre-zero the FDT tree so we do not expose random bits to the user and to the next kernel by using the new kho_alloc_preserve() api. Link: https://lkml.kernel.org/r/20251114190002.3311679-5-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 5c5c9c46fe92..704e91418214 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1251,10 +1251,6 @@ static int __kho_finalize(void) if (err) goto abort; - err = kho_preserve_folio(virt_to_folio(kho_out.fdt)); - if (err) - goto abort; - err = kho_mem_serialize(&kho_out); if (err) goto abort; @@ -1384,19 +1380,17 @@ EXPORT_SYMBOL_GPL(kho_retrieve_subtree); static __init int kho_init(void) { - int err = 0; const void *fdt = kho_get_fdt(); - struct page *fdt_page; + int err = 0; if (!kho_enable) return 0; - fdt_page = alloc_page(GFP_KERNEL); - if (!fdt_page) { - err = -ENOMEM; + kho_out.fdt = kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(kho_out.fdt)) { + err = PTR_ERR(kho_out.fdt); goto err_free_scratch; } - kho_out.fdt = page_to_virt(fdt_page); err = kho_debugfs_init(); if (err) @@ -1424,9 +1418,9 @@ static __init int kho_init(void) return 0; err_free_fdt: - put_page(fdt_page); - kho_out.fdt = NULL; + kho_unpreserve_free(kho_out.fdt); err_free_scratch: + kho_out.fdt = NULL; for (int i = 0; i < kho_scratch_cnt; i++) { void *start = __va(kho_scratch[i].addr); void *end = start + kho_scratch[i].size; From 53f8f064eba344c074ef6755347bc4170538275f Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 13:59:54 -0500 Subject: [PATCH 100/139] kho: verify deserialization status and fix FDT alignment access During boot, kho_restore_folio() relies on the memory map having been successfully deserialized. If deserialization fails or no map is present, attempting to restore the FDT folio is unsafe. Update kho_mem_deserialize() to return a boolean indicating success. Use this return value in kho_memory_init() to disable KHO if deserialization fails. Also, the incoming FDT folio is never used, there is no reason to restore it. Additionally, use get_unaligned() to retrieve the memory map pointer from the FDT. FDT properties are not guaranteed to be naturally aligned, and accessing a 64-bit value via a pointer that is only 32-bit aligned can cause faults. Link: https://lkml.kernel.org/r/20251114190002.3311679-6-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 32 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 704e91418214..bed611bae1df 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -451,20 +452,27 @@ static void __init deserialize_bitmap(unsigned int order, } } -static void __init kho_mem_deserialize(const void *fdt) +/* Return true if memory was deserizlied */ +static bool __init kho_mem_deserialize(const void *fdt) { struct khoser_mem_chunk *chunk; - const phys_addr_t *mem; + const void *mem_ptr; + u64 mem; int len; - mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); - - if (!mem || len != sizeof(*mem)) { + mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + if (!mem_ptr || len != sizeof(u64)) { pr_err("failed to get preserved memory bitmaps\n"); - return; + return false; } - chunk = *mem ? phys_to_virt(*mem) : NULL; + mem = get_unaligned((const u64 *)mem_ptr); + chunk = mem ? phys_to_virt(mem) : NULL; + + /* No preserved physical pages were passed, no deserialization */ + if (!chunk) + return false; + while (chunk) { unsigned int i; @@ -473,6 +481,8 @@ static void __init kho_mem_deserialize(const void *fdt) &chunk->bitmaps[i]); chunk = KHOSER_LOAD_PTR(chunk->hdr.next); } + + return true; } /* @@ -1458,16 +1468,12 @@ static void __init kho_release_scratch(void) void __init kho_memory_init(void) { - struct folio *folio; - if (kho_in.scratch_phys) { kho_scratch = phys_to_virt(kho_in.scratch_phys); kho_release_scratch(); - kho_mem_deserialize(kho_get_fdt()); - folio = kho_restore_folio(kho_in.fdt_phys); - if (!folio) - pr_warn("failed to restore folio for KHO fdt\n"); + if (!kho_mem_deserialize(kho_get_fdt())) + kho_in.fdt_phys = 0; } else { kho_reserve_scratch(); } From e268689a528288d5629ee017630186327403cc51 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 13:59:55 -0500 Subject: [PATCH 101/139] kho: always expose output FDT in debugfs Currently, the output FDT is added to debugfs only when KHO is finalized and removed when aborted. There is no need to hide the FDT based on the state. Always expose it starting from initialization. This aids the transition toward removing the explicit abort functionality and converting KHO to be fully stateless. Link: https://lkml.kernel.org/r/20251114190002.3311679-7-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index bed611bae1df..3e32c61a64b1 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1234,8 +1234,6 @@ int kho_abort(void) __kho_abort(); kho_out.finalized = false; - kho_debugfs_fdt_remove(&kho_out.dbg, kho_out.fdt); - return 0; } @@ -1306,9 +1304,6 @@ int kho_finalize(void) kho_out.finalized = true; - WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", - kho_out.fdt, true)); - return 0; } @@ -1425,6 +1420,9 @@ static __init int kho_init(void) init_cma_reserved_pageblock(pfn_to_page(pfn)); } + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", + kho_out.fdt, true)); + return 0; err_free_fdt: From 71960fe1344c432f8d67f6f9b379533496b89f8c Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 13:59:56 -0500 Subject: [PATCH 102/139] kho: simplify serialization and remove __kho_abort Currently, __kho_finalize() performs memory serialization in the middle of FDT construction. If FDT construction fails later, the function must manually clean up the serialized memory via __kho_abort(). Refactor __kho_finalize() to perform kho_mem_serialize() only after the FDT has been successfully constructed and finished. This reordering has two benefits: 1. It avoids expensive serialization work if FDT generation fails. 2. It removes the need for cleanup in the FDT error path. As a result, the internal helper __kho_abort() is no longer needed for internal error handling. Inline its remaining logic (cleanup of the preserved memory map) directly into kho_abort() and remove the helper. Link: https://lkml.kernel.org/r/20251114190002.3311679-8-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 41 +++++++++++++----------------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 3e32c61a64b1..297136054f75 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1214,14 +1214,6 @@ void kho_restore_free(void *mem) } EXPORT_SYMBOL_GPL(kho_restore_free); -static void __kho_abort(void) -{ - if (kho_out.preserved_mem_map) { - kho_mem_ser_free(kho_out.preserved_mem_map); - kho_out.preserved_mem_map = NULL; - } -} - int kho_abort(void) { if (!kho_enable) @@ -1231,7 +1223,8 @@ int kho_abort(void) if (!kho_out.finalized) return -ENOENT; - __kho_abort(); + kho_mem_ser_free(kho_out.preserved_mem_map); + kho_out.preserved_mem_map = NULL; kho_out.finalized = false; return 0; @@ -1239,12 +1232,12 @@ int kho_abort(void) static int __kho_finalize(void) { - int err = 0; - u64 *preserved_mem_map; void *root = kho_out.fdt; struct kho_sub_fdt *fdt; + u64 *preserved_mem_map; + int err; - err |= fdt_create(root, PAGE_SIZE); + err = fdt_create(root, PAGE_SIZE); err |= fdt_finish_reservemap(root); err |= fdt_begin_node(root, ""); err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); @@ -1257,13 +1250,7 @@ static int __kho_finalize(void) sizeof(*preserved_mem_map), (void **)&preserved_mem_map); if (err) - goto abort; - - err = kho_mem_serialize(&kho_out); - if (err) - goto abort; - - *preserved_mem_map = (u64)virt_to_phys(kho_out.preserved_mem_map); + goto err_exit; mutex_lock(&kho_out.fdts_lock); list_for_each_entry(fdt, &kho_out.sub_fdts, l) { @@ -1277,13 +1264,19 @@ static int __kho_finalize(void) err |= fdt_end_node(root); err |= fdt_finish(root); + if (err) + goto err_exit; -abort: - if (err) { - pr_err("Failed to convert KHO state tree: %d\n", err); - __kho_abort(); - } + err = kho_mem_serialize(&kho_out); + if (err) + goto err_exit; + *preserved_mem_map = (u64)virt_to_phys(kho_out.preserved_mem_map); + + return 0; + +err_exit: + pr_err("Failed to convert KHO state tree: %d\n", err); return err; } From efa3a9775ac2a869788ac61fccad1efd26cd2d33 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 13:59:57 -0500 Subject: [PATCH 103/139] kho: remove global preserved_mem_map and store state in FDT Currently, the serialized memory map is tracked via kho_out.preserved_mem_map and copied to the FDT during finalization. This double tracking is redundant. Remove preserved_mem_map from kho_out. Instead, maintain the physical address of the head chunk directly in the preserved-memory-map FDT property. Introduce kho_update_memory_map() to manage this property. This function handles: 1. Retrieving and freeing any existing serialized map (handling the abort/retry case). 2. Updating the FDT property with the new chunk address. This establishes the FDT as the single source of truth for the handover state. Link: https://lkml.kernel.org/r/20251114190002.3311679-9-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 43 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 297136054f75..63800f63551f 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -119,9 +119,6 @@ struct kho_out { struct mutex fdts_lock; struct kho_mem_track track; - /* First chunk of serialized preserved memory map */ - struct khoser_mem_chunk *preserved_mem_map; - struct kho_debugfs dbg; }; @@ -382,6 +379,27 @@ static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) } } +/* + * Update memory map property, if old one is found discard it via + * kho_mem_ser_free(). + */ +static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk) +{ + void *ptr; + u64 phys; + + ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL); + + /* Check and discard previous memory map */ + phys = get_unaligned((u64 *)ptr); + if (phys) + kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys)); + + /* Update with the new value */ + phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0; + put_unaligned(phys, (u64 *)ptr); +} + static int kho_mem_serialize(struct kho_out *kho_out) { struct khoser_mem_chunk *first_chunk = NULL; @@ -422,7 +440,7 @@ static int kho_mem_serialize(struct kho_out *kho_out) } } - kho_out->preserved_mem_map = first_chunk; + kho_update_memory_map(first_chunk); return 0; @@ -1223,8 +1241,7 @@ int kho_abort(void) if (!kho_out.finalized) return -ENOENT; - kho_mem_ser_free(kho_out.preserved_mem_map); - kho_out.preserved_mem_map = NULL; + kho_update_memory_map(NULL); kho_out.finalized = false; return 0; @@ -1234,21 +1251,15 @@ static int __kho_finalize(void) { void *root = kho_out.fdt; struct kho_sub_fdt *fdt; - u64 *preserved_mem_map; + u64 empty_mem_map = 0; int err; err = fdt_create(root, PAGE_SIZE); err |= fdt_finish_reservemap(root); err |= fdt_begin_node(root, ""); err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); - /** - * Reserve the preserved-memory-map property in the root FDT, so - * that all property definitions will precede subnodes created by - * KHO callers. - */ - err |= fdt_property_placeholder(root, PROP_PRESERVED_MEMORY_MAP, - sizeof(*preserved_mem_map), - (void **)&preserved_mem_map); + err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, + sizeof(empty_mem_map)); if (err) goto err_exit; @@ -1271,8 +1282,6 @@ static int __kho_finalize(void) if (err) goto err_exit; - *preserved_mem_map = (u64)virt_to_phys(kho_out.preserved_mem_map); - return 0; err_exit: From 9a4301f715c8c9a3aaab395b98ac1f3908015dcb Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 13:59:58 -0500 Subject: [PATCH 104/139] kho: remove abort functionality and support state refresh Previously, KHO required a dedicated kho_abort() function to clean up state before kho_finalize() could be called again. This was necessary to handle complex unwind paths when using notifiers. With the shift to direct memory preservation, the explicit abort step is no longer strictly necessary. Remove kho_abort() and refactor kho_finalize() to handle re-entry. If kho_finalize() is called while KHO is already finalized, it will now automatically clean up the previous memory map and state before generating a new one. This allows the KHO state to be updated/refreshed simply by triggering finalize again. Update debugfs to return -EINVAL if userspace attempts to write 0 to the finalize attribute, as explicit abort is no longer supported. Link: https://lkml.kernel.org/r/20251114190002.3311679-10-pasha.tatashin@soleen.com Suggested-by: Mike Rapoport (Microsoft) Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 21 ++++----------------- kernel/liveupdate/kexec_handover_debugfs.c | 2 +- kernel/liveupdate/kexec_handover_internal.h | 1 - 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 63800f63551f..624fd648d21f 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1232,21 +1232,6 @@ void kho_restore_free(void *mem) } EXPORT_SYMBOL_GPL(kho_restore_free); -int kho_abort(void) -{ - if (!kho_enable) - return -EOPNOTSUPP; - - guard(mutex)(&kho_out.lock); - if (!kho_out.finalized) - return -ENOENT; - - kho_update_memory_map(NULL); - kho_out.finalized = false; - - return 0; -} - static int __kho_finalize(void) { void *root = kho_out.fdt; @@ -1297,8 +1282,10 @@ int kho_finalize(void) return -EOPNOTSUPP; guard(mutex)(&kho_out.lock); - if (kho_out.finalized) - return -EEXIST; + if (kho_out.finalized) { + kho_update_memory_map(NULL); + kho_out.finalized = false; + } ret = __kho_finalize(); if (ret) diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index ac739d25094d..2abbf62ba942 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -87,7 +87,7 @@ static int kho_out_finalize_set(void *data, u64 val) if (val) return kho_finalize(); else - return kho_abort(); + return -EINVAL; } DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get, diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h index 52ed73659fe6..0202c85ad14f 100644 --- a/kernel/liveupdate/kexec_handover_internal.h +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -24,7 +24,6 @@ extern unsigned int kho_scratch_cnt; bool kho_finalized(void); int kho_finalize(void); -int kho_abort(void); #ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS int kho_debugfs_init(void); From 8e068a286aef7e772ec6e8bb4b82e8a5d4153b55 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 13:59:59 -0500 Subject: [PATCH 105/139] kho: update FDT dynamically for subtree addition/removal Currently, sub-FDTs were tracked in a list (kho_out.sub_fdts) and the final FDT is constructed entirely from scratch during kho_finalize(). We can maintain the FDT dynamically: 1. Initialize a valid, empty FDT in kho_init(). 2. Use fdt_add_subnode and fdt_setprop in kho_add_subtree to update the FDT immediately when a subsystem registers. 3. Use fdt_del_node in kho_remove_subtree to remove entries. This removes the need for the intermediate sub_fdts list and the reconstruction logic in kho_finalize(). kho_finalize() now only needs to trigger memory map serialization. Link: https://lkml.kernel.org/r/20251114190002.3311679-11-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 144 ++++++++++++++--------------- 1 file changed, 69 insertions(+), 75 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 624fd648d21f..461d96084c12 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -104,20 +104,11 @@ struct kho_mem_track { struct khoser_mem_chunk; -struct kho_sub_fdt { - struct list_head l; - const char *name; - void *fdt; -}; - struct kho_out { void *fdt; bool finalized; struct mutex lock; /* protects KHO FDT finalization */ - struct list_head sub_fdts; - struct mutex fdts_lock; - struct kho_mem_track track; struct kho_debugfs dbg; }; @@ -127,8 +118,6 @@ static struct kho_out kho_out = { .track = { .orders = XARRAY_INIT(kho_out.track.orders, 0), }, - .sub_fdts = LIST_HEAD_INIT(kho_out.sub_fdts), - .fdts_lock = __MUTEX_INITIALIZER(kho_out.fdts_lock), .finalized = false, }; @@ -725,37 +714,67 @@ err_disable_kho: */ int kho_add_subtree(const char *name, void *fdt) { - struct kho_sub_fdt *sub_fdt; + phys_addr_t phys = virt_to_phys(fdt); + void *root_fdt = kho_out.fdt; + int err = -ENOMEM; + int off, fdt_err; - sub_fdt = kmalloc(sizeof(*sub_fdt), GFP_KERNEL); - if (!sub_fdt) - return -ENOMEM; + guard(mutex)(&kho_out.lock); - INIT_LIST_HEAD(&sub_fdt->l); - sub_fdt->name = name; - sub_fdt->fdt = fdt; + fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE); + if (fdt_err < 0) + return err; + + off = fdt_add_subnode(root_fdt, 0, name); + if (off < 0) { + if (off == -FDT_ERR_EXISTS) + err = -EEXIST; + goto out_pack; + } + + err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys)); + if (err < 0) + goto out_pack; - guard(mutex)(&kho_out.fdts_lock); - list_add_tail(&sub_fdt->l, &kho_out.sub_fdts); WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false)); - return 0; +out_pack: + fdt_pack(root_fdt); + + return err; } EXPORT_SYMBOL_GPL(kho_add_subtree); void kho_remove_subtree(void *fdt) { - struct kho_sub_fdt *sub_fdt; + phys_addr_t target_phys = virt_to_phys(fdt); + void *root_fdt = kho_out.fdt; + int off; + int err; - guard(mutex)(&kho_out.fdts_lock); - list_for_each_entry(sub_fdt, &kho_out.sub_fdts, l) { - if (sub_fdt->fdt == fdt) { - list_del(&sub_fdt->l); - kfree(sub_fdt); + guard(mutex)(&kho_out.lock); + + err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE); + if (err < 0) + return; + + for (off = fdt_first_subnode(root_fdt, 0); off >= 0; + off = fdt_next_subnode(root_fdt, off)) { + const u64 *val; + int len; + + val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len); + if (!val || len != sizeof(phys_addr_t)) + continue; + + if ((phys_addr_t)*val == target_phys) { + fdt_del_node(root_fdt, off); kho_debugfs_fdt_remove(&kho_out.dbg, fdt); break; } } + + fdt_pack(root_fdt); } EXPORT_SYMBOL_GPL(kho_remove_subtree); @@ -1232,48 +1251,6 @@ void kho_restore_free(void *mem) } EXPORT_SYMBOL_GPL(kho_restore_free); -static int __kho_finalize(void) -{ - void *root = kho_out.fdt; - struct kho_sub_fdt *fdt; - u64 empty_mem_map = 0; - int err; - - err = fdt_create(root, PAGE_SIZE); - err |= fdt_finish_reservemap(root); - err |= fdt_begin_node(root, ""); - err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); - err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, - sizeof(empty_mem_map)); - if (err) - goto err_exit; - - mutex_lock(&kho_out.fdts_lock); - list_for_each_entry(fdt, &kho_out.sub_fdts, l) { - phys_addr_t phys = virt_to_phys(fdt->fdt); - - err |= fdt_begin_node(root, fdt->name); - err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); - err |= fdt_end_node(root); - } - mutex_unlock(&kho_out.fdts_lock); - - err |= fdt_end_node(root); - err |= fdt_finish(root); - if (err) - goto err_exit; - - err = kho_mem_serialize(&kho_out); - if (err) - goto err_exit; - - return 0; - -err_exit: - pr_err("Failed to convert KHO state tree: %d\n", err); - return err; -} - int kho_finalize(void) { int ret; @@ -1282,12 +1259,7 @@ int kho_finalize(void) return -EOPNOTSUPP; guard(mutex)(&kho_out.lock); - if (kho_out.finalized) { - kho_update_memory_map(NULL); - kho_out.finalized = false; - } - - ret = __kho_finalize(); + ret = kho_mem_serialize(&kho_out); if (ret) return ret; @@ -1372,6 +1344,24 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); +static __init int kho_out_fdt_setup(void) +{ + void *root = kho_out.fdt; + u64 empty_mem_map = 0; + int err; + + err = fdt_create(root, PAGE_SIZE); + err |= fdt_finish_reservemap(root); + err |= fdt_begin_node(root, ""); + err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); + err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, + sizeof(empty_mem_map)); + err |= fdt_end_node(root); + err |= fdt_finish(root); + + return err; +} + static __init int kho_init(void) { const void *fdt = kho_get_fdt(); @@ -1394,6 +1384,10 @@ static __init int kho_init(void) if (err) goto err_free_fdt; + err = kho_out_fdt_setup(); + if (err) + goto err_free_fdt; + if (fdt) { kho_in_debugfs_init(&kho_in.dbg, fdt); return 0; From d7255959b69a4e727c61eb04231d11390d4f391e Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 14:00:00 -0500 Subject: [PATCH 106/139] kho: allow kexec load before KHO finalization Currently, kho_fill_kimage() checks kho_out.finalized and returns early if KHO is not yet finalized. This enforces a strict ordering where userspace must finalize KHO *before* loading the kexec image. This is restrictive, as standard workflows often involve loading the target kernel early in the lifecycle and finalizing the state (FDT) only immediately before the reboot. Since the KHO FDT resides at a physical address allocated during boot (kho_init), its location is stable. We can attach this stable address to the kimage regardless of whether the content has been finalized yet. Relax the check to only require kho_enable, allowing kexec_file_load to proceed at any time. Link: https://lkml.kernel.org/r/20251114190002.3311679-12-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 461d96084c12..4596e67de832 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1550,7 +1550,7 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_out.finalized) + if (!kho_enable) return 0; image->kho.fdt = virt_to_phys(kho_out.fdt); From de51999e687c70a41997124b43291f84324c7924 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 14:00:01 -0500 Subject: [PATCH 107/139] kho: allow memory preservation state updates after finalization Currently, kho_preserve_* and kho_unpreserve_* return -EBUSY if KHO is finalized. This enforces a rigid "freeze" on the KHO memory state. With the introduction of re-entrant finalization, this restriction is no longer necessary. Users should be allowed to modify the preservation set (e.g., adding new pages or freeing old ones) even after an initial finalization. The intended workflow for updates is now: 1. Modify state (preserve/unpreserve). 2. Call kho_finalize() again to refresh the serialized metadata. Remove the kho_out.finalized checks to enable this dynamic behavior. This also allows to convert kho_unpreserve_* functions to void, as they do not return any error anymore. Link: https://lkml.kernel.org/r/20251114190002.3311679-13-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 21 ++++-------- kernel/liveupdate/kexec_handover.c | 55 +++++++----------------------- 2 files changed, 19 insertions(+), 57 deletions(-) diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index dde952227b88..5f7b9de97e8d 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -44,11 +44,11 @@ bool kho_is_enabled(void); bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); -int kho_unpreserve_folio(struct folio *folio); +void kho_unpreserve_folio(struct folio *folio); int kho_preserve_pages(struct page *page, unsigned int nr_pages); -int kho_unpreserve_pages(struct page *page, unsigned int nr_pages); +void kho_unpreserve_pages(struct page *page, unsigned int nr_pages); int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); -int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation); +void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation); void *kho_alloc_preserve(size_t size); void kho_unpreserve_free(void *mem); void kho_restore_free(void *mem); @@ -79,20 +79,14 @@ static inline int kho_preserve_folio(struct folio *folio) return -EOPNOTSUPP; } -static inline int kho_unpreserve_folio(struct folio *folio) -{ - return -EOPNOTSUPP; -} +static inline void kho_unpreserve_folio(struct folio *folio) { } static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages) { return -EOPNOTSUPP; } -static inline int kho_unpreserve_pages(struct page *page, unsigned int nr_pages) -{ - return -EOPNOTSUPP; -} +static inline void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) { } static inline int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) @@ -100,10 +94,7 @@ static inline int kho_preserve_vmalloc(void *ptr, return -EOPNOTSUPP; } -static inline int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) -{ - return -EOPNOTSUPP; -} +static inline void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) { } static inline void *kho_alloc_preserve(size_t size) { diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 4596e67de832..a7f876ece445 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -185,10 +185,6 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, const unsigned long pfn_high = pfn >> order; might_sleep(); - - if (kho_out.finalized) - return -EBUSY; - physxa = xa_load(&track->orders, order); if (!physxa) { int err; @@ -807,20 +803,14 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio); * Instructs KHO to unpreserve a folio that was preserved by * kho_preserve_folio() before. The provided @folio (pfn and order) * must exactly match a previously preserved folio. - * - * Return: 0 on success, error code on failure */ -int kho_unpreserve_folio(struct folio *folio) +void kho_unpreserve_folio(struct folio *folio) { const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); struct kho_mem_track *track = &kho_out.track; - if (kho_out.finalized) - return -EBUSY; - __kho_unpreserve_order(track, pfn, order); - return 0; } EXPORT_SYMBOL_GPL(kho_unpreserve_folio); @@ -877,21 +867,14 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages); * This must be called with the same @page and @nr_pages as the corresponding * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger * preserved blocks is not supported. - * - * Return: 0 on success, error code on failure */ -int kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) { struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; - if (kho_out.finalized) - return -EBUSY; - __kho_unpreserve(track, start_pfn, end_pfn); - - return 0; } EXPORT_SYMBOL_GPL(kho_unpreserve_pages); @@ -976,20 +959,6 @@ static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, } } -static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) -{ - struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first); - - while (chunk) { - struct kho_vmalloc_chunk *tmp = chunk; - - kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order); - - chunk = KHOSER_LOAD_PTR(chunk->hdr.next); - free_page((unsigned long)tmp); - } -} - /** * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec * @ptr: pointer to the area in vmalloc address space @@ -1051,7 +1020,7 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) return 0; err_free: - kho_vmalloc_free_chunks(preservation); + kho_unpreserve_vmalloc(preservation); return err; } EXPORT_SYMBOL_GPL(kho_preserve_vmalloc); @@ -1062,17 +1031,19 @@ EXPORT_SYMBOL_GPL(kho_preserve_vmalloc); * * Instructs KHO to unpreserve the area in vmalloc address space that was * previously preserved with kho_preserve_vmalloc(). - * - * Return: 0 on success, error code on failure */ -int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) +void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) { - if (kho_out.finalized) - return -EBUSY; + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); - kho_vmalloc_free_chunks(preservation); + while (chunk) { + struct kho_vmalloc_chunk *tmp = chunk; - return 0; + kho_vmalloc_unpreserve_chunk(chunk, preservation->order); + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + free_page((unsigned long)tmp); + } } EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc); @@ -1221,7 +1192,7 @@ void kho_unpreserve_free(void *mem) return; folio = virt_to_folio(mem); - WARN_ON_ONCE(kho_unpreserve_folio(folio)); + kho_unpreserve_folio(folio); folio_put(folio); } EXPORT_SYMBOL_GPL(kho_unpreserve_free); From 7bd3643f94a357863beef646b85cf10292398629 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 14:00:02 -0500 Subject: [PATCH 108/139] kho: add Kconfig option to enable KHO by default Currently, Kexec Handover must be explicitly enabled via the kernel command line parameter `kho=on`. For workloads that rely on KHO as a foundational requirement (such as the upcoming Live Update Orchestrator), requiring an explicit boot parameter adds redundant configuration steps. Introduce CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT. When selected, KHO defaults to enabled. This is equivalent to passing kho=on at boot. The behavior can still be disabled at runtime by passing kho=off. Link: https://lkml.kernel.org/r/20251114190002.3311679-14-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/liveupdate/Kconfig | 14 ++++++++++++++ kernel/liveupdate/kexec_handover.c | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig index eae428309332..a973a54447de 100644 --- a/kernel/liveupdate/Kconfig +++ b/kernel/liveupdate/Kconfig @@ -37,4 +37,18 @@ config KEXEC_HANDOVER_DEBUGFS Also, enables inspecting the KHO fdt trees with the debugfs binary blobs. +config KEXEC_HANDOVER_ENABLE_DEFAULT + bool "Enable kexec handover by default" + depends on KEXEC_HANDOVER + help + Enable Kexec Handover by default. This avoids the need to + explicitly pass 'kho=on' on the kernel command line. + + This is useful for systems where KHO is a prerequisite for other + features, such as Live Update, ensuring the mechanism is always + active. + + The default behavior can still be overridden at boot time by + passing 'kho=off'. + endmenu diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index a7f876ece445..224bdf5becb6 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -52,7 +52,7 @@ union kho_page_info { static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private)); -static bool kho_enable __ro_after_init; +static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT); bool kho_is_enabled(void) { From 9e2fd062fa1713a33380cc97ef324d086dd45ba5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:31 -0500 Subject: [PATCH 109/139] liveupdate: luo_core: Live Update Orchestrator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Live Update Orchestrator", v8. This series introduces the Live Update Orchestrator, a kernel subsystem designed to facilitate live kernel updates using a kexec-based reboot. This capability is critical for cloud environments, allowing hypervisors to be updated with minimal downtime for running virtual machines. LUO achieves this by preserving the state of selected resources, such as memory, devices and their dependencies, across the kernel transition. As a key feature, this series includes support for preserving memfd file descriptors, which allows critical in-memory data, such as guest RAM or any other large memory region, to be maintained in RAM across the kexec reboot. The other series that use LUO, are VFIO [1], IOMMU [2], and PCI [3] preservations. Github repo of this series [4]. The core of LUO is a framework for managing the lifecycle of preserved resources through a userspace-driven interface. Key features include: - Session Management Userspace agent (i.e. luod [5]) creates named sessions, each represented by a file descriptor (via centralized agent that controls /dev/liveupdate). The lifecycle of all preserved resources within a session is tied to this FD, ensuring automatic kernel cleanup if the controlling userspace agent crashes or exits unexpectedly. - File Preservation A handler-based framework allows specific file types (demonstrated here with memfd) to be preserved. Handlers manage the serialization, restoration, and lifecycle of their specific file types. - File-Lifecycle-Bound State A new mechanism for managing shared global state whose lifecycle is tied to the preservation of one or more files. This is crucial for subsystems like IOMMU or HugeTLB, where multiple file descriptors may depend on a single, shared underlying resource that must be preserved only once. - KHO Integration LUO drives the Kexec Handover framework programmatically to pass its serialized metadata to the next kernel. The LUO state is finalized and added to the kexec image just before the reboot is triggered. In the future this step will also be removed once stateless KHO is merged [6]. - Userspace Interface Control is provided via ioctl commands on /dev/liveupdate for creating and retrieving sessions, as well as on session file descriptors for managing individual files. - Testing The series includes a set of selftests, including userspace API validation, kexec-based lifecycle tests for various session and file scenarios, and a new in-kernel test module to validate the FLB logic. Introduce LUO, a mechanism intended to facilitate kernel updates while keeping designated devices operational across the transition (e.g., via kexec). The primary use case is updating hypervisors with minimal disruption to running virtual machines. For userspace side of hypervisor update we have copyless migration. LUO is for updating the kernel. This initial patch lays the groundwork for the LUO subsystem. Further functionality, including the implementation of state transition logic, integration with KHO, and hooks for subsystems and file descriptors, will be added in subsequent patches. Create a character device at /dev/liveupdate. A new uAPI header, , will define the necessary structures. The magic number for IOCTL is registered in Documentation/userspace-api/ioctl/ioctl-number.rst. Link: https://lkml.kernel.org/r/20251125165850.3389713-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20251125165850.3389713-2-pasha.tatashin@soleen.com Link: https://lore.kernel.org/all/20251018000713.677779-1-vipinsh@google.com/ [1] Link: https://lore.kernel.org/linux-iommu/20250928190624.3735830-1-skhawaja@google.com [2] Link: https://lore.kernel.org/linux-pci/20250916-luo-pci-v2-0-c494053c3c08@kernel.org [3] Link: https://github.com/googleprodkernel/linux-liveupdate/tree/luo/v8 [4] Link: https://tinyurl.com/luoddesign [5] Link: https://lore.kernel.org/all/20251020100306.2709352-1-jasonmiu@google.com [6] Link: https://lore.kernel.org/all/20251115233409.768044-1-pasha.tatashin@soleen.com [7] Link: https://github.com/soleen/linux/blob/luo/v8b03/diff.v7.v8 [8] Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zijun Hu Cc: Pratyush Yadav Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- .../userspace-api/ioctl/ioctl-number.rst | 2 + include/linux/liveupdate.h | 35 ++++++ include/uapi/linux/liveupdate.h | 46 ++++++++ kernel/liveupdate/Kconfig | 21 ++++ kernel/liveupdate/Makefile | 5 + kernel/liveupdate/luo_core.c | 111 ++++++++++++++++++ 6 files changed, 220 insertions(+) create mode 100644 include/linux/liveupdate.h create mode 100644 include/uapi/linux/liveupdate.h create mode 100644 kernel/liveupdate/luo_core.c diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 7c527a01d1cf..7232b3544cec 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -385,6 +385,8 @@ Code Seq# Include File Comments 0xB8 01-02 uapi/misc/mrvl_cn10k_dpi.h Marvell CN10K DPI driver 0xB8 all uapi/linux/mshv.h Microsoft Hyper-V /dev/mshv driver +0xBA 00-0F uapi/linux/liveupdate.h Pasha Tatashin + 0xC0 00-0F linux/usb/iowarrior.h 0xCA 00-0F uapi/misc/cxl.h Dead since 6.15 0xCA 10-2F uapi/misc/ocxl.h diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h new file mode 100644 index 000000000000..c6a1d6bd90cb --- /dev/null +++ b/include/linux/liveupdate.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ +#ifndef _LINUX_LIVEUPDATE_H +#define _LINUX_LIVEUPDATE_H + +#include +#include +#include + +#ifdef CONFIG_LIVEUPDATE + +/* Return true if live update orchestrator is enabled */ +bool liveupdate_enabled(void); + +/* Called during kexec to tell LUO that entered into reboot */ +int liveupdate_reboot(void); + +#else /* CONFIG_LIVEUPDATE */ + +static inline bool liveupdate_enabled(void) +{ + return false; +} + +static inline int liveupdate_reboot(void) +{ + return 0; +} + +#endif /* CONFIG_LIVEUPDATE */ +#endif /* _LINUX_LIVEUPDATE_H */ diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h new file mode 100644 index 000000000000..df34c1642c4d --- /dev/null +++ b/include/uapi/linux/liveupdate.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +/* + * Userspace interface for /dev/liveupdate + * Live Update Orchestrator + * + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ + +#ifndef _UAPI_LIVEUPDATE_H +#define _UAPI_LIVEUPDATE_H + +#include +#include + +/** + * DOC: General ioctl format + * + * The ioctl interface follows a general format to allow for extensibility. Each + * ioctl is passed in a structure pointer as the argument providing the size of + * the structure in the first u32. The kernel checks that any structure space + * beyond what it understands is 0. This allows userspace to use the backward + * compatible portion while consistently using the newer, larger, structures. + * + * ioctls use a standard meaning for common errnos: + * + * - ENOTTY: The IOCTL number itself is not supported at all + * - E2BIG: The IOCTL number is supported, but the provided structure has + * non-zero in a part the kernel does not understand. + * - EOPNOTSUPP: The IOCTL number is supported, and the structure is + * understood, however a known field has a value the kernel does not + * understand or support. + * - EINVAL: Everything about the IOCTL was understood, but a field is not + * correct. + * - ENOENT: A provided token does not exist. + * - ENOMEM: Out of memory. + * - EOVERFLOW: Mathematics overflowed. + * + * As well as additional errnos, within specific ioctls. + */ + +/* The ioctl type, documented in ioctl-number.rst */ +#define LIVEUPDATE_IOCTL_TYPE 0xBA + +#endif /* _UAPI_LIVEUPDATE_H */ diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig index a973a54447de..9b2515f31afb 100644 --- a/kernel/liveupdate/Kconfig +++ b/kernel/liveupdate/Kconfig @@ -51,4 +51,25 @@ config KEXEC_HANDOVER_ENABLE_DEFAULT The default behavior can still be overridden at boot time by passing 'kho=off'. +config LIVEUPDATE + bool "Live Update Orchestrator" + depends on KEXEC_HANDOVER + help + Enable the Live Update Orchestrator. Live Update is a mechanism, + typically based on kexec, that allows the kernel to be updated + while keeping selected devices operational across the transition. + These devices are intended to be reclaimed by the new kernel and + re-attached to their original workload without requiring a device + reset. + + Ability to handover a device from current to the next kernel depends + on specific support within device drivers and related kernel + subsystems. + + This feature primarily targets virtual machine hosts to quickly update + the kernel hypervisor with minimal disruption to the running virtual + machines. + + If unsure, say N. + endmenu diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile index f52ce1ebcf86..08954c1770c4 100644 --- a/kernel/liveupdate/Makefile +++ b/kernel/liveupdate/Makefile @@ -1,5 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 +luo-y := \ + luo_core.o + obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o + +obj-$(CONFIG_LIVEUPDATE) += luo.o diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c new file mode 100644 index 000000000000..30ad8836360b --- /dev/null +++ b/kernel/liveupdate/luo_core.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ + +/** + * DOC: Live Update Orchestrator (LUO) + * + * Live Update is a specialized, kexec-based reboot process that allows a + * running kernel to be updated from one version to another while preserving + * the state of selected resources and keeping designated hardware devices + * operational. For these devices, DMA activity may continue throughout the + * kernel transition. + * + * While the primary use case driving this work is supporting live updates of + * the Linux kernel when it is used as a hypervisor in cloud environments, the + * LUO framework itself is designed to be workload-agnostic. Live Update + * facilitates a full kernel version upgrade for any type of system. + * + * For example, a non-hypervisor system running an in-memory cache like + * memcached with many gigabytes of data can use LUO. The userspace service + * can place its cache into a memfd, have its state preserved by LUO, and + * restore it immediately after the kernel kexec. + * + * Whether the system is running virtual machines, containers, a + * high-performance database, or networking services, LUO's primary goal is to + * enable a full kernel update by preserving critical userspace state and + * keeping essential devices operational. + * + * The core of LUO is a mechanism that tracks the progress of a live update, + * along with a callback API that allows other kernel subsystems to participate + * in the process. Example subsystems that can hook into LUO include: kvm, + * iommu, interrupts, vfio, participating filesystems, and memory management. + * + * LUO uses Kexec Handover to transfer memory state from the current kernel to + * the next kernel. For more details see + * Documentation/core-api/kho/concepts.rst. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +static struct { + bool enabled; +} luo_global; + +static int __init early_liveupdate_param(char *buf) +{ + return kstrtobool(buf, &luo_global.enabled); +} +early_param("liveupdate", early_liveupdate_param); + +/* Public Functions */ + +/** + * liveupdate_reboot() - Kernel reboot notifier for live update final + * serialization. + * + * This function is invoked directly from the reboot() syscall pathway + * if kexec is in progress. + * + * If any callback fails, this function aborts KHO, undoes the freeze() + * callbacks, and returns an error. + */ +int liveupdate_reboot(void) +{ + return 0; +} + +/** + * liveupdate_enabled - Check if the live update feature is enabled. + * + * This function returns the state of the live update feature flag, which + * can be controlled via the ``liveupdate`` kernel command-line parameter. + * + * @return true if live update is enabled, false otherwise. + */ +bool liveupdate_enabled(void) +{ + return luo_global.enabled; +} + +struct luo_device_state { + struct miscdevice miscdev; +}; + +static const struct file_operations luo_fops = { + .owner = THIS_MODULE, +}; + +static struct luo_device_state luo_dev = { + .miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "liveupdate", + .fops = &luo_fops, + }, +}; + +static int __init liveupdate_ioctl_init(void) +{ + if (!liveupdate_enabled()) + return 0; + + return misc_register(&luo_dev.miscdev); +} +late_initcall(liveupdate_ioctl_init); From 1aece821004f67f46ef4db7199bbeca87cf22bdd Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:32 -0500 Subject: [PATCH 110/139] liveupdate: luo_core: integrate with KHO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integrate the LUO with the KHO framework to enable passing LUO state across a kexec reboot. This patch implements the lifecycle integration with KHO: 1. Incoming State: During early boot (`early_initcall`), LUO checks if KHO is active. If so, it retrieves the "LUO" subtree, verifies the "luo-v1" compatibility string, and reads the `liveupdate-number` to track the update count. 2. Outgoing State: During late initialization (`late_initcall`), LUO allocates a new FDT for the next kernel, populates it with the basic header (compatible string and incremented update number), and registers it with KHO (`kho_add_subtree`). 3. Finalization: The `liveupdate_reboot()` notifier is updated to invoke `kho_finalize()`. This ensures that all memory segments marked for preservation are properly serialized before the kexec jump. Link: https://lkml.kernel.org/r/20251125165850.3389713-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Tested-by: David Matlack Reviewed-by: Mike Rapoport (Microsoft) Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/kho/abi/luo.h | 58 ++++++++++++ kernel/liveupdate/luo_core.c | 154 ++++++++++++++++++++++++++++++- kernel/liveupdate/luo_internal.h | 22 +++++ 3 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 include/linux/kho/abi/luo.h create mode 100644 kernel/liveupdate/luo_internal.h diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h new file mode 100644 index 000000000000..2099b51929e5 --- /dev/null +++ b/include/linux/kho/abi/luo.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ + +/** + * DOC: Live Update Orchestrator ABI + * + * This header defines the stable Application Binary Interface used by the + * Live Update Orchestrator to pass state from a pre-update kernel to a + * post-update kernel. The ABI is built upon the Kexec HandOver framework + * and uses a Flattened Device Tree to describe the preserved data. + * + * This interface is a contract. Any modification to the FDT structure, node + * properties, compatible strings, or the layout of the `__packed` serialization + * structures defined here constitutes a breaking change. Such changes require + * incrementing the version number in the relevant `_COMPATIBLE` string to + * prevent a new kernel from misinterpreting data from an old kernel. + * + * Changes are allowed provided the compatibility version is incremented; + * however, backward/forward compatibility is only guaranteed for kernels + * supporting the same ABI version. + * + * FDT Structure Overview: + * The entire LUO state is encapsulated within a single KHO entry named "LUO". + * This entry contains an FDT with the following layout: + * + * .. code-block:: none + * + * / { + * compatible = "luo-v1"; + * liveupdate-number = <...>; + * }; + * + * Main LUO Node (/): + * + * - compatible: "luo-v1" + * Identifies the overall LUO ABI version. + * - liveupdate-number: u64 + * A counter tracking the number of successful live updates performed. + */ + +#ifndef _LINUX_KHO_ABI_LUO_H +#define _LINUX_KHO_ABI_LUO_H + +/* + * The LUO FDT hooks all LUO state for sessions, fds, etc. + * In the root it also carries "liveupdate-number" 64-bit property that + * corresponds to the number of live-updates performed on this machine. + */ +#define LUO_FDT_SIZE PAGE_SIZE +#define LUO_FDT_KHO_ENTRY_NAME "LUO" +#define LUO_FDT_COMPATIBLE "luo-v1" +#define LUO_FDT_LIVEUPDATE_NUM "liveupdate-number" + +#endif /* _LINUX_KHO_ABI_LUO_H */ diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index 30ad8836360b..9f9fe9a81b29 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -41,12 +41,26 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include #include +#include #include #include +#include +#include +#include +#include + +#include "kexec_handover_internal.h" +#include "luo_internal.h" static struct { bool enabled; + void *fdt_out; + void *fdt_in; + u64 liveupdate_num; } luo_global; static int __init early_liveupdate_param(char *buf) @@ -55,6 +69,129 @@ static int __init early_liveupdate_param(char *buf) } early_param("liveupdate", early_liveupdate_param); +static int __init luo_early_startup(void) +{ + phys_addr_t fdt_phys; + int err, ln_size; + const void *ptr; + + if (!kho_is_enabled()) { + if (liveupdate_enabled()) + pr_warn("Disabling liveupdate because KHO is disabled\n"); + luo_global.enabled = false; + return 0; + } + + /* Retrieve LUO subtree, and verify its format. */ + err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys); + if (err) { + if (err != -ENOENT) { + pr_err("failed to retrieve FDT '%s' from KHO: %pe\n", + LUO_FDT_KHO_ENTRY_NAME, ERR_PTR(err)); + return err; + } + + return 0; + } + + luo_global.fdt_in = phys_to_virt(fdt_phys); + err = fdt_node_check_compatible(luo_global.fdt_in, 0, + LUO_FDT_COMPATIBLE); + if (err) { + pr_err("FDT '%s' is incompatible with '%s' [%d]\n", + LUO_FDT_KHO_ENTRY_NAME, LUO_FDT_COMPATIBLE, err); + + return -EINVAL; + } + + ln_size = 0; + ptr = fdt_getprop(luo_global.fdt_in, 0, LUO_FDT_LIVEUPDATE_NUM, + &ln_size); + if (!ptr || ln_size != sizeof(luo_global.liveupdate_num)) { + pr_err("Unable to get live update number '%s' [%d]\n", + LUO_FDT_LIVEUPDATE_NUM, ln_size); + + return -EINVAL; + } + + luo_global.liveupdate_num = get_unaligned((u64 *)ptr); + pr_info("Retrieved live update data, liveupdate number: %lld\n", + luo_global.liveupdate_num); + + return 0; +} + +static int __init liveupdate_early_init(void) +{ + int err; + + err = luo_early_startup(); + if (err) { + luo_global.enabled = false; + luo_restore_fail("The incoming tree failed to initialize properly [%pe], disabling live update\n", + ERR_PTR(err)); + } + + return err; +} +early_initcall(liveupdate_early_init); + +/* Called during boot to create outgoing LUO fdt tree */ +static int __init luo_fdt_setup(void) +{ + const u64 ln = luo_global.liveupdate_num + 1; + void *fdt_out; + int err; + + fdt_out = kho_alloc_preserve(LUO_FDT_SIZE); + if (IS_ERR(fdt_out)) { + pr_err("failed to allocate/preserve FDT memory\n"); + return PTR_ERR(fdt_out); + } + + err = fdt_create(fdt_out, LUO_FDT_SIZE); + err |= fdt_finish_reservemap(fdt_out); + err |= fdt_begin_node(fdt_out, ""); + err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE); + err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln)); + err |= fdt_end_node(fdt_out); + err |= fdt_finish(fdt_out); + if (err) + goto exit_free; + + err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out); + if (err) + goto exit_free; + luo_global.fdt_out = fdt_out; + + return 0; + +exit_free: + kho_unpreserve_free(fdt_out); + pr_err("failed to prepare LUO FDT: %d\n", err); + + return err; +} + +/* + * late initcall because it initializes the outgoing tree that is needed only + * once userspace starts using /dev/liveupdate. + */ +static int __init luo_late_startup(void) +{ + int err; + + if (!liveupdate_enabled()) + return 0; + + err = luo_fdt_setup(); + if (err) + luo_global.enabled = false; + + return err; +} +late_initcall(luo_late_startup); + /* Public Functions */ /** @@ -69,7 +206,22 @@ early_param("liveupdate", early_liveupdate_param); */ int liveupdate_reboot(void) { - return 0; + int err; + + if (!liveupdate_enabled()) + return 0; + + err = kho_finalize(); + if (err) { + pr_err("kho_finalize failed %d\n", err); + /* + * kho_finalize() may return libfdt errors, to aboid passing to + * userspace unknown errors, change this to EAGAIN. + */ + err = -EAGAIN; + } + + return err; } /** diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h new file mode 100644 index 000000000000..8612687b2000 --- /dev/null +++ b/kernel/liveupdate/luo_internal.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ + +#ifndef _LINUX_LUO_INTERNAL_H +#define _LINUX_LUO_INTERNAL_H + +#include + +/* + * Handles a deserialization failure: devices and memory is in unpredictable + * state. + * + * Continuing the boot process after a failure is dangerous because it could + * lead to leaks of private data. + */ +#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__) + +#endif /* _LINUX_LUO_INTERNAL_H */ From db8bed8082dc6185153cdef67cdd5aa7526e8126 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:33 -0500 Subject: [PATCH 111/139] kexec: call liveupdate_reboot() before kexec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modify the kernel_kexec() to call liveupdate_reboot(). This ensures that the Live Update Orchestrator is notified just before the kernel executes the kexec jump. The liveupdate_reboot() function triggers the final freeze event, allowing participating FDs perform last-minute check or state saving within the blackout window. If liveupdate_reboot() returns an error (indicating a failure during LUO finalization), the kexec operation is aborted to prevent proceeding with an inconsistent state. An error is returned to user. Link: https://lkml.kernel.org/r/20251125165850.3389713-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5ed7a2383d5d..c4b4996ff1d1 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1137,6 +1138,10 @@ int kernel_kexec(void) goto Unlock; } + error = liveupdate_reboot(); + if (error) + goto Unlock; + #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* From 0153094d03df5a2e834a19c59b255649a258ae46 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:34 -0500 Subject: [PATCH 112/139] liveupdate: luo_session: add sessions support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce concept of "Live Update Sessions" within the LUO framework. LUO sessions provide a mechanism to group and manage `struct file *` instances (representing file descriptors) that need to be preserved across a kexec-based live update. Each session is identified by a unique name and acts as a container for file objects whose state is critical to a userspace workload, such as a virtual machine or a high-performance database, aiming to maintain their functionality across a kernel transition. This groundwork establishes the framework for preserving file-backed state across kernel updates, with the actual file data preservation mechanisms to be implemented in subsequent patches. [dan.carpenter@linaro.org: fix use after free in luo_session_deserialize()] Link: https://lkml.kernel.org/r/c5dd637d7eed3a3be48c5e9fedb881596a3b1f5a.1764163896.git.dan.carpenter@linaro.org Link: https://lkml.kernel.org/r/20251125165850.3389713-5-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Signed-off-by: Dan Carpenter Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/kho/abi/luo.h | 71 +++++ include/uapi/linux/liveupdate.h | 3 + kernel/liveupdate/Makefile | 3 +- kernel/liveupdate/luo_core.c | 9 + kernel/liveupdate/luo_internal.h | 29 ++ kernel/liveupdate/luo_session.c | 463 +++++++++++++++++++++++++++++++ 6 files changed, 577 insertions(+), 1 deletion(-) create mode 100644 kernel/liveupdate/luo_session.c diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h index 2099b51929e5..bf1ab2910959 100644 --- a/include/linux/kho/abi/luo.h +++ b/include/linux/kho/abi/luo.h @@ -32,6 +32,11 @@ * / { * compatible = "luo-v1"; * liveupdate-number = <...>; + * + * luo-session { + * compatible = "luo-session-v1"; + * luo-session-header = ; + * }; * }; * * Main LUO Node (/): @@ -40,11 +45,37 @@ * Identifies the overall LUO ABI version. * - liveupdate-number: u64 * A counter tracking the number of successful live updates performed. + * + * Session Node (luo-session): + * This node describes all preserved user-space sessions. + * + * - compatible: "luo-session-v1" + * Identifies the session ABI version. + * - luo-session-header: u64 + * The physical address of a `struct luo_session_header_ser`. This structure + * is the header for a contiguous block of memory containing an array of + * `struct luo_session_ser`, one for each preserved session. + * + * Serialization Structures: + * The FDT properties point to memory regions containing arrays of simple, + * `__packed` structures. These structures contain the actual preserved state. + * + * - struct luo_session_header_ser: + * Header for the session array. Contains the total page count of the + * preserved memory block and the number of `struct luo_session_ser` + * entries that follow. + * + * - struct luo_session_ser: + * Metadata for a single session, including its name and a physical pointer + * to another preserved memory block containing an array of + * `struct luo_file_ser` for all files in that session. */ #ifndef _LINUX_KHO_ABI_LUO_H #define _LINUX_KHO_ABI_LUO_H +#include + /* * The LUO FDT hooks all LUO state for sessions, fds, etc. * In the root it also carries "liveupdate-number" 64-bit property that @@ -55,4 +86,44 @@ #define LUO_FDT_COMPATIBLE "luo-v1" #define LUO_FDT_LIVEUPDATE_NUM "liveupdate-number" +/* + * LUO FDT session node + * LUO_FDT_SESSION_HEADER: is a u64 physical address of struct + * luo_session_header_ser + */ +#define LUO_FDT_SESSION_NODE_NAME "luo-session" +#define LUO_FDT_SESSION_COMPATIBLE "luo-session-v1" +#define LUO_FDT_SESSION_HEADER "luo-session-header" + +/** + * struct luo_session_header_ser - Header for the serialized session data block. + * @count: The number of `struct luo_session_ser` entries that immediately + * follow this header in the memory block. + * + * This structure is located at the beginning of a contiguous block of + * physical memory preserved across the kexec. It provides the necessary + * metadata to interpret the array of session entries that follow. + * + * If this structure is modified, `LUO_FDT_SESSION_COMPATIBLE` must be updated. + */ +struct luo_session_header_ser { + u64 count; +} __packed; + +/** + * struct luo_session_ser - Represents the serialized metadata for a LUO session. + * @name: The unique name of the session, provided by the userspace at + * the time of session creation. + * + * This structure is used to package session-specific metadata for transfer + * between kernels via Kexec Handover. An array of these structures (one per + * session) is created and passed to the new kernel, allowing it to reconstruct + * the session context. + * + * If this structure is modified, `LUO_FDT_SESSION_COMPATIBLE` must be updated. + */ +struct luo_session_ser { + char name[LIVEUPDATE_SESSION_NAME_LENGTH]; +} __packed; + #endif /* _LINUX_KHO_ABI_LUO_H */ diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h index df34c1642c4d..40578ae19668 100644 --- a/include/uapi/linux/liveupdate.h +++ b/include/uapi/linux/liveupdate.h @@ -43,4 +43,7 @@ /* The ioctl type, documented in ioctl-number.rst */ #define LIVEUPDATE_IOCTL_TYPE 0xBA +/* The maximum length of session name including null termination */ +#define LIVEUPDATE_SESSION_NAME_LENGTH 64 + #endif /* _UAPI_LIVEUPDATE_H */ diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile index 08954c1770c4..6af93caa58cf 100644 --- a/kernel/liveupdate/Makefile +++ b/kernel/liveupdate/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 luo-y := \ - luo_core.o + luo_core.o \ + luo_session.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index 9f9fe9a81b29..a0f7788cd003 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -118,6 +118,10 @@ static int __init luo_early_startup(void) pr_info("Retrieved live update data, liveupdate number: %lld\n", luo_global.liveupdate_num); + err = luo_session_setup_incoming(luo_global.fdt_in); + if (err) + return err; + return 0; } @@ -154,6 +158,7 @@ static int __init luo_fdt_setup(void) err |= fdt_begin_node(fdt_out, ""); err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE); err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln)); + err |= luo_session_setup_outgoing(fdt_out); err |= fdt_end_node(fdt_out); err |= fdt_finish(fdt_out); if (err) @@ -211,6 +216,10 @@ int liveupdate_reboot(void) if (!liveupdate_enabled()) return 0; + err = luo_session_serialize(); + if (err) + return err; + err = kho_finalize(); if (err) { pr_err("kho_finalize failed %d\n", err); diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index 8612687b2000..05ae91695ec6 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -19,4 +19,33 @@ */ #define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__) +/** + * struct luo_session - Represents an active or incoming Live Update session. + * @name: A unique name for this session, used for identification and + * retrieval. + * @ser: Pointer to the serialized data for this session. + * @list: A list_head member used to link this session into a global list + * of either outgoing (to be preserved) or incoming (restored from + * previous kernel) sessions. + * @retrieved: A boolean flag indicating whether this session has been + * retrieved by a consumer in the new kernel. + * @mutex: protects fields in the luo_session. + */ +struct luo_session { + char name[LIVEUPDATE_SESSION_NAME_LENGTH]; + struct luo_session_ser *ser; + struct list_head list; + bool retrieved; + struct mutex mutex; +}; + +int luo_session_create(const char *name, struct file **filep); +int luo_session_retrieve(const char *name, struct file **filep); +int __init luo_session_setup_outgoing(void *fdt); +int __init luo_session_setup_incoming(void *fdt); +int luo_session_serialize(void); +int luo_session_deserialize(void); +bool luo_session_quiesce(void); +void luo_session_resume(void); + #endif /* _LINUX_LUO_INTERNAL_H */ diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c new file mode 100644 index 000000000000..3a031446d3a4 --- /dev/null +++ b/kernel/liveupdate/luo_session.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ + +/** + * DOC: LUO Sessions + * + * LUO Sessions provide the core mechanism for grouping and managing `struct + * file *` instances that need to be preserved across a kexec-based live + * update. Each session acts as a named container for a set of file objects, + * allowing a userspace agent to manage the lifecycle of resources critical to a + * workload. + * + * Core Concepts: + * + * - Named Containers: Sessions are identified by a unique, user-provided name, + * which is used for both creation in the current kernel and retrieval in the + * next kernel. + * + * - Userspace Interface: Session management is driven from userspace via + * ioctls on /dev/liveupdate. + * + * - Serialization: Session metadata is preserved using the KHO framework. When + * a live update is triggered via kexec, an array of `struct luo_session_ser` + * is populated and placed in a preserved memory region. An FDT node is also + * created, containing the count of sessions and the physical address of this + * array. + * + * Session Lifecycle: + * + * 1. Creation: A userspace agent calls `luo_session_create()` to create a + * new, empty session and receives a file descriptor for it. + * + * 2. Serialization: When the `reboot(LINUX_REBOOT_CMD_KEXEC)` syscall is + * made, `luo_session_serialize()` is called. It iterates through all + * active sessions and writes their metadata into a memory area preserved + * by KHO. + * + * 3. Deserialization (in new kernel): After kexec, `luo_session_deserialize()` + * runs, reading the serialized data and creating a list of `struct + * luo_session` objects representing the preserved sessions. + * + * 4. Retrieval: A userspace agent in the new kernel can then call + * `luo_session_retrieve()` with a session name to get a new file + * descriptor and access the preserved state. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "luo_internal.h" + +/* 16 4K pages, give space for 744 sessions */ +#define LUO_SESSION_PGCNT 16ul +#define LUO_SESSION_MAX (((LUO_SESSION_PGCNT << PAGE_SHIFT) - \ + sizeof(struct luo_session_header_ser)) / \ + sizeof(struct luo_session_ser)) + +/** + * struct luo_session_header - Header struct for managing LUO sessions. + * @count: The number of sessions currently tracked in the @list. + * @list: The head of the linked list of `struct luo_session` instances. + * @rwsem: A read-write semaphore providing synchronized access to the + * session list and other fields in this structure. + * @header_ser: The header data of serialization array. + * @ser: The serialized session data (an array of + * `struct luo_session_ser`). + * @active: Set to true when first initialized. If previous kernel did not + * send session data, active stays false for incoming. + */ +struct luo_session_header { + long count; + struct list_head list; + struct rw_semaphore rwsem; + struct luo_session_header_ser *header_ser; + struct luo_session_ser *ser; + bool active; +}; + +/** + * struct luo_session_global - Global container for managing LUO sessions. + * @incoming: The sessions passed from the previous kernel. + * @outgoing: The sessions that are going to be passed to the next kernel. + */ +struct luo_session_global { + struct luo_session_header incoming; + struct luo_session_header outgoing; +}; + +static struct luo_session_global luo_session_global = { + .incoming = { + .list = LIST_HEAD_INIT(luo_session_global.incoming.list), + .rwsem = __RWSEM_INITIALIZER(luo_session_global.incoming.rwsem), + }, + .outgoing = { + .list = LIST_HEAD_INIT(luo_session_global.outgoing.list), + .rwsem = __RWSEM_INITIALIZER(luo_session_global.outgoing.rwsem), + }, +}; + +static struct luo_session *luo_session_alloc(const char *name) +{ + struct luo_session *session = kzalloc(sizeof(*session), GFP_KERNEL); + + if (!session) + return ERR_PTR(-ENOMEM); + + strscpy(session->name, name, sizeof(session->name)); + INIT_LIST_HEAD(&session->list); + mutex_init(&session->mutex); + + return session; +} + +static void luo_session_free(struct luo_session *session) +{ + mutex_destroy(&session->mutex); + kfree(session); +} + +static int luo_session_insert(struct luo_session_header *sh, + struct luo_session *session) +{ + struct luo_session *it; + + guard(rwsem_write)(&sh->rwsem); + + /* + * For outgoing we should make sure there is room in serialization array + * for new session. + */ + if (sh == &luo_session_global.outgoing) { + if (sh->count == LUO_SESSION_MAX) + return -ENOMEM; + } + + /* + * For small number of sessions this loop won't hurt performance + * but if we ever start using a lot of sessions, this might + * become a bottle neck during deserialization time, as it would + * cause O(n*n) complexity. + */ + list_for_each_entry(it, &sh->list, list) { + if (!strncmp(it->name, session->name, sizeof(it->name))) + return -EEXIST; + } + list_add_tail(&session->list, &sh->list); + sh->count++; + + return 0; +} + +static void luo_session_remove(struct luo_session_header *sh, + struct luo_session *session) +{ + guard(rwsem_write)(&sh->rwsem); + list_del(&session->list); + sh->count--; +} + +static int luo_session_release(struct inode *inodep, struct file *filep) +{ + struct luo_session *session = filep->private_data; + struct luo_session_header *sh; + + /* If retrieved is set, it means this session is from incoming list */ + if (session->retrieved) + sh = &luo_session_global.incoming; + else + sh = &luo_session_global.outgoing; + + luo_session_remove(sh, session); + luo_session_free(session); + + return 0; +} + +static const struct file_operations luo_session_fops = { + .owner = THIS_MODULE, + .release = luo_session_release, +}; + +/* Create a "struct file" for session */ +static int luo_session_getfile(struct luo_session *session, struct file **filep) +{ + char name_buf[128]; + struct file *file; + + lockdep_assert_held(&session->mutex); + snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name); + file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR); + if (IS_ERR(file)) + return PTR_ERR(file); + + *filep = file; + + return 0; +} + +int luo_session_create(const char *name, struct file **filep) +{ + struct luo_session *session; + int err; + + session = luo_session_alloc(name); + if (IS_ERR(session)) + return PTR_ERR(session); + + err = luo_session_insert(&luo_session_global.outgoing, session); + if (err) + goto err_free; + + scoped_guard(mutex, &session->mutex) + err = luo_session_getfile(session, filep); + if (err) + goto err_remove; + + return 0; + +err_remove: + luo_session_remove(&luo_session_global.outgoing, session); +err_free: + luo_session_free(session); + + return err; +} + +int luo_session_retrieve(const char *name, struct file **filep) +{ + struct luo_session_header *sh = &luo_session_global.incoming; + struct luo_session *session = NULL; + struct luo_session *it; + int err; + + scoped_guard(rwsem_read, &sh->rwsem) { + list_for_each_entry(it, &sh->list, list) { + if (!strncmp(it->name, name, sizeof(it->name))) { + session = it; + break; + } + } + } + + if (!session) + return -ENOENT; + + guard(mutex)(&session->mutex); + if (session->retrieved) + return -EINVAL; + + err = luo_session_getfile(session, filep); + if (!err) + session->retrieved = true; + + return err; +} + +int __init luo_session_setup_outgoing(void *fdt_out) +{ + struct luo_session_header_ser *header_ser; + u64 header_ser_pa; + int err; + + header_ser = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT); + if (IS_ERR(header_ser)) + return PTR_ERR(header_ser); + header_ser_pa = virt_to_phys(header_ser); + + err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME); + err |= fdt_property_string(fdt_out, "compatible", + LUO_FDT_SESSION_COMPATIBLE); + err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa, + sizeof(header_ser_pa)); + err |= fdt_end_node(fdt_out); + + if (err) + goto err_unpreserve; + + luo_session_global.outgoing.header_ser = header_ser; + luo_session_global.outgoing.ser = (void *)(header_ser + 1); + luo_session_global.outgoing.active = true; + + return 0; + +err_unpreserve: + kho_unpreserve_free(header_ser); + return err; +} + +int __init luo_session_setup_incoming(void *fdt_in) +{ + struct luo_session_header_ser *header_ser; + int err, header_size, offset; + u64 header_ser_pa; + const void *ptr; + + offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_SESSION_NODE_NAME); + if (offset < 0) { + pr_err("Unable to get session node: [%s]\n", + LUO_FDT_SESSION_NODE_NAME); + return -EINVAL; + } + + err = fdt_node_check_compatible(fdt_in, offset, + LUO_FDT_SESSION_COMPATIBLE); + if (err) { + pr_err("Session node incompatible [%s]\n", + LUO_FDT_SESSION_COMPATIBLE); + return -EINVAL; + } + + header_size = 0; + ptr = fdt_getprop(fdt_in, offset, LUO_FDT_SESSION_HEADER, &header_size); + if (!ptr || header_size != sizeof(u64)) { + pr_err("Unable to get session header '%s' [%d]\n", + LUO_FDT_SESSION_HEADER, header_size); + return -EINVAL; + } + + header_ser_pa = get_unaligned((u64 *)ptr); + header_ser = phys_to_virt(header_ser_pa); + + luo_session_global.incoming.header_ser = header_ser; + luo_session_global.incoming.ser = (void *)(header_ser + 1); + luo_session_global.incoming.active = true; + + return 0; +} + +int luo_session_deserialize(void) +{ + struct luo_session_header *sh = &luo_session_global.incoming; + static bool is_deserialized; + static int err; + + /* If has been deserialized, always return the same error code */ + if (is_deserialized) + return err; + + is_deserialized = true; + if (!sh->active) + return 0; + + /* + * Note on error handling: + * + * If deserialization fails (e.g., allocation failure or corrupt data), + * we intentionally skip cleanup of sessions that were already restored. + * + * A partial failure leaves the preserved state inconsistent. + * Implementing a safe "undo" to unwind complex dependencies (sessions, + * files, hardware state) is error-prone and provides little value, as + * the system is effectively in a broken state. + * + * We treat these resources as leaked. The expected recovery path is for + * userspace to detect the failure and trigger a reboot, which will + * reliably reset devices and reclaim memory. + */ + for (int i = 0; i < sh->header_ser->count; i++) { + struct luo_session *session; + + session = luo_session_alloc(sh->ser[i].name); + if (IS_ERR(session)) { + pr_warn("Failed to allocate session [%s] during deserialization %pe\n", + sh->ser[i].name, session); + return PTR_ERR(session); + } + + err = luo_session_insert(sh, session); + if (err) { + pr_warn("Failed to insert session [%s] %pe\n", + session->name, ERR_PTR(err)); + luo_session_free(session); + return err; + } + } + + kho_restore_free(sh->header_ser); + sh->header_ser = NULL; + sh->ser = NULL; + + return 0; +} + +int luo_session_serialize(void) +{ + struct luo_session_header *sh = &luo_session_global.outgoing; + struct luo_session *session; + int i = 0; + + guard(rwsem_write)(&sh->rwsem); + list_for_each_entry(session, &sh->list, list) { + strscpy(sh->ser[i].name, session->name, + sizeof(sh->ser[i].name)); + i++; + } + sh->header_ser->count = sh->count; + + return 0; +} + +/** + * luo_session_quiesce - Ensure no active sessions exist and lock session lists. + * + * Acquires exclusive write locks on both incoming and outgoing session lists. + * It then validates no sessions exist in either list. + * + * This mechanism is used during file handler un/registration to ensure that no + * sessions are currently using the handler, and no new sessions can be created + * while un/registration is in progress. + * + * This prevents registering new handlers while sessions are active or + * while deserialization is in progress. + * + * Return: + * true - System is quiescent (0 sessions) and locked. + * false - Active sessions exist. The locks are released internally. + */ +bool luo_session_quiesce(void) +{ + down_write(&luo_session_global.incoming.rwsem); + down_write(&luo_session_global.outgoing.rwsem); + + if (luo_session_global.incoming.count || + luo_session_global.outgoing.count) { + up_write(&luo_session_global.outgoing.rwsem); + up_write(&luo_session_global.incoming.rwsem); + return false; + } + + return true; +} + +/** + * luo_session_resume - Unlock session lists and resume normal activity. + * + * Releases the exclusive locks acquired by a successful call to + * luo_session_quiesce(). + */ +void luo_session_resume(void) +{ + up_write(&luo_session_global.outgoing.rwsem); + up_write(&luo_session_global.incoming.rwsem); +} From 81cd25d263a182b3dcdc8af3b92e4b8e4db336de Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:35 -0500 Subject: [PATCH 113/139] liveupdate: luo_core: add user interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the user-space interface for the Live Update Orchestrator via ioctl commands, enabling external control over the live update process and management of preserved resources. The idea is that there is going to be a single userspace agent driving the live update, therefore, only a single process can ever hold this device opened at a time. The following ioctl commands are introduced: LIVEUPDATE_IOCTL_CREATE_SESSION Provides a way for userspace to create a named session for grouping file descriptors that need to be preserved. It returns a new file descriptor representing the session. LIVEUPDATE_IOCTL_RETRIEVE_SESSION Allows the userspace agent in the new kernel to reclaim a preserved session by its name, receiving a new file descriptor to manage the restored resources. Link: https://lkml.kernel.org/r/20251125165850.3389713-6-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/uapi/linux/liveupdate.h | 64 +++++++++++ kernel/liveupdate/luo_core.c | 178 +++++++++++++++++++++++++++++++ kernel/liveupdate/luo_internal.h | 21 ++++ 3 files changed, 263 insertions(+) diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h index 40578ae19668..1183cf984b5f 100644 --- a/include/uapi/linux/liveupdate.h +++ b/include/uapi/linux/liveupdate.h @@ -46,4 +46,68 @@ /* The maximum length of session name including null termination */ #define LIVEUPDATE_SESSION_NAME_LENGTH 64 +/* The /dev/liveupdate ioctl commands */ +enum { + LIVEUPDATE_CMD_BASE = 0x00, + LIVEUPDATE_CMD_CREATE_SESSION = LIVEUPDATE_CMD_BASE, + LIVEUPDATE_CMD_RETRIEVE_SESSION = 0x01, +}; + +/** + * struct liveupdate_ioctl_create_session - ioctl(LIVEUPDATE_IOCTL_CREATE_SESSION) + * @size: Input; sizeof(struct liveupdate_ioctl_create_session) + * @fd: Output; The new file descriptor for the created session. + * @name: Input; A null-terminated string for the session name, max + * length %LIVEUPDATE_SESSION_NAME_LENGTH including termination + * character. + * + * Creates a new live update session for managing preserved resources. + * This ioctl can only be called on the main /dev/liveupdate device. + * + * Return: 0 on success, negative error code on failure. + */ +struct liveupdate_ioctl_create_session { + __u32 size; + __s32 fd; + __u8 name[LIVEUPDATE_SESSION_NAME_LENGTH]; +}; + +#define LIVEUPDATE_IOCTL_CREATE_SESSION \ + _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_CREATE_SESSION) + +/** + * struct liveupdate_ioctl_retrieve_session - ioctl(LIVEUPDATE_IOCTL_RETRIEVE_SESSION) + * @size: Input; sizeof(struct liveupdate_ioctl_retrieve_session) + * @fd: Output; The new file descriptor for the retrieved session. + * @name: Input; A null-terminated string identifying the session to retrieve. + * The name must exactly match the name used when the session was + * created in the previous kernel. + * + * Retrieves a handle (a new file descriptor) for a preserved session by its + * name. This is the primary mechanism for a userspace agent to regain control + * of its preserved resources after a live update. + * + * The userspace application provides the null-terminated `name` of a session + * it created before the live update. If a preserved session with a matching + * name is found, the kernel instantiates it and returns a new file descriptor + * in the `fd` field. This new session FD can then be used for all file-specific + * operations, such as restoring individual file descriptors with + * LIVEUPDATE_SESSION_RETRIEVE_FD. + * + * It is the responsibility of the userspace application to know the names of + * the sessions it needs to retrieve. If no session with the given name is + * found, the ioctl will fail with -ENOENT. + * + * This ioctl can only be called on the main /dev/liveupdate device when the + * system is in the LIVEUPDATE_STATE_UPDATED state. + */ +struct liveupdate_ioctl_retrieve_session { + __u32 size; + __s32 fd; + __u8 name[LIVEUPDATE_SESSION_NAME_LENGTH]; +}; + +#define LIVEUPDATE_IOCTL_RETRIEVE_SESSION \ + _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_RETRIEVE_SESSION) + #endif /* _UAPI_LIVEUPDATE_H */ diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index a0f7788cd003..f7ecaf7740d1 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -41,7 +41,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include #include +#include #include #include #include @@ -246,12 +252,183 @@ bool liveupdate_enabled(void) return luo_global.enabled; } +/** + * DOC: LUO ioctl Interface + * + * The IOCTL user-space control interface for the LUO subsystem. + * It registers a character device, typically found at ``/dev/liveupdate``, + * which allows a userspace agent to manage the LUO state machine and its + * associated resources, such as preservable file descriptors. + * + * To ensure that the state machine is controlled by a single entity, access + * to this device is exclusive: only one process is permitted to have + * ``/dev/liveupdate`` open at any given time. Subsequent open attempts will + * fail with -EBUSY until the first process closes its file descriptor. + * This singleton model simplifies state management by preventing conflicting + * commands from multiple userspace agents. + */ + struct luo_device_state { struct miscdevice miscdev; + atomic_t in_use; }; +static int luo_ioctl_create_session(struct luo_ucmd *ucmd) +{ + struct liveupdate_ioctl_create_session *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + err = luo_session_create(argp->name, &file); + if (err) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_ioctl_retrieve_session(struct luo_ucmd *ucmd) +{ + struct liveupdate_ioctl_retrieve_session *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + err = luo_session_retrieve(argp->name, &file); + if (err < 0) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_open(struct inode *inodep, struct file *filep) +{ + struct luo_device_state *ldev = container_of(filep->private_data, + struct luo_device_state, + miscdev); + + if (atomic_cmpxchg(&ldev->in_use, 0, 1)) + return -EBUSY; + + /* Always return -EIO to user if deserialization fail */ + if (luo_session_deserialize()) { + atomic_set(&ldev->in_use, 0); + return -EIO; + } + + return 0; +} + +static int luo_release(struct inode *inodep, struct file *filep) +{ + struct luo_device_state *ldev = container_of(filep->private_data, + struct luo_device_state, + miscdev); + atomic_set(&ldev->in_use, 0); + + return 0; +} + +union ucmd_buffer { + struct liveupdate_ioctl_create_session create; + struct liveupdate_ioctl_retrieve_session retrieve; +}; + +struct luo_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct luo_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, _last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } + +static const struct luo_ioctl_op luo_ioctl_ops[] = { + IOCTL_OP(LIVEUPDATE_IOCTL_CREATE_SESSION, luo_ioctl_create_session, + struct liveupdate_ioctl_create_session, name), + IOCTL_OP(LIVEUPDATE_IOCTL_RETRIEVE_SESSION, luo_ioctl_retrieve_session, + struct liveupdate_ioctl_retrieve_session, name), +}; + +static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + const struct luo_ioctl_op *op; + struct luo_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int err; + + nr = _IOC_NR(cmd); + if (nr < LIVEUPDATE_CMD_BASE || + (nr - LIVEUPDATE_CMD_BASE) >= ARRAY_SIZE(luo_ioctl_ops)) { + return -EINVAL; + } + + ucmd.ubuffer = (void __user *)arg; + err = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (err) + return err; + + op = &luo_ioctl_ops[nr - LIVEUPDATE_CMD_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + err = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (err) + return err; + + return op->execute(&ucmd); +} + static const struct file_operations luo_fops = { .owner = THIS_MODULE, + .open = luo_open, + .release = luo_release, + .unlocked_ioctl = luo_ioctl, }; static struct luo_device_state luo_dev = { @@ -260,6 +437,7 @@ static struct luo_device_state luo_dev = { .name = "liveupdate", .fops = &luo_fops, }, + .in_use = ATOMIC_INIT(0), }; static int __init liveupdate_ioctl_init(void) diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index 05ae91695ec6..1292ac47eef8 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -9,6 +9,27 @@ #define _LINUX_LUO_INTERNAL_H #include +#include + +struct luo_ucmd { + void __user *ubuffer; + u32 user_size; + void *cmd; +}; + +static inline int luo_ucmd_respond(struct luo_ucmd *ucmd, + size_t kernel_cmd_size) +{ + /* + * Copy the minimum of what the user provided and what we actually + * have. + */ + if (copy_to_user(ucmd->ubuffer, ucmd->cmd, + min_t(size_t, ucmd->user_size, kernel_cmd_size))) { + return -EFAULT; + } + return 0; +} /* * Handles a deserialization failure: devices and memory is in unpredictable From 7c722a7f44e0c1f9714084152226bc7bd644b7e3 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:36 -0500 Subject: [PATCH 114/139] liveupdate: luo_file: implement file systems callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements the core mechanism for managing preserved files throughout the live update lifecycle. It provides the logic to invoke the file handler callbacks (preserve, unpreserve, freeze, unfreeze, retrieve, and finish) at the appropriate stages. During the reboot phase, luo_file_freeze() serializes the final metadata for each file (handler compatible string, token, and data handle) into a memory region preserved by KHO. In the new kernel, luo_file_deserialize() reconstructs the in-memory file list from this data, preparing the session for retrieval. Link: https://lkml.kernel.org/r/20251125165850.3389713-7-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/kho/abi/luo.h | 39 +- include/linux/liveupdate.h | 98 ++++ kernel/liveupdate/Makefile | 1 + kernel/liveupdate/luo_file.c | 880 +++++++++++++++++++++++++++++++ kernel/liveupdate/luo_internal.h | 38 ++ 5 files changed, 1055 insertions(+), 1 deletion(-) create mode 100644 kernel/liveupdate/luo_file.c diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h index bf1ab2910959..bb099c92e469 100644 --- a/include/linux/kho/abi/luo.h +++ b/include/linux/kho/abi/luo.h @@ -69,6 +69,11 @@ * Metadata for a single session, including its name and a physical pointer * to another preserved memory block containing an array of * `struct luo_file_ser` for all files in that session. + * + * - struct luo_file_ser: + * Metadata for a single preserved file. Contains the `compatible` string to + * find the correct handler in the new kernel, a user-provided `token` for + * identification, and an opaque `data` handle for the handler to use. */ #ifndef _LINUX_KHO_ABI_LUO_H @@ -86,13 +91,43 @@ #define LUO_FDT_COMPATIBLE "luo-v1" #define LUO_FDT_LIVEUPDATE_NUM "liveupdate-number" +#define LIVEUPDATE_HNDL_COMPAT_LENGTH 48 + +/** + * struct luo_file_ser - Represents the serialized preserves files. + * @compatible: File handler compatible string. + * @data: Private data + * @token: User provided token for this file + * + * If this structure is modified, LUO_SESSION_COMPATIBLE must be updated. + */ +struct luo_file_ser { + char compatible[LIVEUPDATE_HNDL_COMPAT_LENGTH]; + u64 data; + u64 token; +} __packed; + +/** + * struct luo_file_set_ser - Represents the serialized metadata for file set + * @files: The physical address of a contiguous memory block that holds + * the serialized state of files (array of luo_file_ser) in this file + * set. + * @count: The total number of files that were part of this session during + * serialization. Used for iteration and validation during + * restoration. + */ +struct luo_file_set_ser { + u64 files; + u64 count; +} __packed; + /* * LUO FDT session node * LUO_FDT_SESSION_HEADER: is a u64 physical address of struct * luo_session_header_ser */ #define LUO_FDT_SESSION_NODE_NAME "luo-session" -#define LUO_FDT_SESSION_COMPATIBLE "luo-session-v1" +#define LUO_FDT_SESSION_COMPATIBLE "luo-session-v2" #define LUO_FDT_SESSION_HEADER "luo-session-header" /** @@ -114,6 +149,7 @@ struct luo_session_header_ser { * struct luo_session_ser - Represents the serialized metadata for a LUO session. * @name: The unique name of the session, provided by the userspace at * the time of session creation. + * @file_set_ser: Serialized files belonging to this session, * * This structure is used to package session-specific metadata for transfer * between kernels via Kexec Handover. An array of these structures (one per @@ -124,6 +160,7 @@ struct luo_session_header_ser { */ struct luo_session_ser { char name[LIVEUPDATE_SESSION_NAME_LENGTH]; + struct luo_file_set_ser file_set_ser; } __packed; #endif /* _LINUX_KHO_ABI_LUO_H */ diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index c6a1d6bd90cb..122ad8f16ff9 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -8,8 +8,93 @@ #define _LINUX_LIVEUPDATE_H #include +#include +#include #include #include +#include + +struct liveupdate_file_handler; +struct file; + +/** + * struct liveupdate_file_op_args - Arguments for file operation callbacks. + * @handler: The file handler being called. + * @retrieved: The retrieve status for the 'can_finish / finish' + * operation. + * @file: The file object. For retrieve: [OUT] The callback sets + * this to the new file. For other ops: [IN] The caller sets + * this to the file being operated on. + * @serialized_data: The opaque u64 handle, preserve/prepare/freeze may update + * this field. + * + * This structure bundles all parameters for the file operation callbacks. + * The 'data' and 'file' fields are used for both input and output. + */ +struct liveupdate_file_op_args { + struct liveupdate_file_handler *handler; + bool retrieved; + struct file *file; + u64 serialized_data; +}; + +/** + * struct liveupdate_file_ops - Callbacks for live-updatable files. + * @can_preserve: Required. Lightweight check to see if this handler is + * compatible with the given file. + * @preserve: Required. Performs state-saving for the file. + * @unpreserve: Required. Cleans up any resources allocated by @preserve. + * @freeze: Optional. Final actions just before kernel transition. + * @unfreeze: Optional. Undo freeze operations. + * @retrieve: Required. Restores the file in the new kernel. + * @can_finish: Optional. Check if this FD can finish, i.e. all restoration + * pre-requirements for this FD are satisfied. Called prior to + * finish, in order to do successful finish calls for all + * resources in the session. + * @finish: Required. Final cleanup in the new kernel. + * @owner: Module reference + * + * All operations (except can_preserve) receive a pointer to a + * 'struct liveupdate_file_op_args' containing the necessary context. + */ +struct liveupdate_file_ops { + bool (*can_preserve)(struct liveupdate_file_handler *handler, + struct file *file); + int (*preserve)(struct liveupdate_file_op_args *args); + void (*unpreserve)(struct liveupdate_file_op_args *args); + int (*freeze)(struct liveupdate_file_op_args *args); + void (*unfreeze)(struct liveupdate_file_op_args *args); + int (*retrieve)(struct liveupdate_file_op_args *args); + bool (*can_finish)(struct liveupdate_file_op_args *args); + void (*finish)(struct liveupdate_file_op_args *args); + struct module *owner; +}; + +/** + * struct liveupdate_file_handler - Represents a handler for a live-updatable file type. + * @ops: Callback functions + * @compatible: The compatibility string (e.g., "memfd-v1", "vfiofd-v1") + * that uniquely identifies the file type this handler + * supports. This is matched against the compatible string + * associated with individual &struct file instances. + * + * Modules that want to support live update for specific file types should + * register an instance of this structure. LUO uses this registration to + * determine if a given file can be preserved and to find the appropriate + * operations to manage its state across the update. + */ +struct liveupdate_file_handler { + const struct liveupdate_file_ops *ops; + const char compatible[LIVEUPDATE_HNDL_COMPAT_LENGTH]; + + /* private: */ + + /* + * Used for linking this handler instance into a global list of + * registered file handlers. + */ + struct list_head __private list; +}; #ifdef CONFIG_LIVEUPDATE @@ -19,6 +104,9 @@ bool liveupdate_enabled(void); /* Called during kexec to tell LUO that entered into reboot */ int liveupdate_reboot(void); +int liveupdate_register_file_handler(struct liveupdate_file_handler *fh); +int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); + #else /* CONFIG_LIVEUPDATE */ static inline bool liveupdate_enabled(void) @@ -31,5 +119,15 @@ static inline int liveupdate_reboot(void) return 0; } +static inline int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) +{ + return -EOPNOTSUPP; +} + +static inline int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_LIVEUPDATE */ #endif /* _LINUX_LIVEUPDATE_H */ diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile index 6af93caa58cf..7cad2eece32d 100644 --- a/kernel/liveupdate/Makefile +++ b/kernel/liveupdate/Makefile @@ -2,6 +2,7 @@ luo-y := \ luo_core.o \ + luo_file.o \ luo_session.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c new file mode 100644 index 000000000000..e9727cb1275a --- /dev/null +++ b/kernel/liveupdate/luo_file.c @@ -0,0 +1,880 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ + +/** + * DOC: LUO File Descriptors + * + * LUO provides the infrastructure to preserve specific, stateful file + * descriptors across a kexec-based live update. The primary goal is to allow + * workloads, such as virtual machines using vfio, memfd, or iommufd, to + * retain access to their essential resources without interruption. + * + * The framework is built around a callback-based handler model and a well- + * defined lifecycle for each preserved file. + * + * Handler Registration: + * Kernel modules responsible for a specific file type (e.g., memfd, vfio) + * register a &struct liveupdate_file_handler. This handler provides a set of + * callbacks that LUO invokes at different stages of the update process, most + * notably: + * + * - can_preserve(): A lightweight check to determine if the handler is + * compatible with a given 'struct file'. + * - preserve(): The heavyweight operation that saves the file's state and + * returns an opaque u64 handle. This is typically performed while the + * workload is still active to minimize the downtime during the + * actual reboot transition. + * - unpreserve(): Cleans up any resources allocated by .preserve(), called + * if the preservation process is aborted before the reboot (i.e. session is + * closed). + * - freeze(): A final pre-reboot opportunity to prepare the state for kexec. + * We are already in reboot syscall, and therefore userspace cannot mutate + * the file anymore. + * - unfreeze(): Undoes the actions of .freeze(), called if the live update + * is aborted after the freeze phase. + * - retrieve(): Reconstructs the file in the new kernel from the preserved + * handle. + * - finish(): Performs final check and cleanup in the new kernel. After + * succesul finish call, LUO gives up ownership to this file. + * + * File Preservation Lifecycle happy path: + * + * 1. Preserve (Normal Operation): A userspace agent preserves files one by one + * via an ioctl. For each file, luo_preserve_file() finds a compatible + * handler, calls its .preserve() operation, and creates an internal &struct + * luo_file to track the live state. + * + * 2. Freeze (Pre-Reboot): Just before the kexec, luo_file_freeze() is called. + * It iterates through all preserved files, calls their respective .freeze() + * operation, and serializes their final metadata (compatible string, token, + * and data handle) into a contiguous memory block for KHO. + * + * 3. Deserialize: After kexec, luo_file_deserialize() runs when session gets + * deserialized (which is when /dev/liveupdate is first opened). It reads the + * serialized data from the KHO memory region and reconstructs the in-memory + * list of &struct luo_file instances for the new kernel, linking them to + * their corresponding handlers. + * + * 4. Retrieve (New Kernel - Userspace Ready): The userspace agent can now + * restore file descriptors by providing a token. luo_retrieve_file() + * searches for the matching token, calls the handler's .retrieve() op to + * re-create the 'struct file', and returns a new FD. Files can be + * retrieved in ANY order. + * + * 5. Finish (New Kernel - Cleanup): Once a session retrival is complete, + * luo_file_finish() is called. It iterates through all files, invokes their + * .finish() operations for final cleanup, and releases all associated kernel + * resources. + * + * File Preservation Lifecycle unhappy paths: + * + * 1. Abort Before Reboot: If the userspace agent aborts the live update + * process before calling reboot (e.g., by closing the session file + * descriptor), the session's release handler calls + * luo_file_unpreserve_files(). This invokes the .unpreserve() callback on + * all preserved files, ensuring all allocated resources are cleaned up and + * returning the system to a clean state. + * + * 2. Freeze Failure: During the reboot() syscall, if any handler's .freeze() + * op fails, the .unfreeze() op is invoked on all previously *successful* + * freezes to roll back their state. The reboot() syscall then returns an + * error to userspace, canceling the live update. + * + * 3. Finish Failure: In the new kernel, if a handler's .finish() op fails, + * the luo_file_finish() operation is aborted. LUO retains ownership of + * all files within that session, including those that were not yet + * processed. The userspace agent can attempt to call the finish operation + * again later. If the issue cannot be resolved, these resources will be held + * by LUO until the next live update cycle, at which point they will be + * discarded. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "luo_internal.h" + +static LIST_HEAD(luo_file_handler_list); + +/* 2 4K pages, give space for 128 files per file_set */ +#define LUO_FILE_PGCNT 2ul +#define LUO_FILE_MAX \ + ((LUO_FILE_PGCNT << PAGE_SHIFT) / sizeof(struct luo_file_ser)) + +/** + * struct luo_file - Represents a single preserved file instance. + * @fh: Pointer to the &struct liveupdate_file_handler that manages + * this type of file. + * @file: Pointer to the kernel's &struct file that is being preserved. + * This is NULL in the new kernel until the file is successfully + * retrieved. + * @serialized_data: The opaque u64 handle to the serialized state of the file. + * This handle is passed back to the handler's .freeze(), + * .retrieve(), and .finish() callbacks, allowing it to track + * and update its serialized state across phases. + * @retrieved: A flag indicating whether a user/kernel in the new kernel has + * successfully called retrieve() on this file. This prevents + * multiple retrieval attempts. + * @mutex: A mutex that protects the fields of this specific instance + * (e.g., @retrieved, @file), ensuring that operations like + * retrieving or finishing a file are atomic. + * @list: The list_head linking this instance into its parent + * file_set's list of preserved files. + * @token: The user-provided unique token used to identify this file. + * + * This structure is the core in-kernel representation of a single file being + * managed through a live update. An instance is created by luo_preserve_file() + * to link a 'struct file' to its corresponding handler, a user-provided token, + * and the serialized state handle returned by the handler's .preserve() + * operation. + * + * These instances are tracked in a per-file_set list. The @serialized_data + * field, which holds a handle to the file's serialized state, may be updated + * during the .freeze() callback before being serialized for the next kernel. + * After reboot, these structures are recreated by luo_file_deserialize() and + * are finally cleaned up by luo_file_finish(). + */ +struct luo_file { + struct liveupdate_file_handler *fh; + struct file *file; + u64 serialized_data; + bool retrieved; + struct mutex mutex; + struct list_head list; + u64 token; +}; + +static int luo_alloc_files_mem(struct luo_file_set *file_set) +{ + size_t size; + void *mem; + + if (file_set->files) + return 0; + + WARN_ON_ONCE(file_set->count); + + size = LUO_FILE_PGCNT << PAGE_SHIFT; + mem = kho_alloc_preserve(size); + if (IS_ERR(mem)) + return PTR_ERR(mem); + + file_set->files = mem; + + return 0; +} + +static void luo_free_files_mem(struct luo_file_set *file_set) +{ + /* If file_set has files, no need to free preservation memory */ + if (file_set->count) + return; + + if (!file_set->files) + return; + + kho_unpreserve_free(file_set->files); + file_set->files = NULL; +} + +static bool luo_token_is_used(struct luo_file_set *file_set, u64 token) +{ + struct luo_file *iter; + + list_for_each_entry(iter, &file_set->files_list, list) { + if (iter->token == token) + return true; + } + + return false; +} + +/** + * luo_preserve_file - Initiate the preservation of a file descriptor. + * @file_set: The file_set to which the preserved file will be added. + * @token: A unique, user-provided identifier for the file. + * @fd: The file descriptor to be preserved. + * + * This function orchestrates the first phase of preserving a file. Upon entry, + * it takes a reference to the 'struct file' via fget(), effectively making LUO + * a co-owner of the file. This reference is held until the file is either + * unpreserved or successfully finished in the next kernel, preventing the file + * from being prematurely destroyed. + * + * This function orchestrates the first phase of preserving a file. It performs + * the following steps: + * + * 1. Validates that the @token is not already in use within the file_set. + * 2. Ensures the file_set's memory for files serialization is allocated + * (allocates if needed). + * 3. Iterates through registered handlers, calling can_preserve() to find one + * compatible with the given @fd. + * 4. Calls the handler's .preserve() operation, which saves the file's state + * and returns an opaque private data handle. + * 5. Adds the new instance to the file_set's internal list. + * + * On success, LUO takes a reference to the 'struct file' and considers it + * under its management until it is unpreserved or finished. + * + * In case of any failure, all intermediate allocations (file reference, memory + * for the 'luo_file' struct, etc.) are cleaned up before returning an error. + * + * Context: Can be called from an ioctl handler during normal system operation. + * Return: 0 on success. Returns a negative errno on failure: + * -EEXIST if the token is already used. + * -EBADF if the file descriptor is invalid. + * -ENOSPC if the file_set is full. + * -ENOENT if no compatible handler is found. + * -ENOMEM on memory allocation failure. + * Other erros might be returned by .preserve(). + */ +int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) +{ + struct liveupdate_file_op_args args = {0}; + struct liveupdate_file_handler *fh; + struct luo_file *luo_file; + struct file *file; + int err; + + if (luo_token_is_used(file_set, token)) + return -EEXIST; + + if (file_set->count == LUO_FILE_MAX) + return -ENOSPC; + + file = fget(fd); + if (!file) + return -EBADF; + + err = luo_alloc_files_mem(file_set); + if (err) + goto err_fput; + + err = -ENOENT; + luo_list_for_each_private(fh, &luo_file_handler_list, list) { + if (fh->ops->can_preserve(fh, file)) { + err = 0; + break; + } + } + + /* err is still -ENOENT if no handler was found */ + if (err) + goto err_free_files_mem; + + luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL); + if (!luo_file) { + err = -ENOMEM; + goto err_free_files_mem; + } + + luo_file->file = file; + luo_file->fh = fh; + luo_file->token = token; + luo_file->retrieved = false; + mutex_init(&luo_file->mutex); + + args.handler = fh; + args.file = file; + err = fh->ops->preserve(&args); + if (err) + goto err_kfree; + + luo_file->serialized_data = args.serialized_data; + list_add_tail(&luo_file->list, &file_set->files_list); + file_set->count++; + + return 0; + +err_kfree: + kfree(luo_file); +err_free_files_mem: + luo_free_files_mem(file_set); +err_fput: + fput(file); + + return err; +} + +/** + * luo_file_unpreserve_files - Unpreserves all files from a file_set. + * @file_set: The files to be cleaned up. + * + * This function serves as the primary cleanup path for a file_set. It is + * invoked when the userspace agent closes the file_set's file descriptor. + * + * For each file, it performs the following cleanup actions: + * 1. Calls the handler's .unpreserve() callback to allow the handler to + * release any resources it allocated. + * 2. Removes the file from the file_set's internal tracking list. + * 3. Releases the reference to the 'struct file' that was taken by + * luo_preserve_file() via fput(), returning ownership. + * 4. Frees the memory associated with the internal 'struct luo_file'. + * + * After all individual files are unpreserved, it frees the contiguous memory + * block that was allocated to hold their serialization data. + */ +void luo_file_unpreserve_files(struct luo_file_set *file_set) +{ + struct luo_file *luo_file; + + while (!list_empty(&file_set->files_list)) { + struct liveupdate_file_op_args args = {0}; + + luo_file = list_last_entry(&file_set->files_list, + struct luo_file, list); + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + luo_file->fh->ops->unpreserve(&args); + + list_del(&luo_file->list); + file_set->count--; + + fput(luo_file->file); + mutex_destroy(&luo_file->mutex); + kfree(luo_file); + } + + luo_free_files_mem(file_set); +} + +static int luo_file_freeze_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + int err = 0; + + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->freeze) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + + err = luo_file->fh->ops->freeze(&args); + if (!err) + luo_file->serialized_data = args.serialized_data; + } + + return err; +} + +static void luo_file_unfreeze_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->unfreeze) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + + luo_file->fh->ops->unfreeze(&args); + } + + luo_file->serialized_data = 0; +} + +static void __luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file *failed_entry) +{ + struct list_head *files_list = &file_set->files_list; + struct luo_file *luo_file; + + list_for_each_entry(luo_file, files_list, list) { + if (luo_file == failed_entry) + break; + + luo_file_unfreeze_one(file_set, luo_file); + } + + memset(file_set->files, 0, LUO_FILE_PGCNT << PAGE_SHIFT); +} + +/** + * luo_file_freeze - Freezes all preserved files and serializes their metadata. + * @file_set: The file_set whose files are to be frozen. + * @file_set_ser: Where to put the serialized file_set. + * + * This function is called from the reboot() syscall path, just before the + * kernel transitions to the new image via kexec. Its purpose is to perform the + * final preparation and serialization of all preserved files in the file_set. + * + * It iterates through each preserved file in FIFO order (the order of + * preservation) and performs two main actions: + * + * 1. Freezes the File: It calls the handler's .freeze() callback for each + * file. This gives the handler a final opportunity to quiesce the device or + * prepare its state for the upcoming reboot. The handler may update its + * private data handle during this step. + * + * 2. Serializes Metadata: After a successful freeze, it copies the final file + * metadata—the handler's compatible string, the user token, and the final + * private data handle—into the pre-allocated contiguous memory buffer + * (file_set->files) that will be handed over to the next kernel via KHO. + * + * Error Handling (Rollback): + * This function is atomic. If any handler's .freeze() operation fails, the + * entire live update is aborted. The __luo_file_unfreeze() helper is + * immediately called to invoke the .unfreeze() op on all files that were + * successfully frozen before the point of failure, rolling them back to a + * running state. The function then returns an error, causing the reboot() + * syscall to fail. + * + * Context: Called only from the liveupdate_reboot() path. + * Return: 0 on success, or a negative errno on failure. + */ +int luo_file_freeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + struct luo_file_ser *file_ser = file_set->files; + struct luo_file *luo_file; + int err; + int i; + + if (!file_set->count) + return 0; + + if (WARN_ON(!file_ser)) + return -EINVAL; + + i = 0; + list_for_each_entry(luo_file, &file_set->files_list, list) { + err = luo_file_freeze_one(file_set, luo_file); + if (err < 0) { + pr_warn("Freeze failed for token[%#0llx] handler[%s] err[%pe]\n", + luo_file->token, luo_file->fh->compatible, + ERR_PTR(err)); + goto err_unfreeze; + } + + strscpy(file_ser[i].compatible, luo_file->fh->compatible, + sizeof(file_ser[i].compatible)); + file_ser[i].data = luo_file->serialized_data; + file_ser[i].token = luo_file->token; + i++; + } + + file_set_ser->count = file_set->count; + if (file_set->files) + file_set_ser->files = virt_to_phys(file_set->files); + + return 0; + +err_unfreeze: + __luo_file_unfreeze(file_set, luo_file); + + return err; +} + +/** + * luo_file_unfreeze - Unfreezes all files in a file_set and clear serialization + * @file_set: The file_set whose files are to be unfrozen. + * @file_set_ser: Serialized file_set. + * + * This function rolls back the state of all files in a file_set after the + * freeze phase has begun but must be aborted. It is the counterpart to + * luo_file_freeze(). + * + * It invokes the __luo_file_unfreeze() helper with a NULL argument, which + * signals the helper to iterate through all files in the file_set and call + * their respective .unfreeze() handler callbacks. + * + * Context: This is called when the live update is aborted during + * the reboot() syscall, after luo_file_freeze() has been called. + */ +void luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + if (!file_set->count) + return; + + __luo_file_unfreeze(file_set, NULL); + memset(file_set_ser, 0, sizeof(*file_set_ser)); +} + +/** + * luo_retrieve_file - Restores a preserved file from a file_set by its token. + * @file_set: The file_set from which to retrieve the file. + * @token: The unique token identifying the file to be restored. + * @filep: Output parameter; on success, this is populated with a pointer + * to the newly retrieved 'struct file'. + * + * This function is the primary mechanism for recreating a file in the new + * kernel after a live update. It searches the file_set's list of deserialized + * files for an entry matching the provided @token. + * + * The operation is idempotent: if a file has already been successfully + * retrieved, this function will simply return a pointer to the existing + * 'struct file' and report success without re-executing the retrieve + * operation. This is handled by checking the 'retrieved' flag under a lock. + * + * File retrieval can happen in any order; it is not bound by the order of + * preservation. + * + * Context: Can be called from an ioctl or other in-kernel code in the new + * kernel. + * Return: 0 on success. Returns a negative errno on failure: + * -ENOENT if no file with the matching token is found. + * Any error code returned by the handler's .retrieve() op. + */ +int luo_retrieve_file(struct luo_file_set *file_set, u64 token, + struct file **filep) +{ + struct liveupdate_file_op_args args = {0}; + struct luo_file *luo_file; + int err; + + if (list_empty(&file_set->files_list)) + return -ENOENT; + + list_for_each_entry(luo_file, &file_set->files_list, list) { + if (luo_file->token == token) + break; + } + + if (luo_file->token != token) + return -ENOENT; + + guard(mutex)(&luo_file->mutex); + if (luo_file->retrieved) { + /* + * Someone is asking for this file again, so get a reference + * for them. + */ + get_file(luo_file->file); + *filep = luo_file->file; + return 0; + } + + args.handler = luo_file->fh; + args.serialized_data = luo_file->serialized_data; + err = luo_file->fh->ops->retrieve(&args); + if (!err) { + luo_file->file = args.file; + + /* Get reference so we can keep this file in LUO until finish */ + get_file(luo_file->file); + *filep = luo_file->file; + luo_file->retrieved = true; + } + + return err; +} + +static int luo_file_can_finish_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + bool can_finish = true; + + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->can_finish) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.retrieved = luo_file->retrieved; + can_finish = luo_file->fh->ops->can_finish(&args); + } + + return can_finish ? 0 : -EBUSY; +} + +static void luo_file_finish_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + struct liveupdate_file_op_args args = {0}; + + guard(mutex)(&luo_file->mutex); + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.retrieved = luo_file->retrieved; + + luo_file->fh->ops->finish(&args); +} + +/** + * luo_file_finish - Completes the lifecycle for all files in a file_set. + * @file_set: The file_set to be finalized. + * + * This function orchestrates the final teardown of a live update file_set in + * the new kernel. It should be called after all necessary files have been + * retrieved and the userspace agent is ready to release the preserved state. + * + * The function iterates through all tracked files. For each file, it performs + * the following sequence of cleanup actions: + * + * 1. If file is not yet retrieved, retrieves it, and calls can_finish() on + * every file in the file_set. If all can_finish return true, continue to + * finish. + * 2. Calls the handler's .finish() callback (via luo_file_finish_one) to + * allow for final resource cleanup within the handler. + * 3. Releases LUO's ownership reference on the 'struct file' via fput(). This + * is the counterpart to the get_file() call in luo_retrieve_file(). + * 4. Removes the 'struct luo_file' from the file_set's internal list. + * 5. Frees the memory for the 'struct luo_file' instance itself. + * + * After successfully finishing all individual files, it frees the + * contiguous memory block that was used to transfer the serialized metadata + * from the previous kernel. + * + * Error Handling (Atomic Failure): + * This operation is atomic. If any handler's .can_finish() op fails, the entire + * function aborts immediately and returns an error. + * + * Context: Can be called from an ioctl handler in the new kernel. + * Return: 0 on success, or a negative errno on failure. + */ +int luo_file_finish(struct luo_file_set *file_set) +{ + struct list_head *files_list = &file_set->files_list; + struct luo_file *luo_file; + int err; + + if (!file_set->count) + return 0; + + list_for_each_entry(luo_file, files_list, list) { + err = luo_file_can_finish_one(file_set, luo_file); + if (err) + return err; + } + + while (!list_empty(&file_set->files_list)) { + luo_file = list_last_entry(&file_set->files_list, + struct luo_file, list); + + luo_file_finish_one(file_set, luo_file); + + if (luo_file->file) + fput(luo_file->file); + list_del(&luo_file->list); + file_set->count--; + mutex_destroy(&luo_file->mutex); + kfree(luo_file); + } + + if (file_set->files) { + kho_restore_free(file_set->files); + file_set->files = NULL; + } + + return 0; +} + +/** + * luo_file_deserialize - Reconstructs the list of preserved files in the new kernel. + * @file_set: The incoming file_set to fill with deserialized data. + * @file_set_ser: Serialized KHO file_set data from the previous kernel. + * + * This function is called during the early boot process of the new kernel. It + * takes the raw, contiguous memory block of 'struct luo_file_ser' entries, + * provided by the previous kernel, and transforms it back into a live, + * in-memory linked list of 'struct luo_file' instances. + * + * For each serialized entry, it performs the following steps: + * 1. Reads the 'compatible' string. + * 2. Searches the global list of registered file handlers for one that + * matches the compatible string. + * 3. Allocates a new 'struct luo_file'. + * 4. Populates the new structure with the deserialized data (token, private + * data handle) and links it to the found handler. The 'file' pointer is + * initialized to NULL, as the file has not been retrieved yet. + * 5. Adds the new 'struct luo_file' to the file_set's files_list. + * + * This prepares the file_set for userspace, which can later call + * luo_retrieve_file() to restore the actual file descriptors. + * + * Context: Called from session deserialization. + */ +int luo_file_deserialize(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + struct luo_file_ser *file_ser; + u64 i; + + if (!file_set_ser->files) { + WARN_ON(file_set_ser->count); + return 0; + } + + file_set->count = file_set_ser->count; + file_set->files = phys_to_virt(file_set_ser->files); + + /* + * Note on error handling: + * + * If deserialization fails (e.g., allocation failure or corrupt data), + * we intentionally skip cleanup of files that were already restored. + * + * A partial failure leaves the preserved state inconsistent. + * Implementing a safe "undo" to unwind complex dependencies (sessions, + * files, hardware state) is error-prone and provides little value, as + * the system is effectively in a broken state. + * + * We treat these resources as leaked. The expected recovery path is for + * userspace to detect the failure and trigger a reboot, which will + * reliably reset devices and reclaim memory. + */ + file_ser = file_set->files; + for (i = 0; i < file_set->count; i++) { + struct liveupdate_file_handler *fh; + bool handler_found = false; + struct luo_file *luo_file; + + luo_list_for_each_private(fh, &luo_file_handler_list, list) { + if (!strcmp(fh->compatible, file_ser[i].compatible)) { + handler_found = true; + break; + } + } + + if (!handler_found) { + pr_warn("No registered handler for compatible '%s'\n", + file_ser[i].compatible); + return -ENOENT; + } + + luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL); + if (!luo_file) + return -ENOMEM; + + luo_file->fh = fh; + luo_file->file = NULL; + luo_file->serialized_data = file_ser[i].data; + luo_file->token = file_ser[i].token; + luo_file->retrieved = false; + mutex_init(&luo_file->mutex); + list_add_tail(&luo_file->list, &file_set->files_list); + } + + return 0; +} + +void luo_file_set_init(struct luo_file_set *file_set) +{ + INIT_LIST_HEAD(&file_set->files_list); +} + +void luo_file_set_destroy(struct luo_file_set *file_set) +{ + WARN_ON(file_set->count); + WARN_ON(!list_empty(&file_set->files_list)); +} + +/** + * liveupdate_register_file_handler - Register a file handler with LUO. + * @fh: Pointer to a caller-allocated &struct liveupdate_file_handler. + * The caller must initialize this structure, including a unique + * 'compatible' string and a valid 'fh' callbacks. This function adds the + * handler to the global list of supported file handlers. + * + * Context: Typically called during module initialization for file types that + * support live update preservation. + * + * Return: 0 on success. Negative errno on failure. + */ +int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) +{ + struct liveupdate_file_handler *fh_iter; + int err; + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + /* Sanity check that all required callbacks are set */ + if (!fh->ops->preserve || !fh->ops->unpreserve || !fh->ops->retrieve || + !fh->ops->finish || !fh->ops->can_preserve) { + return -EINVAL; + } + + /* + * Ensure the system is quiescent (no active sessions). + * This prevents registering new handlers while sessions are active or + * while deserialization is in progress. + */ + if (!luo_session_quiesce()) + return -EBUSY; + + /* Check for duplicate compatible strings */ + luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) { + if (!strcmp(fh_iter->compatible, fh->compatible)) { + pr_err("File handler registration failed: Compatible string '%s' already registered.\n", + fh->compatible); + err = -EEXIST; + goto err_resume; + } + } + + /* Pin the module implementing the handler */ + if (!try_module_get(fh->ops->owner)) { + err = -EAGAIN; + goto err_resume; + } + + INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list)); + list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list); + luo_session_resume(); + + return 0; + +err_resume: + luo_session_resume(); + return err; +} + +/** + * liveupdate_unregister_file_handler - Unregister a liveupdate file handler + * @fh: The file handler to unregister + * + * Unregisters the file handler from the liveupdate core. This function + * reverses the operations of liveupdate_register_file_handler(). + * + * It ensures safe removal by checking that: + * No live update session is currently in progress. + * + * If the unregistration fails, the internal test state is reverted. + * + * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live + * update is in progress, can't quiesce live update. + */ +int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +{ + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + if (!luo_session_quiesce()) + return -EBUSY; + + list_del(&ACCESS_PRIVATE(fh, list)); + module_put(fh->ops->owner); + luo_session_resume(); + + return 0; +} diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index 1292ac47eef8..c8973b543d1d 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -40,6 +40,28 @@ static inline int luo_ucmd_respond(struct luo_ucmd *ucmd, */ #define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__) +/* Mimics list_for_each_entry() but for private list head entries */ +#define luo_list_for_each_private(pos, head, member) \ + for (struct list_head *__iter = (head)->next; \ + __iter != (head) && \ + ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \ + __iter = __iter->next) + +/** + * struct luo_file_set - A set of files that belong to the same sessions. + * @files_list: An ordered list of files associated with this session, it is + * ordered by preservation time. + * @files: The physically contiguous memory block that holds the serialized + * state of files. + * @count: A counter tracking the number of files currently stored in the + * @files_list for this session. + */ +struct luo_file_set { + struct list_head files_list; + struct luo_file_ser *files; + long count; +}; + /** * struct luo_session - Represents an active or incoming Live Update session. * @name: A unique name for this session, used for identification and @@ -50,6 +72,7 @@ static inline int luo_ucmd_respond(struct luo_ucmd *ucmd, * previous kernel) sessions. * @retrieved: A boolean flag indicating whether this session has been * retrieved by a consumer in the new kernel. + * @file_set: A set of files that belong to this session. * @mutex: protects fields in the luo_session. */ struct luo_session { @@ -57,6 +80,7 @@ struct luo_session { struct luo_session_ser *ser; struct list_head list; bool retrieved; + struct luo_file_set file_set; struct mutex mutex; }; @@ -69,4 +93,18 @@ int luo_session_deserialize(void); bool luo_session_quiesce(void); void luo_session_resume(void); +int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd); +void luo_file_unpreserve_files(struct luo_file_set *file_set); +int luo_file_freeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +void luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +int luo_retrieve_file(struct luo_file_set *file_set, u64 token, + struct file **filep); +int luo_file_finish(struct luo_file_set *file_set); +int luo_file_deserialize(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +void luo_file_set_init(struct luo_file_set *file_set); +void luo_file_set_destroy(struct luo_file_set *file_set); + #endif /* _LINUX_LUO_INTERNAL_H */ From 16cec0d265219f14a7fcebcc43aeb69205adba56 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:37 -0500 Subject: [PATCH 115/139] liveupdate: luo_session: add ioctls for file preservation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introducing the userspace interface and internal logic required to manage the lifecycle of file descriptors within a session. Previously, a session was merely a container; this change makes it a functional management unit. The following capabilities are added: A new set of ioctl commands are added, which operate on the file descriptor returned by CREATE_SESSION. This allows userspace to: - LIVEUPDATE_SESSION_PRESERVE_FD: Add a file descriptor to a session to be preserved across the live update. - LIVEUPDATE_SESSION_RETRIEVE_FD: Retrieve a preserved file in the new kernel using its unique token. - LIVEUPDATE_SESSION_FINISH: finish session The session's .release handler is enhanced to be state-aware. When a session's file descriptor is closed, it correctly unpreserves the session based on its current state before freeing all associated file resources. Link: https://lkml.kernel.org/r/20251125165850.3389713-8-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/uapi/linux/liveupdate.h | 103 ++++++++++++++++++ kernel/liveupdate/luo_session.c | 187 +++++++++++++++++++++++++++++++- 2 files changed, 288 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h index 1183cf984b5f..30bc66ee9436 100644 --- a/include/uapi/linux/liveupdate.h +++ b/include/uapi/linux/liveupdate.h @@ -53,6 +53,14 @@ enum { LIVEUPDATE_CMD_RETRIEVE_SESSION = 0x01, }; +/* ioctl commands for session file descriptors */ +enum { + LIVEUPDATE_CMD_SESSION_BASE = 0x40, + LIVEUPDATE_CMD_SESSION_PRESERVE_FD = LIVEUPDATE_CMD_SESSION_BASE, + LIVEUPDATE_CMD_SESSION_RETRIEVE_FD = 0x41, + LIVEUPDATE_CMD_SESSION_FINISH = 0x42, +}; + /** * struct liveupdate_ioctl_create_session - ioctl(LIVEUPDATE_IOCTL_CREATE_SESSION) * @size: Input; sizeof(struct liveupdate_ioctl_create_session) @@ -110,4 +118,99 @@ struct liveupdate_ioctl_retrieve_session { #define LIVEUPDATE_IOCTL_RETRIEVE_SESSION \ _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_RETRIEVE_SESSION) +/* Session specific IOCTLs */ + +/** + * struct liveupdate_session_preserve_fd - ioctl(LIVEUPDATE_SESSION_PRESERVE_FD) + * @size: Input; sizeof(struct liveupdate_session_preserve_fd) + * @fd: Input; The user-space file descriptor to be preserved. + * @token: Input; An opaque, unique token for preserved resource. + * + * Holds parameters for preserving a file descriptor. + * + * User sets the @fd field identifying the file descriptor to preserve + * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type + * and its dependencies are supported for preservation. If validation passes, + * the kernel marks the FD internally and *initiates the process* of preparing + * its state for saving. The actual snapshotting of the state typically occurs + * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though + * some finalization might occur during freeze. + * On successful validation and initiation, the kernel uses the @token + * field with an opaque identifier representing the resource being preserved. + * This token confirms the FD is targeted for preservation and is required for + * the subsequent %LIVEUPDATE_SESSION_RETRIEVE_FD call after the live update. + * + * Return: 0 on success (validation passed, preservation initiated), negative + * error code on failure (e.g., unsupported FD type, dependency issue, + * validation failed). + */ +struct liveupdate_session_preserve_fd { + __u32 size; + __s32 fd; + __aligned_u64 token; +}; + +#define LIVEUPDATE_SESSION_PRESERVE_FD \ + _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_SESSION_PRESERVE_FD) + +/** + * struct liveupdate_session_retrieve_fd - ioctl(LIVEUPDATE_SESSION_RETRIEVE_FD) + * @size: Input; sizeof(struct liveupdate_session_retrieve_fd) + * @fd: Output; The new file descriptor representing the fully restored + * kernel resource. + * @token: Input; An opaque, token that was used to preserve the resource. + * + * Retrieve a previously preserved file descriptor. + * + * User sets the @token field to the value obtained from a successful + * %LIVEUPDATE_IOCTL_FD_PRESERVE call before the live update. On success, + * the kernel restores the state (saved during the PREPARE/FREEZE phases) + * associated with the token and populates the @fd field with a new file + * descriptor referencing the restored resource in the current (new) kernel. + * This operation must be performed *before* signaling completion via + * %LIVEUPDATE_IOCTL_FINISH. + * + * Return: 0 on success, negative error code on failure (e.g., invalid token). + */ +struct liveupdate_session_retrieve_fd { + __u32 size; + __s32 fd; + __aligned_u64 token; +}; + +#define LIVEUPDATE_SESSION_RETRIEVE_FD \ + _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_SESSION_RETRIEVE_FD) + +/** + * struct liveupdate_session_finish - ioctl(LIVEUPDATE_SESSION_FINISH) + * @size: Input; sizeof(struct liveupdate_session_finish) + * @reserved: Input; Must be zero. Reserved for future use. + * + * Signals the completion of the restoration process for a retrieved session. + * This is the final operation that should be performed on a session file + * descriptor after a live update. + * + * This ioctl must be called once all required file descriptors for the session + * have been successfully retrieved (using %LIVEUPDATE_SESSION_RETRIEVE_FD) and + * are fully restored from the userspace and kernel perspective. + * + * Upon success, the kernel releases its ownership of the preserved resources + * associated with this session. This allows internal resources to be freed, + * typically by decrementing reference counts on the underlying preserved + * objects. + * + * If this operation fails, the resources remain preserved in memory. Userspace + * may attempt to call finish again. The resources will otherwise be reset + * during the next live update cycle. + * + * Return: 0 on success, negative error code on failure. + */ +struct liveupdate_session_finish { + __u32 size; + __u32 reserved; +}; + +#define LIVEUPDATE_SESSION_FINISH \ + _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_SESSION_FINISH) + #endif /* _UAPI_LIVEUPDATE_H */ diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c index 3a031446d3a4..dbdbc3bd7929 100644 --- a/kernel/liveupdate/luo_session.c +++ b/kernel/liveupdate/luo_session.c @@ -125,6 +125,8 @@ static struct luo_session *luo_session_alloc(const char *name) return ERR_PTR(-ENOMEM); strscpy(session->name, name, sizeof(session->name)); + INIT_LIST_HEAD(&session->file_set.files_list); + luo_file_set_init(&session->file_set); INIT_LIST_HEAD(&session->list); mutex_init(&session->mutex); @@ -133,6 +135,7 @@ static struct luo_session *luo_session_alloc(const char *name) static void luo_session_free(struct luo_session *session) { + luo_file_set_destroy(&session->file_set); mutex_destroy(&session->mutex); kfree(session); } @@ -177,16 +180,46 @@ static void luo_session_remove(struct luo_session_header *sh, sh->count--; } +static int luo_session_finish_one(struct luo_session *session) +{ + guard(mutex)(&session->mutex); + return luo_file_finish(&session->file_set); +} + +static void luo_session_unfreeze_one(struct luo_session *session, + struct luo_session_ser *ser) +{ + guard(mutex)(&session->mutex); + luo_file_unfreeze(&session->file_set, &ser->file_set_ser); +} + +static int luo_session_freeze_one(struct luo_session *session, + struct luo_session_ser *ser) +{ + guard(mutex)(&session->mutex); + return luo_file_freeze(&session->file_set, &ser->file_set_ser); +} + static int luo_session_release(struct inode *inodep, struct file *filep) { struct luo_session *session = filep->private_data; struct luo_session_header *sh; /* If retrieved is set, it means this session is from incoming list */ - if (session->retrieved) + if (session->retrieved) { + int err = luo_session_finish_one(session); + + if (err) { + pr_warn("Unable to finish session [%s] on release\n", + session->name); + return err; + } sh = &luo_session_global.incoming; - else + } else { + scoped_guard(mutex, &session->mutex) + luo_file_unpreserve_files(&session->file_set); sh = &luo_session_global.outgoing; + } luo_session_remove(sh, session); luo_session_free(session); @@ -194,9 +227,140 @@ static int luo_session_release(struct inode *inodep, struct file *filep) return 0; } +static int luo_session_preserve_fd(struct luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_preserve_fd *argp = ucmd->cmd; + int err; + + guard(mutex)(&session->mutex); + err = luo_preserve_file(&session->file_set, argp->token, argp->fd); + if (err) + return err; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + pr_warn("The file was successfully preserved, but response to user failed\n"); + + return err; +} + +static int luo_session_retrieve_fd(struct luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_retrieve_fd *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + guard(mutex)(&session->mutex); + err = luo_retrieve_file(&session->file_set, argp->token, &file); + if (err < 0) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_session_finish(struct luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_finish *argp = ucmd->cmd; + int err = luo_session_finish_one(session); + + if (err) + return err; + + return luo_ucmd_respond(ucmd, sizeof(*argp)); +} + +union ucmd_buffer { + struct liveupdate_session_finish finish; + struct liveupdate_session_preserve_fd preserve; + struct liveupdate_session_retrieve_fd retrieve; +}; + +struct luo_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct luo_session *session, struct luo_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_SESSION_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, _last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } + +static const struct luo_ioctl_op luo_session_ioctl_ops[] = { + IOCTL_OP(LIVEUPDATE_SESSION_FINISH, luo_session_finish, + struct liveupdate_session_finish, reserved), + IOCTL_OP(LIVEUPDATE_SESSION_PRESERVE_FD, luo_session_preserve_fd, + struct liveupdate_session_preserve_fd, token), + IOCTL_OP(LIVEUPDATE_SESSION_RETRIEVE_FD, luo_session_retrieve_fd, + struct liveupdate_session_retrieve_fd, token), +}; + +static long luo_session_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + struct luo_session *session = filep->private_data; + const struct luo_ioctl_op *op; + struct luo_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int ret; + + nr = _IOC_NR(cmd); + if (nr < LIVEUPDATE_CMD_SESSION_BASE || (nr - LIVEUPDATE_CMD_SESSION_BASE) >= + ARRAY_SIZE(luo_session_ioctl_ops)) { + return -EINVAL; + } + + ucmd.ubuffer = (void __user *)arg; + ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (ret) + return ret; + + op = &luo_session_ioctl_ops[nr - LIVEUPDATE_CMD_SESSION_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (ret) + return ret; + + return op->execute(session, &ucmd); +} + static const struct file_operations luo_session_fops = { .owner = THIS_MODULE, .release = luo_session_release, + .unlocked_ioctl = luo_session_ioctl, }; /* Create a "struct file" for session */ @@ -392,6 +556,11 @@ int luo_session_deserialize(void) luo_session_free(session); return err; } + + scoped_guard(mutex, &session->mutex) { + luo_file_deserialize(&session->file_set, + &sh->ser[i].file_set_ser); + } } kho_restore_free(sh->header_ser); @@ -406,9 +575,14 @@ int luo_session_serialize(void) struct luo_session_header *sh = &luo_session_global.outgoing; struct luo_session *session; int i = 0; + int err; guard(rwsem_write)(&sh->rwsem); list_for_each_entry(session, &sh->list, list) { + err = luo_session_freeze_one(session, &sh->ser[i]); + if (err) + goto err_undo; + strscpy(sh->ser[i].name, session->name, sizeof(sh->ser[i].name)); i++; @@ -416,6 +590,15 @@ int luo_session_serialize(void) sh->header_ser->count = sh->count; return 0; + +err_undo: + list_for_each_entry_continue_reverse(session, &sh->list, list) { + i--; + luo_session_unfreeze_one(session, &sh->ser[i]); + memset(sh->ser[i].name, 0, sizeof(sh->ser[i].name)); + } + + return err; } /** From 906a3306245525f408129153d5dcb7ae7e6ded44 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:38 -0500 Subject: [PATCH 116/139] docs: add luo documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the documentation files for the Live Update Orchestrator Link: https://lkml.kernel.org/r/20251125165850.3389713-9-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- Documentation/core-api/index.rst | 1 + Documentation/core-api/liveupdate.rst | 54 ++++++++++++++++++++++ Documentation/userspace-api/index.rst | 1 + Documentation/userspace-api/liveupdate.rst | 20 ++++++++ 4 files changed, 76 insertions(+) create mode 100644 Documentation/core-api/liveupdate.rst create mode 100644 Documentation/userspace-api/liveupdate.rst diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 6cbdcbfa79c3..5eb0fbbbc323 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -138,6 +138,7 @@ Documents that don't fit elsewhere or which have yet to be categorized. :maxdepth: 1 librs + liveupdate netlink .. only:: subproject and html diff --git a/Documentation/core-api/liveupdate.rst b/Documentation/core-api/liveupdate.rst new file mode 100644 index 000000000000..cca1993008d8 --- /dev/null +++ b/Documentation/core-api/liveupdate.rst @@ -0,0 +1,54 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +Live Update Orchestrator +======================== +:Author: Pasha Tatashin + +.. kernel-doc:: kernel/liveupdate/luo_core.c + :doc: Live Update Orchestrator (LUO) + +LUO Sessions +============ +.. kernel-doc:: kernel/liveupdate/luo_session.c + :doc: LUO Sessions + +LUO Preserving File Descriptors +=============================== +.. kernel-doc:: kernel/liveupdate/luo_file.c + :doc: LUO File Descriptors + +Live Update Orchestrator ABI +============================ +.. kernel-doc:: include/linux/kho/abi/luo.h + :doc: Live Update Orchestrator ABI + +Public API +========== +.. kernel-doc:: include/linux/liveupdate.h + +.. kernel-doc:: include/linux/kho/abi/luo.h + :functions: + +.. kernel-doc:: kernel/liveupdate/luo_core.c + :export: + +.. kernel-doc:: kernel/liveupdate/luo_file.c + :export: + +Internal API +============ +.. kernel-doc:: kernel/liveupdate/luo_core.c + :internal: + +.. kernel-doc:: kernel/liveupdate/luo_session.c + :internal: + +.. kernel-doc:: kernel/liveupdate/luo_file.c + :internal: + +See Also +======== + +- :doc:`Live Update uAPI ` +- :doc:`/core-api/kho/concepts` diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index b8c73be4fb11..8a61ac4c1bf1 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -61,6 +61,7 @@ Everything else :maxdepth: 1 ELF + liveupdate netlink/index sysfs-platform_profile vduse diff --git a/Documentation/userspace-api/liveupdate.rst b/Documentation/userspace-api/liveupdate.rst new file mode 100644 index 000000000000..41c0473e4f16 --- /dev/null +++ b/Documentation/userspace-api/liveupdate.rst @@ -0,0 +1,20 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================ +Live Update uAPI +================ +:Author: Pasha Tatashin + +ioctl interface +=============== +.. kernel-doc:: kernel/liveupdate/luo_core.c + :doc: LUO ioctl Interface + +ioctl uAPI +=========== +.. kernel-doc:: include/uapi/linux/liveupdate.h + +See Also +======== + +- :doc:`Live Update Orchestrator ` From 7a5afa7ea8235e6b786b425b8905b2a4ffed8f65 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:39 -0500 Subject: [PATCH 117/139] MAINTAINERS: add liveupdate entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a MAINTAINERS file entry for the new Live Update Orchestrator introduced in previous patches. Link: https://lkml.kernel.org/r/20251125165850.3389713-10-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- MAINTAINERS | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index b46425e3b4d3..868d3d23fdea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14466,6 +14466,18 @@ F: kernel/module/livepatch.c F: samples/livepatch/ F: tools/testing/selftests/livepatch/ +LIVE UPDATE +M: Pasha Tatashin +M: Mike Rapoport +L: linux-kernel@vger.kernel.org +S: Maintained +F: Documentation/core-api/liveupdate.rst +F: Documentation/userspace-api/liveupdate.rst +F: include/linux/liveupdate.h +F: include/linux/liveupdate/ +F: include/uapi/linux/liveupdate.h +F: kernel/liveupdate/ + LLC (802.2) L: netdev@vger.kernel.org S: Odd fixes From 6ff1610ced5689c9af4c28a1798e04b74128a703 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 25 Nov 2025 11:58:40 -0500 Subject: [PATCH 118/139] mm: shmem: use SHMEM_F_* flags instead of VM_* flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit shmem_inode_info::flags can have the VM flags VM_NORESERVE and VM_LOCKED. These are used to suppress pre-accounting or to lock the pages in the inode respectively. Using the VM flags directly makes it difficult to add shmem-specific flags that are unrelated to VM behavior since one would need to find a VM flag not used by shmem and re-purpose it. Introduce SHMEM_F_NORESERVE and SHMEM_F_LOCKED which represent the same information, but their bits are independent of the VM flags. Callers can still pass VM_NORESERVE to shmem_get_inode(), but it gets transformed to the shmem-specific flag internally. No functional changes intended. Link: https://lkml.kernel.org/r/20251125165850.3389713-11-pasha.tatashin@soleen.com Signed-off-by: Pratyush Yadav Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 6 ++++++ mm/shmem.c | 28 +++++++++++++++------------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0e47465ef0fd..650874b400b5 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -10,6 +10,7 @@ #include #include #include +#include struct swap_iocb; @@ -19,6 +20,11 @@ struct swap_iocb; #define SHMEM_MAXQUOTAS 2 #endif +/* Suppress pre-accounting of the entire object size. */ +#define SHMEM_F_NORESERVE BIT(0) +/* Disallow swapping. */ +#define SHMEM_F_LOCKED BIT(1) + struct shmem_inode_info { spinlock_t lock; unsigned int seals; /* shmem seals */ diff --git a/mm/shmem.c b/mm/shmem.c index 58701d14dd96..1d5036dec08a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -175,20 +175,20 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { - return (flags & VM_NORESERVE) ? + return (flags & SHMEM_F_NORESERVE) ? 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { - if (!(flags & VM_NORESERVE)) + if (!(flags & SHMEM_F_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } static inline int shmem_reacct_size(unsigned long flags, loff_t oldsize, loff_t newsize) { - if (!(flags & VM_NORESERVE)) { + if (!(flags & SHMEM_F_NORESERVE)) { if (VM_ACCT(newsize) > VM_ACCT(oldsize)) return security_vm_enough_memory_mm(current->mm, VM_ACCT(newsize) - VM_ACCT(oldsize)); @@ -206,7 +206,7 @@ static inline int shmem_reacct_size(unsigned long flags, */ static inline int shmem_acct_blocks(unsigned long flags, long pages) { - if (!(flags & VM_NORESERVE)) + if (!(flags & SHMEM_F_NORESERVE)) return 0; return security_vm_enough_memory_mm(current->mm, @@ -215,7 +215,7 @@ static inline int shmem_acct_blocks(unsigned long flags, long pages) static inline void shmem_unacct_blocks(unsigned long flags, long pages) { - if (flags & VM_NORESERVE) + if (flags & SHMEM_F_NORESERVE) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } @@ -1551,7 +1551,7 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, int nr_pages; bool split = false; - if ((info->flags & VM_LOCKED) || sbinfo->noswap) + if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) @@ -2910,15 +2910,15 @@ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) * ipc_lock_object() when called from shmctl_do_lock(), * no serialization needed when called from shm_destroy(). */ - if (lock && !(info->flags & VM_LOCKED)) { + if (lock && !(info->flags & SHMEM_F_LOCKED)) { if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; - info->flags |= VM_LOCKED; + info->flags |= SHMEM_F_LOCKED; mapping_set_unevictable(file->f_mapping); } - if (!lock && (info->flags & VM_LOCKED) && ucounts) { + if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) { user_shm_unlock(inode->i_size, ucounts); - info->flags &= ~VM_LOCKED; + info->flags &= ~SHMEM_F_LOCKED; mapping_clear_unevictable(file->f_mapping); } retval = 0; @@ -3062,7 +3062,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; - info->flags = flags & VM_NORESERVE; + info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; @@ -5804,8 +5804,10 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, /* common code */ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, - loff_t size, unsigned long flags, unsigned int i_flags) + loff_t size, unsigned long vm_flags, + unsigned int i_flags) { + unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; @@ -5822,7 +5824,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, return ERR_PTR(-ENOMEM); inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, - S_IFREG | S_IRWXUGO, 0, flags); + S_IFREG | S_IRWXUGO, 0, vm_flags); if (IS_ERR(inode)) { shmem_unacct_size(flags, size); return ERR_CAST(inode); From e165e2a2577b048664be09c074a10304290055f0 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 25 Nov 2025 11:58:41 -0500 Subject: [PATCH 119/139] mm: shmem: allow freezing inode mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare a shmem inode for live update, its index -> folio mappings must be serialized. Once the mappings are serialized, they cannot change since it would cause the serialized data to become inconsistent. This can be done by pinning the folios to avoid migration, and by making sure no folios can be added to or removed from the inode. While mechanisms to pin folios already exist, the only way to stop folios being added or removed are the grow and shrink file seals. But file seals come with their own semantics, one of which is that they can't be removed. This doesn't work with liveupdate since it can be cancelled or error out, which would need the seals to be removed and the file's normal functionality to be restored. Introduce SHMEM_F_MAPPING_FROZEN to indicate this instead. It is internal to shmem and is not directly exposed to userspace. It functions similar to F_SEAL_GROW | F_SEAL_SHRINK, but additionally disallows hole punching, and can be removed. Link: https://lkml.kernel.org/r/20251125165850.3389713-12-pasha.tatashin@soleen.com Signed-off-by: Pratyush Yadav Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 17 +++++++++++++++++ mm/shmem.c | 11 +++++++++++ 2 files changed, 28 insertions(+) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 650874b400b5..d34a64eafe60 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -24,6 +24,14 @@ struct swap_iocb; #define SHMEM_F_NORESERVE BIT(0) /* Disallow swapping. */ #define SHMEM_F_LOCKED BIT(1) +/* + * Disallow growing, shrinking, or hole punching in the inode. Combined with + * folio pinning, makes sure the inode's mapping stays fixed. + * + * In some ways similar to F_SEAL_GROW | F_SEAL_SHRINK, but can be removed and + * isn't directly visible to userspace. + */ +#define SHMEM_F_MAPPING_FROZEN BIT(2) struct shmem_inode_info { spinlock_t lock; @@ -186,6 +194,15 @@ static inline bool shmem_file(struct file *file) return shmem_mapping(file->f_mapping); } +/* Must be called with inode lock taken exclusive. */ +static inline void shmem_freeze(struct inode *inode, bool freeze) +{ + if (freeze) + SHMEM_I(inode)->flags |= SHMEM_F_MAPPING_FROZEN; + else + SHMEM_I(inode)->flags &= ~SHMEM_F_MAPPING_FROZEN; +} + /* * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages * beyond i_size's notion of EOF, which fallocate has committed to reserving: diff --git a/mm/shmem.c b/mm/shmem.c index 1d5036dec08a..786573479360 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1297,6 +1297,8 @@ static int shmem_setattr(struct mnt_idmap *idmap, return -EPERM; if (newsize != oldsize) { + if (info->flags & SHMEM_F_MAPPING_FROZEN) + return -EPERM; error = shmem_reacct_size(SHMEM_I(inode)->flags, oldsize, newsize); if (error) @@ -3289,6 +3291,10 @@ shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, return -EPERM; } + if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) && + pos + len > inode->i_size)) + return -EPERM; + ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); if (ret) return ret; @@ -3662,6 +3668,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, inode_lock(inode); + if (info->flags & SHMEM_F_MAPPING_FROZEN) { + error = -EPERM; + goto out; + } + if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); From ed6f45f81bf99f3cef380ba0de8cad85007b5307 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 25 Nov 2025 11:58:42 -0500 Subject: [PATCH 120/139] mm: shmem: export some functions to internal.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit shmem_inode_acct_blocks(), shmem_recalc_inode(), and shmem_add_to_page_cache() are used by shmem_alloc_and_add_folio(). This functionality will be used by memfd LUO integration. Link: https://lkml.kernel.org/r/20251125165850.3389713-13-pasha.tatashin@soleen.com Signed-off-by: Pratyush Yadav Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- mm/internal.h | 6 ++++++ mm/shmem.c | 10 +++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 1561fc2ff5b8..4ba155524f80 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1562,6 +1562,12 @@ void __meminit __init_page_from_nid(unsigned long pfn, int nid); unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); +int shmem_add_to_page_cache(struct folio *folio, + struct address_space *mapping, + pgoff_t index, void *expected, gfp_t gfp); +int shmem_inode_acct_blocks(struct inode *inode, long pages); +bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped); + #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) diff --git a/mm/shmem.c b/mm/shmem.c index 786573479360..679721e48a87 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -219,7 +219,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } -static int shmem_inode_acct_blocks(struct inode *inode, long pages) +int shmem_inode_acct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); @@ -435,7 +435,7 @@ static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) * * Return: true if swapped was incremented from 0, for shmem_writeout(). */ -static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) +bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); bool first_swapped = false; @@ -861,9 +861,9 @@ static void shmem_update_stats(struct folio *folio, int nr_pages) /* * Somewhat like filemap_add_folio, but error if expected item has gone. */ -static int shmem_add_to_page_cache(struct folio *folio, - struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp) +int shmem_add_to_page_cache(struct folio *folio, + struct address_space *mapping, + pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); unsigned long nr = folio_nr_pages(folio); From 8def18633e8df54a05cf7d323d0df24c21b320d6 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 25 Nov 2025 11:58:43 -0500 Subject: [PATCH 121/139] liveupdate: luo_file: add private argument to store runtime state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently file handlers only get the serialized_data field to store their state. This field has a pointer to the serialized state of the file, and it becomes a part of LUO file's serialized state. File handlers can also need some runtime state to track information that shouldn't make it in the serialized data. One such example is a vmalloc pointer. While kho_preserve_vmalloc() preserves the memory backing a vmalloc allocation, it does not store the original vmap pointer, since that has no use being passed to the next kernel. The pointer is needed to free the memory in case the file is unpreserved. Provide a private field in struct luo_file and pass it to all the callbacks. The field's can be set by preserve, and must be freed by unpreserve. Link: https://lkml.kernel.org/r/20251125165850.3389713-14-pasha.tatashin@soleen.com Signed-off-by: Pratyush Yadav Co-developed-by: Pasha Tatashin Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 5 +++++ kernel/liveupdate/luo_file.c | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index 122ad8f16ff9..a7f6ee5b6771 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -27,6 +27,10 @@ struct file; * this to the file being operated on. * @serialized_data: The opaque u64 handle, preserve/prepare/freeze may update * this field. + * @private_data: Private data for the file used to hold runtime state that + * is not preserved. Set by the handler's .preserve() + * callback, and must be freed in the handler's + * .unpreserve() callback. * * This structure bundles all parameters for the file operation callbacks. * The 'data' and 'file' fields are used for both input and output. @@ -36,6 +40,7 @@ struct liveupdate_file_op_args { bool retrieved; struct file *file; u64 serialized_data; + void *private_data; }; /** diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index e9727cb1275a..ddff87917b21 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -129,6 +129,10 @@ static LIST_HEAD(luo_file_handler_list); * This handle is passed back to the handler's .freeze(), * .retrieve(), and .finish() callbacks, allowing it to track * and update its serialized state across phases. + * @private_data: Pointer to the private data for the file used to hold runtime + * state that is not preserved. Set by the handler's .preserve() + * callback, and must be freed in the handler's .unpreserve() + * callback. * @retrieved: A flag indicating whether a user/kernel in the new kernel has * successfully called retrieve() on this file. This prevents * multiple retrieval attempts. @@ -155,6 +159,7 @@ struct luo_file { struct liveupdate_file_handler *fh; struct file *file; u64 serialized_data; + void *private_data; bool retrieved; struct mutex mutex; struct list_head list; @@ -298,6 +303,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) goto err_kfree; luo_file->serialized_data = args.serialized_data; + luo_file->private_data = args.private_data; list_add_tail(&luo_file->list, &file_set->files_list); file_set->count++; @@ -344,6 +350,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set) args.handler = luo_file->fh; args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; luo_file->fh->ops->unpreserve(&args); list_del(&luo_file->list); @@ -370,6 +377,7 @@ static int luo_file_freeze_one(struct luo_file_set *file_set, args.handler = luo_file->fh; args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; err = luo_file->fh->ops->freeze(&args); if (!err) @@ -390,6 +398,7 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set, args.handler = luo_file->fh; args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; luo_file->fh->ops->unfreeze(&args); } From b3749f174d686627f702234e64bad976dc432dbc Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 25 Nov 2025 11:58:44 -0500 Subject: [PATCH 122/139] mm: memfd_luo: allow preserving memfd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ability to preserve a memfd allows userspace to use KHO and LUO to transfer its memory contents to the next kernel. This is useful in many ways. For one, it can be used with IOMMUFD as the backing store for IOMMU page tables. Preserving IOMMUFD is essential for performing a hypervisor live update with passthrough devices. memfd support provides the first building block for making that possible. For another, applications with a large amount of memory that takes time to reconstruct, reboots to consume kernel upgrades can be very expensive. memfd with LUO gives those applications reboot-persistent memory that they can use to quickly save and reconstruct that state. While memfd is backed by either hugetlbfs or shmem, currently only support on shmem is added. To be more precise, support for anonymous shmem files is added. The handover to the next kernel is not transparent. All the properties of the file are not preserved; only its memory contents, position, and size. The recreated file gets the UID and GID of the task doing the restore, and the task's cgroup gets charged with the memory. Once preserved, the file cannot grow or shrink, and all its pages are pinned to avoid migrations and swapping. The file can still be read from or written to. Use vmalloc to get the buffer to hold the folios, and preserve it using kho_preserve_vmalloc(). This doesn't have the size limit. Link: https://lkml.kernel.org/r/20251125165850.3389713-15-pasha.tatashin@soleen.com Signed-off-by: Pratyush Yadav Co-developed-by: Pasha Tatashin Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- MAINTAINERS | 2 + include/linux/kho/abi/memfd.h | 77 +++++ mm/Makefile | 1 + mm/memfd_luo.c | 516 ++++++++++++++++++++++++++++++++++ 4 files changed, 596 insertions(+) create mode 100644 include/linux/kho/abi/memfd.h create mode 100644 mm/memfd_luo.c diff --git a/MAINTAINERS b/MAINTAINERS index 868d3d23fdea..425c46bba764 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14469,6 +14469,7 @@ F: tools/testing/selftests/livepatch/ LIVE UPDATE M: Pasha Tatashin M: Mike Rapoport +R: Pratyush Yadav L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/core-api/liveupdate.rst @@ -14477,6 +14478,7 @@ F: include/linux/liveupdate.h F: include/linux/liveupdate/ F: include/uapi/linux/liveupdate.h F: kernel/liveupdate/ +F: mm/memfd_luo.c LLC (802.2) L: netdev@vger.kernel.org diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h new file mode 100644 index 000000000000..da7d063474a1 --- /dev/null +++ b/include/linux/kho/abi/memfd.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + * + * Copyright (C) 2025 Amazon.com Inc. or its affiliates. + * Pratyush Yadav + */ + +#ifndef _LINUX_KHO_ABI_MEMFD_H +#define _LINUX_KHO_ABI_MEMFD_H + +#include +#include + +/** + * DOC: memfd Live Update ABI + * + * This header defines the ABI for preserving the state of a memfd across a + * kexec reboot using the LUO. + * + * The state is serialized into a packed structure `struct memfd_luo_ser` + * which is handed over to the next kernel via the KHO mechanism. + * + * This interface is a contract. Any modification to the structure layout + * constitutes a breaking change. Such changes require incrementing the + * version number in the MEMFD_LUO_FH_COMPATIBLE string. + */ + +/** + * MEMFD_LUO_FOLIO_DIRTY - The folio is dirty. + * + * This flag indicates the folio contains data from user. A non-dirty folio is + * one that was allocated (say using fallocate(2)) but not written to. + */ +#define MEMFD_LUO_FOLIO_DIRTY BIT(0) + +/** + * MEMFD_LUO_FOLIO_UPTODATE - The folio is up-to-date. + * + * An up-to-date folio has been zeroed out. shmem zeroes out folios on first + * use. This flag tracks which folios need zeroing. + */ +#define MEMFD_LUO_FOLIO_UPTODATE BIT(1) + +/** + * struct memfd_luo_folio_ser - Serialized state of a single folio. + * @pfn: The page frame number of the folio. + * @flags: Flags to describe the state of the folio. + * @index: The page offset (pgoff_t) of the folio within the original file. + */ +struct memfd_luo_folio_ser { + u64 pfn:52; + u64 flags:12; + u64 index; +} __packed; + +/** + * struct memfd_luo_ser - Main serialization structure for a memfd. + * @pos: The file's current position (f_pos). + * @size: The total size of the file in bytes (i_size). + * @nr_folios: Number of folios in the folios array. + * @folios: KHO vmalloc descriptor pointing to the array of + * struct memfd_luo_folio_ser. + */ +struct memfd_luo_ser { + u64 pos; + u64 size; + u64 nr_folios; + struct kho_vmalloc folios; +} __packed; + +/* The compatibility string for memfd file handler */ +#define MEMFD_LUO_FH_COMPATIBLE "memfd-v1" + +#endif /* _LINUX_KHO_ABI_MEMFD_H */ diff --git a/mm/Makefile b/mm/Makefile index 21abb3353550..7738ec416f00 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_NUMA) += memory-tiers.o obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o +obj-$(CONFIG_LIVEUPDATE) += memfd_luo.o obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o ifdef CONFIG_SWAP diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c new file mode 100644 index 000000000000..4f6ba63b4310 --- /dev/null +++ b/mm/memfd_luo.c @@ -0,0 +1,516 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + * + * Copyright (C) 2025 Amazon.com Inc. or its affiliates. + * Pratyush Yadav + */ + +/** + * DOC: Memfd Preservation via LUO + * + * Overview + * ======== + * + * Memory file descriptors (memfd) can be preserved over a kexec using the Live + * Update Orchestrator (LUO) file preservation. This allows userspace to + * transfer its memory contents to the next kernel after a kexec. + * + * The preservation is not intended to be transparent. Only select properties of + * the file are preserved. All others are reset to default. The preserved + * properties are described below. + * + * .. note:: + * The LUO API is not stabilized yet, so the preserved properties of a memfd + * are also not stable and are subject to backwards incompatible changes. + * + * .. note:: + * Currently a memfd backed by Hugetlb is not supported. Memfds created + * with ``MFD_HUGETLB`` will be rejected. + * + * Preserved Properties + * ==================== + * + * The following properties of the memfd are preserved across kexec: + * + * File Contents + * All data stored in the file is preserved. + * + * File Size + * The size of the file is preserved. Holes in the file are filled by + * allocating pages for them during preservation. + * + * File Position + * The current file position is preserved, allowing applications to continue + * reading/writing from their last position. + * + * File Status Flags + * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property + * is maintained. + * + * Non-Preserved Properties + * ======================== + * + * All properties which are not preserved must be assumed to be reset to + * default. This section describes some of those properties which may be more of + * note. + * + * ``FD_CLOEXEC`` flag + * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the + * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set + * again after restore via ``fcntl()``. + * + * Seals + * File seals are not preserved. The file is unsealed on restore and if + * needed, must be sealed again via ``fcntl()``. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int memfd_luo_preserve_folios(struct file *file, + struct kho_vmalloc *kho_vmalloc, + struct memfd_luo_folio_ser **out_folios_ser, + u64 *nr_foliosp) +{ + struct inode *inode = file_inode(file); + struct memfd_luo_folio_ser *folios_ser; + unsigned int max_folios; + long i, size, nr_pinned; + struct folio **folios; + int err = -EINVAL; + pgoff_t offset; + u64 nr_folios; + + size = i_size_read(inode); + /* + * If the file has zero size, then the folios and nr_folios properties + * are not set. + */ + if (!size) { + *nr_foliosp = 0; + *out_folios_ser = NULL; + memset(kho_vmalloc, 0, sizeof(*kho_vmalloc)); + return 0; + } + + /* + * Guess the number of folios based on inode size. Real number might end + * up being smaller if there are higher order folios. + */ + max_folios = PAGE_ALIGN(size) / PAGE_SIZE; + folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL); + if (!folios) + return -ENOMEM; + + /* + * Pin the folios so they don't move around behind our back. This also + * ensures none of the folios are in CMA -- which ensures they don't + * fall in KHO scratch memory. It also moves swapped out folios back to + * memory. + * + * A side effect of doing this is that it allocates a folio for all + * indices in the file. This might waste memory on sparse memfds. If + * that is really a problem in the future, we can have a + * memfd_pin_folios() variant that does not allocate a page on empty + * slots. + */ + nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios, + &offset); + if (nr_pinned < 0) { + err = nr_pinned; + pr_err("failed to pin folios: %d\n", err); + goto err_free_folios; + } + nr_folios = nr_pinned; + + folios_ser = vcalloc(nr_folios, sizeof(*folios_ser)); + if (!folios_ser) { + err = -ENOMEM; + goto err_unpin; + } + + for (i = 0; i < nr_folios; i++) { + struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; + struct folio *folio = folios[i]; + unsigned int flags = 0; + + err = kho_preserve_folio(folio); + if (err) + goto err_unpreserve; + + if (folio_test_dirty(folio)) + flags |= MEMFD_LUO_FOLIO_DIRTY; + if (folio_test_uptodate(folio)) + flags |= MEMFD_LUO_FOLIO_UPTODATE; + + pfolio->pfn = folio_pfn(folio); + pfolio->flags = flags; + pfolio->index = folio->index; + } + + err = kho_preserve_vmalloc(folios_ser, kho_vmalloc); + if (err) + goto err_unpreserve; + + kvfree(folios); + *nr_foliosp = nr_folios; + *out_folios_ser = folios_ser; + + /* + * Note: folios_ser is purposely not freed here. It is preserved + * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer + * that is passed via private_data. + */ + return 0; + +err_unpreserve: + for (i = i - 1; i >= 0; i--) + kho_unpreserve_folio(folios[i]); + vfree(folios_ser); +err_unpin: + unpin_folios(folios, nr_folios); +err_free_folios: + kvfree(folios); + + return err; +} + +static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc, + struct memfd_luo_folio_ser *folios_ser, + u64 nr_folios) +{ + long i; + + if (!nr_folios) + return; + + kho_unpreserve_vmalloc(kho_vmalloc); + + for (i = 0; i < nr_folios; i++) { + const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; + struct folio *folio; + + if (!pfolio->pfn) + continue; + + folio = pfn_folio(pfolio->pfn); + + kho_unpreserve_folio(folio); + unpin_folio(folio); + } + + vfree(folios_ser); +} + +static int memfd_luo_preserve(struct liveupdate_file_op_args *args) +{ + struct inode *inode = file_inode(args->file); + struct memfd_luo_folio_ser *folios_ser; + struct memfd_luo_ser *ser; + u64 nr_folios; + int err = 0; + + inode_lock(inode); + shmem_freeze(inode, true); + + /* Allocate the main serialization structure in preserved memory */ + ser = kho_alloc_preserve(sizeof(*ser)); + if (IS_ERR(ser)) { + err = PTR_ERR(ser); + goto err_unlock; + } + + ser->pos = args->file->f_pos; + ser->size = i_size_read(inode); + + err = memfd_luo_preserve_folios(args->file, &ser->folios, + &folios_ser, &nr_folios); + if (err) + goto err_free_ser; + + ser->nr_folios = nr_folios; + inode_unlock(inode); + + args->private_data = folios_ser; + args->serialized_data = virt_to_phys(ser); + + return 0; + +err_free_ser: + kho_unpreserve_free(ser); +err_unlock: + shmem_freeze(inode, false); + inode_unlock(inode); + return err; +} + +static int memfd_luo_freeze(struct liveupdate_file_op_args *args) +{ + struct memfd_luo_ser *ser; + + if (WARN_ON_ONCE(!args->serialized_data)) + return -EINVAL; + + ser = phys_to_virt(args->serialized_data); + + /* + * The pos might have changed since prepare. Everything else stays the + * same. + */ + ser->pos = args->file->f_pos; + + return 0; +} + +static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args) +{ + struct inode *inode = file_inode(args->file); + struct memfd_luo_ser *ser; + + if (WARN_ON_ONCE(!args->serialized_data)) + return; + + inode_lock(inode); + shmem_freeze(inode, false); + + ser = phys_to_virt(args->serialized_data); + + memfd_luo_unpreserve_folios(&ser->folios, args->private_data, + ser->nr_folios); + + kho_unpreserve_free(ser); + inode_unlock(inode); +} + +static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser, + u64 nr_folios) +{ + u64 i; + + for (i = 0; i < nr_folios; i++) { + const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; + struct folio *folio; + phys_addr_t phys; + + if (!pfolio->pfn) + continue; + + phys = PFN_PHYS(pfolio->pfn); + folio = kho_restore_folio(phys); + if (!folio) { + pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n", + phys); + continue; + } + + folio_put(folio); + } +} + +static void memfd_luo_finish(struct liveupdate_file_op_args *args) +{ + struct memfd_luo_folio_ser *folios_ser; + struct memfd_luo_ser *ser; + + if (args->retrieved) + return; + + ser = phys_to_virt(args->serialized_data); + if (!ser) + return; + + if (ser->nr_folios) { + folios_ser = kho_restore_vmalloc(&ser->folios); + if (!folios_ser) + goto out; + + memfd_luo_discard_folios(folios_ser, ser->nr_folios); + vfree(folios_ser); + } + +out: + kho_restore_free(ser); +} + +static int memfd_luo_retrieve_folios(struct file *file, + struct memfd_luo_folio_ser *folios_ser, + u64 nr_folios) +{ + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + int err = -EIO; + long i; + + for (i = 0; i < nr_folios; i++) { + const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; + phys_addr_t phys; + u64 index; + int flags; + + if (!pfolio->pfn) + continue; + + phys = PFN_PHYS(pfolio->pfn); + folio = kho_restore_folio(phys); + if (!folio) { + pr_err("Unable to restore folio at physical address: %llx\n", + phys); + goto put_folios; + } + index = pfolio->index; + flags = pfolio->flags; + + /* Set up the folio for insertion. */ + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + + err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping)); + if (err) { + pr_err("shmem: failed to charge folio index %ld: %d\n", + i, err); + goto unlock_folio; + } + + err = shmem_add_to_page_cache(folio, mapping, index, NULL, + mapping_gfp_mask(mapping)); + if (err) { + pr_err("shmem: failed to add to page cache folio index %ld: %d\n", + i, err); + goto unlock_folio; + } + + if (flags & MEMFD_LUO_FOLIO_UPTODATE) + folio_mark_uptodate(folio); + if (flags & MEMFD_LUO_FOLIO_DIRTY) + folio_mark_dirty(folio); + + err = shmem_inode_acct_blocks(inode, 1); + if (err) { + pr_err("shmem: failed to account folio index %ld: %d\n", + i, err); + goto unlock_folio; + } + + shmem_recalc_inode(inode, 1, 0); + folio_add_lru(folio); + folio_unlock(folio); + folio_put(folio); + } + + return 0; + +unlock_folio: + folio_unlock(folio); + folio_put(folio); +put_folios: + /* + * Note: don't free the folios already added to the file. They will be + * freed when the file is freed. Free the ones not added yet here. + */ + for (long j = i + 1; j < nr_folios; j++) { + const struct memfd_luo_folio_ser *pfolio = &folios_ser[j]; + + folio = kho_restore_folio(pfolio->pfn); + if (folio) + folio_put(folio); + } + + return err; +} + +static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) +{ + struct memfd_luo_folio_ser *folios_ser; + struct memfd_luo_ser *ser; + struct file *file; + int err; + + ser = phys_to_virt(args->serialized_data); + if (!ser) + return -EINVAL; + + file = shmem_file_setup("", 0, VM_NORESERVE); + + if (IS_ERR(file)) { + pr_err("failed to setup file: %pe\n", file); + return PTR_ERR(file); + } + + vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); + file->f_inode->i_size = ser->size; + + if (ser->nr_folios) { + folios_ser = kho_restore_vmalloc(&ser->folios); + if (!folios_ser) { + err = -EINVAL; + goto put_file; + } + + err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios); + vfree(folios_ser); + if (err) + goto put_file; + } + + args->file = file; + kho_restore_free(ser); + + return 0; + +put_file: + fput(file); + + return err; +} + +static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler, + struct file *file) +{ + struct inode *inode = file_inode(file); + + return shmem_file(file) && !inode->i_nlink; +} + +static const struct liveupdate_file_ops memfd_luo_file_ops = { + .freeze = memfd_luo_freeze, + .finish = memfd_luo_finish, + .retrieve = memfd_luo_retrieve, + .preserve = memfd_luo_preserve, + .unpreserve = memfd_luo_unpreserve, + .can_preserve = memfd_luo_can_preserve, + .owner = THIS_MODULE, +}; + +static struct liveupdate_file_handler memfd_luo_handler = { + .ops = &memfd_luo_file_ops, + .compatible = MEMFD_LUO_FH_COMPATIBLE, +}; + +static int __init memfd_luo_init(void) +{ + int err = liveupdate_register_file_handler(&memfd_luo_handler); + + if (err && err != -EOPNOTSUPP) { + pr_err("Could not register luo filesystem handler: %pe\n", + ERR_PTR(err)); + + return err; + } + + return 0; +} +late_initcall(memfd_luo_init); From 15fc11bb2cb649f2207bbb7140d5430d3937a8d5 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 25 Nov 2025 11:58:45 -0500 Subject: [PATCH 123/139] docs: add documentation for memfd preservation via LUO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the documentation under the "Preserving file descriptors" section of LUO's documentation. Link: https://lkml.kernel.org/r/20251125165850.3389713-16-pasha.tatashin@soleen.com Signed-off-by: Pratyush Yadav Co-developed-by: Pasha Tatashin Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- Documentation/core-api/liveupdate.rst | 7 +++++++ Documentation/mm/index.rst | 1 + Documentation/mm/memfd_preservation.rst | 23 +++++++++++++++++++++++ MAINTAINERS | 1 + 4 files changed, 32 insertions(+) create mode 100644 Documentation/mm/memfd_preservation.rst diff --git a/Documentation/core-api/liveupdate.rst b/Documentation/core-api/liveupdate.rst index cca1993008d8..7960eb15a81f 100644 --- a/Documentation/core-api/liveupdate.rst +++ b/Documentation/core-api/liveupdate.rst @@ -23,6 +23,13 @@ Live Update Orchestrator ABI .. kernel-doc:: include/linux/kho/abi/luo.h :doc: Live Update Orchestrator ABI +The following types of file descriptors can be preserved + +.. toctree:: + :maxdepth: 1 + + ../mm/memfd_preservation + Public API ========== .. kernel-doc:: include/linux/liveupdate.h diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst index ba6a8872849b..7aa2a8886908 100644 --- a/Documentation/mm/index.rst +++ b/Documentation/mm/index.rst @@ -48,6 +48,7 @@ documentation, or deleted if it has served its purpose. hugetlbfs_reserv ksm memory-model + memfd_preservation mmu_notifier multigen_lru numa diff --git a/Documentation/mm/memfd_preservation.rst b/Documentation/mm/memfd_preservation.rst new file mode 100644 index 000000000000..66e0fb6d5ef0 --- /dev/null +++ b/Documentation/mm/memfd_preservation.rst @@ -0,0 +1,23 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +========================== +Memfd Preservation via LUO +========================== + +.. kernel-doc:: mm/memfd_luo.c + :doc: Memfd Preservation via LUO + +Memfd Preservation ABI +====================== + +.. kernel-doc:: include/linux/kho/abi/memfd.h + :doc: DOC: memfd Live Update ABI + +.. kernel-doc:: include/linux/kho/abi/memfd.h + :internal: + +See Also +======== + +- :doc:`/core-api/liveupdate` +- :doc:`/core-api/kho/concepts` diff --git a/MAINTAINERS b/MAINTAINERS index 425c46bba764..cabbf30d50e1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14473,6 +14473,7 @@ R: Pratyush Yadav L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/core-api/liveupdate.rst +F: Documentation/mm/memfd_preservation.rst F: Documentation/userspace-api/liveupdate.rst F: include/linux/liveupdate.h F: include/linux/liveupdate/ From 80bab43f6f235664fff2d3518b3901ba9c4ac5a3 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:46 -0500 Subject: [PATCH 124/139] selftests/liveupdate: add userspace API selftests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a selftest suite for LUO. These tests validate the core userspace-facing API provided by the /dev/liveupdate device and its associated ioctls. The suite covers fundamental device behavior, session management, and the file preservation mechanism using memfd as a test case. This provides regression testing for the LUO uAPI. The following functionality is verified: Device Access: Basic open and close operations on /dev/liveupdate. Enforcement of exclusive device access (verifying EBUSY on a second open). Session Management: Successful creation of sessions with unique names. Failure to create sessions with duplicate names. File Preservation: Preserving a single memfd and verifying its content remains intact post-preservation. Preserving multiple memfds within a single session, each with unique data. A complex scenario involving multiple sessions, each containing a mix of empty and data-filled memfds. Note: This test suite is limited to verifying the pre-kexec functionality of LUO (e.g., session creation, file preservation). The post-kexec restoration of resources is not covered, as the kselftest framework does not currently support orchestrating a reboot and continuing execution in the new kernel. Link: https://lkml.kernel.org/r/20251125165850.3389713-17-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + tools/testing/selftests/liveupdate/.gitignore | 9 + tools/testing/selftests/liveupdate/Makefile | 27 ++ tools/testing/selftests/liveupdate/config | 11 + .../testing/selftests/liveupdate/liveupdate.c | 348 ++++++++++++++++++ 6 files changed, 397 insertions(+) create mode 100644 tools/testing/selftests/liveupdate/.gitignore create mode 100644 tools/testing/selftests/liveupdate/Makefile create mode 100644 tools/testing/selftests/liveupdate/config create mode 100644 tools/testing/selftests/liveupdate/liveupdate.c diff --git a/MAINTAINERS b/MAINTAINERS index cabbf30d50e1..83bac6c48c98 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14480,6 +14480,7 @@ F: include/linux/liveupdate/ F: include/uapi/linux/liveupdate.h F: kernel/liveupdate/ F: mm/memfd_luo.c +F: tools/testing/selftests/liveupdate/ LLC (802.2) L: netdev@vger.kernel.org diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index c46ebdb9b8ef..56e44a98d6a5 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -54,6 +54,7 @@ TARGETS += kvm TARGETS += landlock TARGETS += lib TARGETS += livepatch +TARGETS += liveupdate TARGETS += lkdtm TARGETS += lsm TARGETS += membarrier diff --git a/tools/testing/selftests/liveupdate/.gitignore b/tools/testing/selftests/liveupdate/.gitignore new file mode 100644 index 000000000000..661827083ab6 --- /dev/null +++ b/tools/testing/selftests/liveupdate/.gitignore @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +* +!/**/ +!*.c +!*.h +!*.sh +!.gitignore +!config +!Makefile diff --git a/tools/testing/selftests/liveupdate/Makefile b/tools/testing/selftests/liveupdate/Makefile new file mode 100644 index 000000000000..620cb4ce85af --- /dev/null +++ b/tools/testing/selftests/liveupdate/Makefile @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: GPL-2.0-only + +TEST_GEN_PROGS += liveupdate + +include ../lib.mk + +CFLAGS += $(KHDR_INCLUDES) +CFLAGS += -Wall -O2 -Wno-unused-function +CFLAGS += -MD + +LIB_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIB_C)) +TEST_O := $(patsubst %, %.o, $(TEST_GEN_PROGS)) +TEST_O += $(patsubst %, %.o, $(TEST_GEN_PROGS_EXTENDED)) + +TEST_DEP_FILES := $(patsubst %.o, %.d, $(LIB_O)) +TEST_DEP_FILES += $(patsubst %.o, %.d, $(TEST_O)) +-include $(TEST_DEP_FILES) + +$(LIB_O): $(OUTPUT)/%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + +$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/%: %.o $(LIB_O) + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIB_O) $(LDLIBS) -o $@ + +EXTRA_CLEAN += $(LIB_O) +EXTRA_CLEAN += $(TEST_O) +EXTRA_CLEAN += $(TEST_DEP_FILES) diff --git a/tools/testing/selftests/liveupdate/config b/tools/testing/selftests/liveupdate/config new file mode 100644 index 000000000000..91d03f9a6a39 --- /dev/null +++ b/tools/testing/selftests/liveupdate/config @@ -0,0 +1,11 @@ +CONFIG_BLK_DEV_INITRD=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_HANDOVER=y +CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT=y +CONFIG_KEXEC_HANDOVER_DEBUGFS=y +CONFIG_KEXEC_HANDOVER_DEBUG=y +CONFIG_LIVEUPDATE=y +CONFIG_LIVEUPDATE_TEST=y +CONFIG_MEMFD_CREATE=y +CONFIG_TMPFS=y +CONFIG_SHMEM=y diff --git a/tools/testing/selftests/liveupdate/liveupdate.c b/tools/testing/selftests/liveupdate/liveupdate.c new file mode 100644 index 000000000000..c2878e3d5ef9 --- /dev/null +++ b/tools/testing/selftests/liveupdate/liveupdate.c @@ -0,0 +1,348 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ + +/* + * Selftests for the Live Update Orchestrator. + * This test suite verifies the functionality and behavior of the + * /dev/liveupdate character device and its session management capabilities. + * + * Tests include: + * - Device access: basic open/close, and enforcement of exclusive access. + * - Session management: creation of unique sessions, and duplicate name detection. + * - Resource preservation: successfully preserving individual and multiple memfds, + * verifying contents remain accessible. + * - Complex multi-session scenarios involving mixed empty and populated files. + */ + +#include +#include +#include +#include +#include + +#include + +#include "../kselftest.h" +#include "../kselftest_harness.h" + +#define LIVEUPDATE_DEV "/dev/liveupdate" + +FIXTURE(liveupdate_device) { + int fd1; + int fd2; +}; + +FIXTURE_SETUP(liveupdate_device) +{ + self->fd1 = -1; + self->fd2 = -1; +} + +FIXTURE_TEARDOWN(liveupdate_device) +{ + if (self->fd1 >= 0) + close(self->fd1); + if (self->fd2 >= 0) + close(self->fd2); +} + +/* + * Test Case: Basic Open and Close + * + * Verifies that the /dev/liveupdate device can be opened and subsequently + * closed without errors. Skips if the device does not exist. + */ +TEST_F(liveupdate_device, basic_open_close) +{ + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist.", LIVEUPDATE_DEV); + + ASSERT_GE(self->fd1, 0); + ASSERT_EQ(close(self->fd1), 0); + self->fd1 = -1; +} + +/* + * Test Case: Exclusive Open Enforcement + * + * Verifies that the /dev/liveupdate device can only be opened by one process + * at a time. It checks that a second attempt to open the device fails with + * the EBUSY error code. + */ +TEST_F(liveupdate_device, exclusive_open) +{ + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist.", LIVEUPDATE_DEV); + + ASSERT_GE(self->fd1, 0); + self->fd2 = open(LIVEUPDATE_DEV, O_RDWR); + EXPECT_LT(self->fd2, 0); + EXPECT_EQ(errno, EBUSY); +} + +/* Helper function to create a LUO session via ioctl. */ +static int create_session(int lu_fd, const char *name) +{ + struct liveupdate_ioctl_create_session args = {}; + + args.size = sizeof(args); + strncpy((char *)args.name, name, sizeof(args.name) - 1); + + if (ioctl(lu_fd, LIVEUPDATE_IOCTL_CREATE_SESSION, &args)) + return -errno; + + return args.fd; +} + +/* + * Test Case: Create Duplicate Session + * + * Verifies that attempting to create two sessions with the same name fails + * on the second attempt with EEXIST. + */ +TEST_F(liveupdate_device, create_duplicate_session) +{ + int session_fd1, session_fd2; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + + ASSERT_GE(self->fd1, 0); + + session_fd1 = create_session(self->fd1, "duplicate-session-test"); + ASSERT_GE(session_fd1, 0); + + session_fd2 = create_session(self->fd1, "duplicate-session-test"); + EXPECT_LT(session_fd2, 0); + EXPECT_EQ(-session_fd2, EEXIST); + + ASSERT_EQ(close(session_fd1), 0); +} + +/* + * Test Case: Create Distinct Sessions + * + * Verifies that creating two sessions with different names succeeds. + */ +TEST_F(liveupdate_device, create_distinct_sessions) +{ + int session_fd1, session_fd2; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + + ASSERT_GE(self->fd1, 0); + + session_fd1 = create_session(self->fd1, "distinct-session-1"); + ASSERT_GE(session_fd1, 0); + + session_fd2 = create_session(self->fd1, "distinct-session-2"); + ASSERT_GE(session_fd2, 0); + + ASSERT_EQ(close(session_fd1), 0); + ASSERT_EQ(close(session_fd2), 0); +} + +static int preserve_fd(int session_fd, int fd_to_preserve, __u64 token) +{ + struct liveupdate_session_preserve_fd args = {}; + + args.size = sizeof(args); + args.fd = fd_to_preserve; + args.token = token; + + if (ioctl(session_fd, LIVEUPDATE_SESSION_PRESERVE_FD, &args)) + return -errno; + + return 0; +} + +/* + * Test Case: Preserve MemFD + * + * Verifies that a valid memfd can be successfully preserved in a session and + * that its contents remain intact after the preservation call. + */ +TEST_F(liveupdate_device, preserve_memfd) +{ + const char *test_str = "hello liveupdate"; + char read_buf[64] = {}; + int session_fd, mem_fd; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + ASSERT_GE(self->fd1, 0); + + session_fd = create_session(self->fd1, "preserve-memfd-test"); + ASSERT_GE(session_fd, 0); + + mem_fd = memfd_create("test-memfd", 0); + ASSERT_GE(mem_fd, 0); + + ASSERT_EQ(write(mem_fd, test_str, strlen(test_str)), strlen(test_str)); + ASSERT_EQ(preserve_fd(session_fd, mem_fd, 0x1234), 0); + ASSERT_EQ(close(session_fd), 0); + + ASSERT_EQ(lseek(mem_fd, 0, SEEK_SET), 0); + ASSERT_EQ(read(mem_fd, read_buf, sizeof(read_buf)), strlen(test_str)); + ASSERT_STREQ(read_buf, test_str); + ASSERT_EQ(close(mem_fd), 0); +} + +/* + * Test Case: Preserve Multiple MemFDs + * + * Verifies that multiple memfds can be preserved in a single session, + * each with a unique token, and that their contents remain distinct and + * correct after preservation. + */ +TEST_F(liveupdate_device, preserve_multiple_memfds) +{ + const char *test_str1 = "data for memfd one"; + const char *test_str2 = "data for memfd two"; + char read_buf[64] = {}; + int session_fd, mem_fd1, mem_fd2; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + ASSERT_GE(self->fd1, 0); + + session_fd = create_session(self->fd1, "preserve-multi-memfd-test"); + ASSERT_GE(session_fd, 0); + + mem_fd1 = memfd_create("test-memfd-1", 0); + ASSERT_GE(mem_fd1, 0); + mem_fd2 = memfd_create("test-memfd-2", 0); + ASSERT_GE(mem_fd2, 0); + + ASSERT_EQ(write(mem_fd1, test_str1, strlen(test_str1)), strlen(test_str1)); + ASSERT_EQ(write(mem_fd2, test_str2, strlen(test_str2)), strlen(test_str2)); + + ASSERT_EQ(preserve_fd(session_fd, mem_fd1, 0xAAAA), 0); + ASSERT_EQ(preserve_fd(session_fd, mem_fd2, 0xBBBB), 0); + + memset(read_buf, 0, sizeof(read_buf)); + ASSERT_EQ(lseek(mem_fd1, 0, SEEK_SET), 0); + ASSERT_EQ(read(mem_fd1, read_buf, sizeof(read_buf)), strlen(test_str1)); + ASSERT_STREQ(read_buf, test_str1); + + memset(read_buf, 0, sizeof(read_buf)); + ASSERT_EQ(lseek(mem_fd2, 0, SEEK_SET), 0); + ASSERT_EQ(read(mem_fd2, read_buf, sizeof(read_buf)), strlen(test_str2)); + ASSERT_STREQ(read_buf, test_str2); + + ASSERT_EQ(close(mem_fd1), 0); + ASSERT_EQ(close(mem_fd2), 0); + ASSERT_EQ(close(session_fd), 0); +} + +/* + * Test Case: Preserve Complex Scenario + * + * Verifies a more complex scenario with multiple sessions and a mix of empty + * and non-empty memfds distributed across them. + */ +TEST_F(liveupdate_device, preserve_complex_scenario) +{ + const char *data1 = "data for session 1"; + const char *data2 = "data for session 2"; + char read_buf[64] = {}; + int session_fd1, session_fd2; + int mem_fd_data1, mem_fd_empty1, mem_fd_data2, mem_fd_empty2; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + ASSERT_GE(self->fd1, 0); + + session_fd1 = create_session(self->fd1, "complex-session-1"); + ASSERT_GE(session_fd1, 0); + session_fd2 = create_session(self->fd1, "complex-session-2"); + ASSERT_GE(session_fd2, 0); + + mem_fd_data1 = memfd_create("data1", 0); + ASSERT_GE(mem_fd_data1, 0); + ASSERT_EQ(write(mem_fd_data1, data1, strlen(data1)), strlen(data1)); + + mem_fd_empty1 = memfd_create("empty1", 0); + ASSERT_GE(mem_fd_empty1, 0); + + mem_fd_data2 = memfd_create("data2", 0); + ASSERT_GE(mem_fd_data2, 0); + ASSERT_EQ(write(mem_fd_data2, data2, strlen(data2)), strlen(data2)); + + mem_fd_empty2 = memfd_create("empty2", 0); + ASSERT_GE(mem_fd_empty2, 0); + + ASSERT_EQ(preserve_fd(session_fd1, mem_fd_data1, 0x1111), 0); + ASSERT_EQ(preserve_fd(session_fd1, mem_fd_empty1, 0x2222), 0); + ASSERT_EQ(preserve_fd(session_fd2, mem_fd_data2, 0x3333), 0); + ASSERT_EQ(preserve_fd(session_fd2, mem_fd_empty2, 0x4444), 0); + + ASSERT_EQ(lseek(mem_fd_data1, 0, SEEK_SET), 0); + ASSERT_EQ(read(mem_fd_data1, read_buf, sizeof(read_buf)), strlen(data1)); + ASSERT_STREQ(read_buf, data1); + + memset(read_buf, 0, sizeof(read_buf)); + ASSERT_EQ(lseek(mem_fd_data2, 0, SEEK_SET), 0); + ASSERT_EQ(read(mem_fd_data2, read_buf, sizeof(read_buf)), strlen(data2)); + ASSERT_STREQ(read_buf, data2); + + ASSERT_EQ(lseek(mem_fd_empty1, 0, SEEK_SET), 0); + ASSERT_EQ(read(mem_fd_empty1, read_buf, sizeof(read_buf)), 0); + + ASSERT_EQ(lseek(mem_fd_empty2, 0, SEEK_SET), 0); + ASSERT_EQ(read(mem_fd_empty2, read_buf, sizeof(read_buf)), 0); + + ASSERT_EQ(close(mem_fd_data1), 0); + ASSERT_EQ(close(mem_fd_empty1), 0); + ASSERT_EQ(close(mem_fd_data2), 0); + ASSERT_EQ(close(mem_fd_empty2), 0); + ASSERT_EQ(close(session_fd1), 0); + ASSERT_EQ(close(session_fd2), 0); +} + +/* + * Test Case: Preserve Unsupported File Descriptor + * + * Verifies that attempting to preserve a file descriptor that does not have + * a registered Live Update handler fails gracefully. + * Uses /dev/null as a representative of a file type (character device) + * that is not supported by the orchestrator. + */ +TEST_F(liveupdate_device, preserve_unsupported_fd) +{ + int session_fd, unsupported_fd; + int ret; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + ASSERT_GE(self->fd1, 0); + + session_fd = create_session(self->fd1, "unsupported-fd-test"); + ASSERT_GE(session_fd, 0); + + unsupported_fd = open("/dev/null", O_RDWR); + ASSERT_GE(unsupported_fd, 0); + + ret = preserve_fd(session_fd, unsupported_fd, 0xDEAD); + EXPECT_EQ(ret, -ENOENT); + + ASSERT_EQ(close(unsupported_fd), 0); + ASSERT_EQ(close(session_fd), 0); +} + +TEST_HARNESS_MAIN From a003bdb9ec4e12c869b9c8641dd9bebf90ac0389 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:47 -0500 Subject: [PATCH 125/139] selftests/liveupdate: add simple kexec-based selftest for LUO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a kexec-based selftest, luo_kexec_simple, to validate the end-to-end lifecycle of a Live Update Orchestrator session across a reboot. While existing tests verify the uAPI in a pre-reboot context, this test ensures that the core functionality—preserving state via Kexec Handover and restoring it in a new kernel—works as expected. The test operates in two stages, managing its state across the reboot by preserving a dedicated "state session" containing a memfd. This mechanism dogfoods the LUO feature itself for state tracking, making the test self-contained. The test validates the following sequence: Stage 1 (Pre-kexec): - Creates a test session (test-session). - Creates and preserves a memfd with a known data pattern into the test session. - Creates the state-tracking session to signal progression to Stage 2. - Executes a kexec reboot via a helper script. Stage 2 (Post-kexec): - Retrieves the state-tracking session to confirm it is in the post-reboot stage. - Retrieves the preserved test session. - Restores the memfd from the test session and verifies its contents match the original data pattern written in Stage 1. - Finalizes both the test and state sessions to ensure a clean teardown. The test relies on a helper script (do_kexec.sh) to perform the reboot and a shared utility library (luo_test_utils.c) for common LUO operations, keeping the main test logic clean and focused. Link: https://lkml.kernel.org/r/20251125165850.3389713-18-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Zhu Yanjun Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zijun Hu Signed-off-by: Andrew Morton --- tools/testing/selftests/liveupdate/Makefile | 6 + .../testing/selftests/liveupdate/do_kexec.sh | 16 ++ .../selftests/liveupdate/luo_kexec_simple.c | 89 ++++++ .../selftests/liveupdate/luo_test_utils.c | 266 ++++++++++++++++++ .../selftests/liveupdate/luo_test_utils.h | 44 +++ 5 files changed, 421 insertions(+) create mode 100755 tools/testing/selftests/liveupdate/do_kexec.sh create mode 100644 tools/testing/selftests/liveupdate/luo_kexec_simple.c create mode 100644 tools/testing/selftests/liveupdate/luo_test_utils.c create mode 100644 tools/testing/selftests/liveupdate/luo_test_utils.h diff --git a/tools/testing/selftests/liveupdate/Makefile b/tools/testing/selftests/liveupdate/Makefile index 620cb4ce85af..bbbec633970c 100644 --- a/tools/testing/selftests/liveupdate/Makefile +++ b/tools/testing/selftests/liveupdate/Makefile @@ -1,7 +1,13 @@ # SPDX-License-Identifier: GPL-2.0-only +LIB_C += luo_test_utils.c + TEST_GEN_PROGS += liveupdate +TEST_GEN_PROGS_EXTENDED += luo_kexec_simple + +TEST_FILES += do_kexec.sh + include ../lib.mk CFLAGS += $(KHDR_INCLUDES) diff --git a/tools/testing/selftests/liveupdate/do_kexec.sh b/tools/testing/selftests/liveupdate/do_kexec.sh new file mode 100755 index 000000000000..3c7c6cafbef8 --- /dev/null +++ b/tools/testing/selftests/liveupdate/do_kexec.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +set -e + +# Use $KERNEL and $INITRAMFS to pass custom Kernel and optional initramfs + +KERNEL="${KERNEL:-/boot/bzImage}" +set -- -l -s --reuse-cmdline "$KERNEL" + +INITRAMFS="${INITRAMFS:-/boot/initramfs}" +if [ -f "$INITRAMFS" ]; then + set -- "$@" --initrd="$INITRAMFS" +fi + +kexec "$@" +kexec -e diff --git a/tools/testing/selftests/liveupdate/luo_kexec_simple.c b/tools/testing/selftests/liveupdate/luo_kexec_simple.c new file mode 100644 index 000000000000..d7ac1f3dc4cb --- /dev/null +++ b/tools/testing/selftests/liveupdate/luo_kexec_simple.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + * + * A simple selftest to validate the end-to-end lifecycle of a LUO session + * across a single kexec reboot. + */ + +#include "luo_test_utils.h" + +#define TEST_SESSION_NAME "test-session" +#define TEST_MEMFD_TOKEN 0x1A +#define TEST_MEMFD_DATA "hello kexec world" + +/* Constants for the state-tracking mechanism, specific to this test file. */ +#define STATE_SESSION_NAME "kexec_simple_state" +#define STATE_MEMFD_TOKEN 999 + +/* Stage 1: Executed before the kexec reboot. */ +static void run_stage_1(int luo_fd) +{ + int session_fd; + + ksft_print_msg("[STAGE 1] Starting pre-kexec setup...\n"); + + ksft_print_msg("[STAGE 1] Creating state file for next stage (2)...\n"); + create_state_file(luo_fd, STATE_SESSION_NAME, STATE_MEMFD_TOKEN, 2); + + ksft_print_msg("[STAGE 1] Creating session '%s' and preserving memfd...\n", + TEST_SESSION_NAME); + session_fd = luo_create_session(luo_fd, TEST_SESSION_NAME); + if (session_fd < 0) + fail_exit("luo_create_session for '%s'", TEST_SESSION_NAME); + + if (create_and_preserve_memfd(session_fd, TEST_MEMFD_TOKEN, + TEST_MEMFD_DATA) < 0) { + fail_exit("create_and_preserve_memfd for token %#x", + TEST_MEMFD_TOKEN); + } + + close(luo_fd); + daemonize_and_wait(); +} + +/* Stage 2: Executed after the kexec reboot. */ +static void run_stage_2(int luo_fd, int state_session_fd) +{ + int session_fd, mfd, stage; + + ksft_print_msg("[STAGE 2] Starting post-kexec verification...\n"); + + restore_and_read_stage(state_session_fd, STATE_MEMFD_TOKEN, &stage); + if (stage != 2) + fail_exit("Expected stage 2, but state file contains %d", stage); + + ksft_print_msg("[STAGE 2] Retrieving session '%s'...\n", TEST_SESSION_NAME); + session_fd = luo_retrieve_session(luo_fd, TEST_SESSION_NAME); + if (session_fd < 0) + fail_exit("luo_retrieve_session for '%s'", TEST_SESSION_NAME); + + ksft_print_msg("[STAGE 2] Restoring and verifying memfd (token %#x)...\n", + TEST_MEMFD_TOKEN); + mfd = restore_and_verify_memfd(session_fd, TEST_MEMFD_TOKEN, + TEST_MEMFD_DATA); + if (mfd < 0) + fail_exit("restore_and_verify_memfd for token %#x", TEST_MEMFD_TOKEN); + close(mfd); + + ksft_print_msg("[STAGE 2] Test data verified successfully.\n"); + ksft_print_msg("[STAGE 2] Finalizing test session...\n"); + if (luo_session_finish(session_fd) < 0) + fail_exit("luo_session_finish for test session"); + close(session_fd); + + ksft_print_msg("[STAGE 2] Finalizing state session...\n"); + if (luo_session_finish(state_session_fd) < 0) + fail_exit("luo_session_finish for state session"); + close(state_session_fd); + + ksft_print_msg("\n--- SIMPLE KEXEC TEST PASSED ---\n"); +} + +int main(int argc, char *argv[]) +{ + return luo_test(argc, argv, STATE_SESSION_NAME, + run_stage_1, run_stage_2); +} diff --git a/tools/testing/selftests/liveupdate/luo_test_utils.c b/tools/testing/selftests/liveupdate/luo_test_utils.c new file mode 100644 index 000000000000..3c8721c505df --- /dev/null +++ b/tools/testing/selftests/liveupdate/luo_test_utils.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "luo_test_utils.h" + +int luo_open_device(void) +{ + return open(LUO_DEVICE, O_RDWR); +} + +int luo_create_session(int luo_fd, const char *name) +{ + struct liveupdate_ioctl_create_session arg = { .size = sizeof(arg) }; + + snprintf((char *)arg.name, LIVEUPDATE_SESSION_NAME_LENGTH, "%.*s", + LIVEUPDATE_SESSION_NAME_LENGTH - 1, name); + + if (ioctl(luo_fd, LIVEUPDATE_IOCTL_CREATE_SESSION, &arg) < 0) + return -errno; + + return arg.fd; +} + +int luo_retrieve_session(int luo_fd, const char *name) +{ + struct liveupdate_ioctl_retrieve_session arg = { .size = sizeof(arg) }; + + snprintf((char *)arg.name, LIVEUPDATE_SESSION_NAME_LENGTH, "%.*s", + LIVEUPDATE_SESSION_NAME_LENGTH - 1, name); + + if (ioctl(luo_fd, LIVEUPDATE_IOCTL_RETRIEVE_SESSION, &arg) < 0) + return -errno; + + return arg.fd; +} + +int create_and_preserve_memfd(int session_fd, int token, const char *data) +{ + struct liveupdate_session_preserve_fd arg = { .size = sizeof(arg) }; + long page_size = sysconf(_SC_PAGE_SIZE); + void *map = MAP_FAILED; + int mfd = -1, ret = -1; + + mfd = memfd_create("test_mfd", 0); + if (mfd < 0) + return -errno; + + if (ftruncate(mfd, page_size) != 0) + goto out; + + map = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, mfd, 0); + if (map == MAP_FAILED) + goto out; + + snprintf(map, page_size, "%s", data); + munmap(map, page_size); + + arg.fd = mfd; + arg.token = token; + if (ioctl(session_fd, LIVEUPDATE_SESSION_PRESERVE_FD, &arg) < 0) + goto out; + + ret = 0; +out: + if (ret != 0 && errno != 0) + ret = -errno; + if (mfd >= 0) + close(mfd); + return ret; +} + +int restore_and_verify_memfd(int session_fd, int token, + const char *expected_data) +{ + struct liveupdate_session_retrieve_fd arg = { .size = sizeof(arg) }; + long page_size = sysconf(_SC_PAGE_SIZE); + void *map = MAP_FAILED; + int mfd = -1, ret = -1; + + arg.token = token; + if (ioctl(session_fd, LIVEUPDATE_SESSION_RETRIEVE_FD, &arg) < 0) + return -errno; + mfd = arg.fd; + + map = mmap(NULL, page_size, PROT_READ, MAP_SHARED, mfd, 0); + if (map == MAP_FAILED) + goto out; + + if (expected_data && strcmp(expected_data, map) != 0) { + ksft_print_msg("Data mismatch! Expected '%s', Got '%s'\n", + expected_data, (char *)map); + ret = -EINVAL; + goto out_munmap; + } + + ret = mfd; +out_munmap: + munmap(map, page_size); +out: + if (ret < 0 && errno != 0) + ret = -errno; + if (ret < 0 && mfd >= 0) + close(mfd); + return ret; +} + +int luo_session_finish(int session_fd) +{ + struct liveupdate_session_finish arg = { .size = sizeof(arg) }; + + if (ioctl(session_fd, LIVEUPDATE_SESSION_FINISH, &arg) < 0) + return -errno; + + return 0; +} + +void create_state_file(int luo_fd, const char *session_name, int token, + int next_stage) +{ + char buf[32]; + int state_session_fd; + + state_session_fd = luo_create_session(luo_fd, session_name); + if (state_session_fd < 0) + fail_exit("luo_create_session for state tracking"); + + snprintf(buf, sizeof(buf), "%d", next_stage); + if (create_and_preserve_memfd(state_session_fd, token, buf) < 0) + fail_exit("create_and_preserve_memfd for state tracking"); + + /* + * DO NOT close session FD, otherwise it is going to be unpreserved + */ +} + +void restore_and_read_stage(int state_session_fd, int token, int *stage) +{ + char buf[32] = {0}; + int mfd; + + mfd = restore_and_verify_memfd(state_session_fd, token, NULL); + if (mfd < 0) + fail_exit("failed to restore state memfd"); + + if (read(mfd, buf, sizeof(buf) - 1) < 0) + fail_exit("failed to read state mfd"); + + *stage = atoi(buf); + + close(mfd); +} + +void daemonize_and_wait(void) +{ + pid_t pid; + + ksft_print_msg("[STAGE 1] Forking persistent child to hold sessions...\n"); + + pid = fork(); + if (pid < 0) + fail_exit("fork failed"); + + if (pid > 0) { + ksft_print_msg("[STAGE 1] Child PID: %d. Resources are pinned.\n", pid); + ksft_print_msg("[STAGE 1] You may now perform kexec reboot.\n"); + exit(EXIT_SUCCESS); + } + + /* Detach from terminal so closing the window doesn't kill us */ + if (setsid() < 0) + fail_exit("setsid failed"); + + close(STDIN_FILENO); + close(STDOUT_FILENO); + close(STDERR_FILENO); + + /* Change dir to root to avoid locking filesystems */ + if (chdir("/") < 0) + exit(EXIT_FAILURE); + + while (1) + sleep(60); +} + +static int parse_stage_args(int argc, char *argv[]) +{ + static struct option long_options[] = { + {"stage", required_argument, 0, 's'}, + {0, 0, 0, 0} + }; + int option_index = 0; + int stage = 1; + int opt; + + optind = 1; + while ((opt = getopt_long(argc, argv, "s:", long_options, &option_index)) != -1) { + switch (opt) { + case 's': + stage = atoi(optarg); + if (stage != 1 && stage != 2) + fail_exit("Invalid stage argument"); + break; + default: + fail_exit("Unknown argument"); + } + } + return stage; +} + +int luo_test(int argc, char *argv[], + const char *state_session_name, + luo_test_stage1_fn stage1, + luo_test_stage2_fn stage2) +{ + int target_stage = parse_stage_args(argc, argv); + int luo_fd = luo_open_device(); + int state_session_fd; + int detected_stage; + + if (luo_fd < 0) { + ksft_exit_skip("Failed to open %s. Is the luo module loaded?\n", + LUO_DEVICE); + } + + state_session_fd = luo_retrieve_session(luo_fd, state_session_name); + if (state_session_fd == -ENOENT) + detected_stage = 1; + else if (state_session_fd >= 0) + detected_stage = 2; + else + fail_exit("Failed to check for state session"); + + if (target_stage != detected_stage) { + ksft_exit_fail_msg("Stage mismatch Requested --stage %d, but system is in stage %d.\n" + "(State session %s: %s)\n", + target_stage, detected_stage, state_session_name, + (detected_stage == 2) ? "EXISTS" : "MISSING"); + } + + if (target_stage == 1) + stage1(luo_fd); + else + stage2(luo_fd, state_session_fd); + + return 0; +} diff --git a/tools/testing/selftests/liveupdate/luo_test_utils.h b/tools/testing/selftests/liveupdate/luo_test_utils.h new file mode 100644 index 000000000000..90099bf49577 --- /dev/null +++ b/tools/testing/selftests/liveupdate/luo_test_utils.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + * + * Utility functions for LUO kselftests. + */ + +#ifndef LUO_TEST_UTILS_H +#define LUO_TEST_UTILS_H + +#include +#include +#include +#include "../kselftest.h" + +#define LUO_DEVICE "/dev/liveupdate" + +#define fail_exit(fmt, ...) \ + ksft_exit_fail_msg("[%s:%d] " fmt " (errno: %s)\n", \ + __func__, __LINE__, ##__VA_ARGS__, strerror(errno)) + +int luo_open_device(void); +int luo_create_session(int luo_fd, const char *name); +int luo_retrieve_session(int luo_fd, const char *name); +int luo_session_finish(int session_fd); + +int create_and_preserve_memfd(int session_fd, int token, const char *data); +int restore_and_verify_memfd(int session_fd, int token, const char *expected_data); + +void create_state_file(int luo_fd, const char *session_name, int token, + int next_stage); +void restore_and_read_stage(int state_session_fd, int token, int *stage); + +void daemonize_and_wait(void); + +typedef void (*luo_test_stage1_fn)(int luo_fd); +typedef void (*luo_test_stage2_fn)(int luo_fd, int state_session_fd); + +int luo_test(int argc, char *argv[], const char *state_session_name, + luo_test_stage1_fn stage1, luo_test_stage2_fn stage2); + +#endif /* LUO_TEST_UTILS_H */ From 724bf8c5595a1219427f6e779563859d63948b5a Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:48 -0500 Subject: [PATCH 126/139] selftests/liveupdate: add kexec test for multiple and empty sessions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new kexec-based selftest, luo_kexec_multi_session, to validate the end-to-end lifecycle of a more complex LUO scenario. While the existing luo_kexec_simple test covers the basic end-to-end lifecycle, it is limited to a single session with one preserved file. This new test significantly expands coverage by verifying LUO's ability to handle a mixed workload involving multiple sessions, some of which are intentionally empty. This ensures that the LUO core correctly preserves and restores the state of all session types across a reboot. The test validates the following sequence: Stage 1 (Pre-kexec): - Creates two empty test sessions (multi-test-empty-1, multi-test-empty-2). - Creates a session with one preserved memfd (multi-test-files-1). - Creates another session with two preserved memfds (multi-test-files-2), each containing unique data. - Creates a state-tracking session to manage the transition to Stage 2. - Executes a kexec reboot via the helper script. Stage 2 (Post-kexec): - Retrieves the state-tracking session to confirm it is in the post-reboot stage. - Retrieves all four test sessions (both the empty and non-empty ones). - For the non-empty sessions, restores the preserved memfds and verifies their contents match the original data patterns. - Finalizes all test sessions and the state session to ensure a clean teardown and that all associated kernel resources are correctly released. This test provides greater confidence in the robustness of the LUO framework by validating its behavior in a more realistic, multi-faceted scenario. Link: https://lkml.kernel.org/r/20251125165850.3389713-19-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- tools/testing/selftests/liveupdate/Makefile | 1 + .../selftests/liveupdate/luo_multi_session.c | 162 ++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 tools/testing/selftests/liveupdate/luo_multi_session.c diff --git a/tools/testing/selftests/liveupdate/Makefile b/tools/testing/selftests/liveupdate/Makefile index bbbec633970c..080754787ede 100644 --- a/tools/testing/selftests/liveupdate/Makefile +++ b/tools/testing/selftests/liveupdate/Makefile @@ -5,6 +5,7 @@ LIB_C += luo_test_utils.c TEST_GEN_PROGS += liveupdate TEST_GEN_PROGS_EXTENDED += luo_kexec_simple +TEST_GEN_PROGS_EXTENDED += luo_multi_session TEST_FILES += do_kexec.sh diff --git a/tools/testing/selftests/liveupdate/luo_multi_session.c b/tools/testing/selftests/liveupdate/luo_multi_session.c new file mode 100644 index 000000000000..0ee2d795beef --- /dev/null +++ b/tools/testing/selftests/liveupdate/luo_multi_session.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + * + * A selftest to validate the end-to-end lifecycle of multiple LUO sessions + * across a kexec reboot, including empty sessions and sessions with multiple + * files. + */ + +#include "luo_test_utils.h" + +#define SESSION_EMPTY_1 "multi-test-empty-1" +#define SESSION_EMPTY_2 "multi-test-empty-2" +#define SESSION_FILES_1 "multi-test-files-1" +#define SESSION_FILES_2 "multi-test-files-2" + +#define MFD1_TOKEN 0x1001 +#define MFD2_TOKEN 0x2002 +#define MFD3_TOKEN 0x3003 + +#define MFD1_DATA "Data for session files 1" +#define MFD2_DATA "First file for session files 2" +#define MFD3_DATA "Second file for session files 2" + +#define STATE_SESSION_NAME "kexec_multi_state" +#define STATE_MEMFD_TOKEN 998 + +/* Stage 1: Executed before the kexec reboot. */ +static void run_stage_1(int luo_fd) +{ + int s_empty1_fd, s_empty2_fd, s_files1_fd, s_files2_fd; + + ksft_print_msg("[STAGE 1] Starting pre-kexec setup for multi-session test...\n"); + + ksft_print_msg("[STAGE 1] Creating state file for next stage (2)...\n"); + create_state_file(luo_fd, STATE_SESSION_NAME, STATE_MEMFD_TOKEN, 2); + + ksft_print_msg("[STAGE 1] Creating empty sessions '%s' and '%s'...\n", + SESSION_EMPTY_1, SESSION_EMPTY_2); + s_empty1_fd = luo_create_session(luo_fd, SESSION_EMPTY_1); + if (s_empty1_fd < 0) + fail_exit("luo_create_session for '%s'", SESSION_EMPTY_1); + + s_empty2_fd = luo_create_session(luo_fd, SESSION_EMPTY_2); + if (s_empty2_fd < 0) + fail_exit("luo_create_session for '%s'", SESSION_EMPTY_2); + + ksft_print_msg("[STAGE 1] Creating session '%s' with one memfd...\n", + SESSION_FILES_1); + + s_files1_fd = luo_create_session(luo_fd, SESSION_FILES_1); + if (s_files1_fd < 0) + fail_exit("luo_create_session for '%s'", SESSION_FILES_1); + if (create_and_preserve_memfd(s_files1_fd, MFD1_TOKEN, MFD1_DATA) < 0) { + fail_exit("create_and_preserve_memfd for token %#x", + MFD1_TOKEN); + } + + ksft_print_msg("[STAGE 1] Creating session '%s' with two memfds...\n", + SESSION_FILES_2); + + s_files2_fd = luo_create_session(luo_fd, SESSION_FILES_2); + if (s_files2_fd < 0) + fail_exit("luo_create_session for '%s'", SESSION_FILES_2); + if (create_and_preserve_memfd(s_files2_fd, MFD2_TOKEN, MFD2_DATA) < 0) { + fail_exit("create_and_preserve_memfd for token %#x", + MFD2_TOKEN); + } + if (create_and_preserve_memfd(s_files2_fd, MFD3_TOKEN, MFD3_DATA) < 0) { + fail_exit("create_and_preserve_memfd for token %#x", + MFD3_TOKEN); + } + + close(luo_fd); + daemonize_and_wait(); +} + +/* Stage 2: Executed after the kexec reboot. */ +static void run_stage_2(int luo_fd, int state_session_fd) +{ + int s_empty1_fd, s_empty2_fd, s_files1_fd, s_files2_fd; + int mfd1, mfd2, mfd3, stage; + + ksft_print_msg("[STAGE 2] Starting post-kexec verification...\n"); + + restore_and_read_stage(state_session_fd, STATE_MEMFD_TOKEN, &stage); + if (stage != 2) { + fail_exit("Expected stage 2, but state file contains %d", + stage); + } + + ksft_print_msg("[STAGE 2] Retrieving all sessions...\n"); + s_empty1_fd = luo_retrieve_session(luo_fd, SESSION_EMPTY_1); + if (s_empty1_fd < 0) + fail_exit("luo_retrieve_session for '%s'", SESSION_EMPTY_1); + + s_empty2_fd = luo_retrieve_session(luo_fd, SESSION_EMPTY_2); + if (s_empty2_fd < 0) + fail_exit("luo_retrieve_session for '%s'", SESSION_EMPTY_2); + + s_files1_fd = luo_retrieve_session(luo_fd, SESSION_FILES_1); + if (s_files1_fd < 0) + fail_exit("luo_retrieve_session for '%s'", SESSION_FILES_1); + + s_files2_fd = luo_retrieve_session(luo_fd, SESSION_FILES_2); + if (s_files2_fd < 0) + fail_exit("luo_retrieve_session for '%s'", SESSION_FILES_2); + + ksft_print_msg("[STAGE 2] Verifying contents of session '%s'...\n", + SESSION_FILES_1); + mfd1 = restore_and_verify_memfd(s_files1_fd, MFD1_TOKEN, MFD1_DATA); + if (mfd1 < 0) + fail_exit("restore_and_verify_memfd for token %#x", MFD1_TOKEN); + close(mfd1); + + ksft_print_msg("[STAGE 2] Verifying contents of session '%s'...\n", + SESSION_FILES_2); + + mfd2 = restore_and_verify_memfd(s_files2_fd, MFD2_TOKEN, MFD2_DATA); + if (mfd2 < 0) + fail_exit("restore_and_verify_memfd for token %#x", MFD2_TOKEN); + close(mfd2); + + mfd3 = restore_and_verify_memfd(s_files2_fd, MFD3_TOKEN, MFD3_DATA); + if (mfd3 < 0) + fail_exit("restore_and_verify_memfd for token %#x", MFD3_TOKEN); + close(mfd3); + + ksft_print_msg("[STAGE 2] Test data verified successfully.\n"); + + ksft_print_msg("[STAGE 2] Finalizing all test sessions...\n"); + if (luo_session_finish(s_empty1_fd) < 0) + fail_exit("luo_session_finish for '%s'", SESSION_EMPTY_1); + close(s_empty1_fd); + + if (luo_session_finish(s_empty2_fd) < 0) + fail_exit("luo_session_finish for '%s'", SESSION_EMPTY_2); + close(s_empty2_fd); + + if (luo_session_finish(s_files1_fd) < 0) + fail_exit("luo_session_finish for '%s'", SESSION_FILES_1); + close(s_files1_fd); + + if (luo_session_finish(s_files2_fd) < 0) + fail_exit("luo_session_finish for '%s'", SESSION_FILES_2); + close(s_files2_fd); + + ksft_print_msg("[STAGE 2] Finalizing state session...\n"); + if (luo_session_finish(state_session_fd) < 0) + fail_exit("luo_session_finish for state session"); + close(state_session_fd); + + ksft_print_msg("\n--- MULTI-SESSION KEXEC TEST PASSED ---\n"); +} + +int main(int argc, char *argv[]) +{ + return luo_test(argc, argv, STATE_SESSION_NAME, + run_stage_1, run_stage_2); +} From b15515155af735a86e5bcd67ed739162f0de27f5 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 18 Nov 2025 19:22:16 +0100 Subject: [PATCH 127/139] kho: free chunks using free_page() instead of kfree() Before commit fa759cd75bce5 ("kho: allocate metadata directly from the buddy allocator"), the chunks were allocated from the slab allocator using kzalloc(). Those were rightly freed using kfree(). When the commit switched to using the buddy allocator directly, it missed updating kho_mem_ser_free() to use free_page() instead of kfree(). Link: https://lkml.kernel.org/r/20251118182218.63044-1-pratyush@kernel.org Fixes: fa759cd75bce5 ("kho: allocate metadata directly from the buddy allocator") Signed-off-by: Pratyush Yadav Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: David Matlack Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 224bdf5becb6..ecc2058df1b6 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -360,7 +360,7 @@ static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) struct khoser_mem_chunk *tmp = chunk; chunk = KHOSER_LOAD_PTR(chunk->hdr.next); - kfree(tmp); + free_page((unsigned long)tmp); } } From 11047466eff28cb3d3422622166931204ed7d502 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 18 Nov 2025 19:10:45 +0100 Subject: [PATCH 128/139] test_kho: always print restore status Currently the KHO test only prints a message on success, and remains silent on failure. This makes it difficult to notice a failing test. A failing test is usually more interesting than a successful one. Always print the test status after attempting restore. Link: https://lkml.kernel.org/r/20251118181046.23321-1-pratyush@kernel.org Signed-off-by: Pratyush Yadav Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Cc: Alexander Graf Cc: Pratyush Yadav Signed-off-by: Andrew Morton --- lib/test_kho.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/test_kho.c b/lib/test_kho.c index 85b60d87a50a..47de56280795 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -306,7 +306,6 @@ static int kho_test_restore(phys_addr_t fdt_phys) if (err) return err; - pr_info("KHO restore succeeded\n"); return 0; } @@ -319,8 +318,15 @@ static int __init kho_test_init(void) return 0; err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys); - if (!err) - return kho_test_restore(fdt_phys); + if (!err) { + err = kho_test_restore(fdt_phys); + if (err) + pr_err("KHO restore failed\n"); + else + pr_info("KHO restore succeeded\n"); + + return err; + } if (err != -ENOENT) { pr_warn("failed to retrieve %s FDT: %d\n", KHO_TEST_FDT, err); From cf4340bdd967fe7f37b4361c85370515f11f4a37 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 18 Nov 2025 17:15:05 +0530 Subject: [PATCH 129/139] kexec: move sysfs entries to /sys/kernel/kexec Patch series "kexec: reorganize kexec and kdump sysfs", v6. All existing kexec and kdump sysfs entries are moved to a new location, /sys/kernel/kexec, to keep /sys/kernel/ clean and better organized. Symlinks are created at the old locations for backward compatibility and can be removed in the future [01/03]. While doing this cleanup, the old kexec and kdump sysfs entries are marked as deprecated in the existing ABI documentation [02/03]. This makes it clear that these older interfaces should no longer be used. New ABI documentation is added to describe the reorganized interfaces [03/03], so users and tools can rely on the updated sysfs interfaces going forward. This patch (of 3): Several kexec and kdump sysfs entries are currently placed directly under /sys/kernel/, which clutters the directory and makes it harder to identify unrelated entries. To improve organization and readability, these entries are now moved under a dedicated directory, /sys/kernel/kexec. The following sysfs moved under new kexec sysfs node +------------------------------------+------------------+ | Old sysfs name | New sysfs name | | (under /sys/kernel) | (under /sys/kernel/kexec) | +---------------------------+---------------------------+ | kexec_loaded | loaded | +---------------------------+---------------------------+ | kexec_crash_loaded | crash_loaded | +---------------------------+---------------------------+ | kexec_crash_size | crash_size | +---------------------------+---------------------------+ | crash_elfcorehdr_size | crash_elfcorehdr_size | +---------------------------+---------------------------+ | kexec_crash_cma_ranges | crash_cma_ranges | +---------------------------+---------------------------+ For backward compatibility, symlinks are created at the old locations so that existing tools and scripts continue to work. These symlinks can be removed in the future once users have switched to the new path. While creating symlinks, entries are added in /sys/kernel/ that point to their new locations under /sys/kernel/kexec/. If an error occurs while adding a symlink, it is logged but does not stop initialization of the remaining kexec sysfs symlinks. The /sys/kernel/ entry is now controlled by CONFIG_CRASH_DUMP instead of CONFIG_VMCORE_INFO, as CONFIG_CRASH_DUMP also enables CONFIG_VMCORE_INFO. Link: https://lkml.kernel.org/r/20251118114507.1769455-1-sourabhjain@linux.ibm.com Link: https://lkml.kernel.org/r/20251118114507.1769455-2-sourabhjain@linux.ibm.com Signed-off-by: Sourabh Jain Acked-by: Baoquan He Cc: Aditya Gupta Cc: Dave Young Cc: Hari Bathini Cc: Jiri Bohac Cc: Madhavan Srinivasan Cc: Mahesh J Salgaonkar Cc: Pingfan Liu Cc: Ritesh Harjani (IBM) Cc: Shivang Upadhyay Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 141 ++++++++++++++++++++++++++++++++++++++++++++ kernel/ksysfs.c | 89 +--------------------------- 2 files changed, 142 insertions(+), 88 deletions(-) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c4b4996ff1d1..0f92acdd354d 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1225,3 +1226,143 @@ int kernel_kexec(void) kexec_unlock(); return error; } + +static ssize_t loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!kexec_image); +} +static struct kobj_attribute loaded_attr = __ATTR_RO(loaded); + +#ifdef CONFIG_CRASH_DUMP +static ssize_t crash_loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); +} +static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded); + +#ifdef CONFIG_CRASH_RESERVE +static ssize_t crash_cma_ranges_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + + ssize_t len = 0; + int i; + + for (i = 0; i < crashk_cma_cnt; ++i) { + len += sysfs_emit_at(buf, len, "%08llx-%08llx\n", + crashk_cma_ranges[i].start, + crashk_cma_ranges[i].end); + } + return len; +} +static struct kobj_attribute crash_cma_ranges_attr = __ATTR_RO(crash_cma_ranges); +#endif + +static ssize_t crash_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + ssize_t size = crash_get_memory_size(); + + if (size < 0) + return size; + + return sysfs_emit(buf, "%zd\n", size); +} +static ssize_t crash_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long cnt; + int ret; + + if (kstrtoul(buf, 0, &cnt)) + return -EINVAL; + + ret = crash_shrink_memory(cnt); + return ret < 0 ? ret : count; +} +static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size); + +#ifdef CONFIG_CRASH_HOTPLUG +static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int sz = crash_get_elfcorehdr_size(); + + return sysfs_emit(buf, "%u\n", sz); +} +static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size); + +#endif /* CONFIG_CRASH_HOTPLUG */ +#endif /* CONFIG_CRASH_DUMP */ + +static struct attribute *kexec_attrs[] = { + &loaded_attr.attr, +#ifdef CONFIG_CRASH_DUMP + &crash_loaded_attr.attr, + &crash_size_attr.attr, +#ifdef CONFIG_CRASH_RESERVE + &crash_cma_ranges_attr.attr, +#endif +#ifdef CONFIG_CRASH_HOTPLUG + &crash_elfcorehdr_size_attr.attr, +#endif +#endif + NULL +}; + +struct kexec_link_entry { + const char *target; + const char *name; +}; + +static struct kexec_link_entry kexec_links[] = { + { "loaded", "kexec_loaded" }, +#ifdef CONFIG_CRASH_DUMP + { "crash_loaded", "kexec_crash_loaded" }, + { "crash_size", "kexec_crash_size" }, +#ifdef CONFIG_CRASH_RESERVE + {"crash_cma_ranges", "kexec_crash_cma_ranges"}, +#endif +#ifdef CONFIG_CRASH_HOTPLUG + { "crash_elfcorehdr_size", "crash_elfcorehdr_size" }, +#endif +#endif +}; + +static struct kobject *kexec_kobj; +ATTRIBUTE_GROUPS(kexec); + +static int __init init_kexec_sysctl(void) +{ + int error; + int i; + + kexec_kobj = kobject_create_and_add("kexec", kernel_kobj); + if (!kexec_kobj) { + pr_err("failed to create kexec kobject\n"); + return -ENOMEM; + } + + error = sysfs_create_groups(kexec_kobj, kexec_groups); + if (error) + goto kset_exit; + + for (i = 0; i < ARRAY_SIZE(kexec_links); i++) { + error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj, + kexec_links[i].target, + kexec_links[i].name); + if (error) + pr_err("Unable to create %s symlink (%d)", kexec_links[i].name, error); + } + + return 0; + +kset_exit: + kobject_put(kexec_kobj); + return error; +} + +subsys_initcall(init_kexec_sysctl); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0ff2179bc603..a9e6354d9e25 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -119,68 +119,6 @@ static ssize_t profiling_store(struct kobject *kobj, KERNEL_ATTR_RW(profiling); #endif -#ifdef CONFIG_KEXEC_CORE -static ssize_t kexec_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%d\n", !!kexec_image); -} -KERNEL_ATTR_RO(kexec_loaded); - -#ifdef CONFIG_CRASH_DUMP -static ssize_t kexec_crash_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); -} -KERNEL_ATTR_RO(kexec_crash_loaded); - -#ifdef CONFIG_CRASH_RESERVE -static ssize_t kexec_crash_cma_ranges_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - - ssize_t len = 0; - int i; - - for (i = 0; i < crashk_cma_cnt; ++i) { - len += sysfs_emit_at(buf, len, "%08llx-%08llx\n", - crashk_cma_ranges[i].start, - crashk_cma_ranges[i].end); - } - return len; -} -KERNEL_ATTR_RO(kexec_crash_cma_ranges); -#endif /* CONFIG_CRASH_RESERVE */ - -static ssize_t kexec_crash_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - ssize_t size = crash_get_memory_size(); - - if (size < 0) - return size; - - return sysfs_emit(buf, "%zd\n", size); -} -static ssize_t kexec_crash_size_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long cnt; - int ret; - - if (kstrtoul(buf, 0, &cnt)) - return -EINVAL; - - ret = crash_shrink_memory(cnt); - return ret < 0 ? ret : count; -} -KERNEL_ATTR_RW(kexec_crash_size); - -#endif /* CONFIG_CRASH_DUMP*/ -#endif /* CONFIG_KEXEC_CORE */ - #ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, @@ -192,18 +130,6 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#ifdef CONFIG_CRASH_HOTPLUG -static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - unsigned int sz = crash_get_elfcorehdr_size(); - - return sysfs_emit(buf, "%u\n", sz); -} -KERNEL_ATTR_RO(crash_elfcorehdr_size); - -#endif - #endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ @@ -273,21 +199,8 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif -#ifdef CONFIG_KEXEC_CORE - &kexec_loaded_attr.attr, -#ifdef CONFIG_CRASH_DUMP - &kexec_crash_loaded_attr.attr, - &kexec_crash_size_attr.attr, -#ifdef CONFIG_CRASH_RESERVE - &kexec_crash_cma_ranges_attr.attr, -#endif -#endif -#endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, -#ifdef CONFIG_CRASH_HOTPLUG - &crash_elfcorehdr_size_attr.attr, -#endif #endif #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, From 5c991b6d9b30895744085abb592c77338d65a40d Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 18 Nov 2025 17:15:06 +0530 Subject: [PATCH 130/139] Documentation/ABI: mark old kexec sysfs deprecated The previous commit ("kexec: move sysfs entries to /sys/kernel/kexec") moved all existing kexec sysfs entries to a new location. The ABI document is updated to include a note about the deprecation of the old kexec sysfs entries. The following kexec sysfs entries are deprecated: - /sys/kernel/kexec_loaded - /sys/kernel/kexec_crash_loaded - /sys/kernel/kexec_crash_size - /sys/kernel/crash_elfcorehdr_size - /sys/kernel/kexec_crash_cma_ranges Link: https://lkml.kernel.org/r/20251118114507.1769455-3-sourabhjain@linux.ibm.com Signed-off-by: Sourabh Jain Acked-by: Baoquan He Cc: Aditya Gupta Cc: Dave Young Cc: Hari Bathini Cc: Jiri Bohac Cc: Madhavan Srinivasan Cc: Mahesh J Salgaonkar Cc: Pingfan Liu Cc: Ritesh Harjani (IBM) Cc: Shivang Upadhyay Cc: Vivek Goyal Signed-off-by: Andrew Morton --- .../sysfs-kernel-kexec-kdump | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) rename Documentation/ABI/{testing => obsolete}/sysfs-kernel-kexec-kdump (63%) diff --git a/Documentation/ABI/testing/sysfs-kernel-kexec-kdump b/Documentation/ABI/obsolete/sysfs-kernel-kexec-kdump similarity index 63% rename from Documentation/ABI/testing/sysfs-kernel-kexec-kdump rename to Documentation/ABI/obsolete/sysfs-kernel-kexec-kdump index f6089e38de5f..ba26a6a1d2be 100644 --- a/Documentation/ABI/testing/sysfs-kernel-kexec-kdump +++ b/Documentation/ABI/obsolete/sysfs-kernel-kexec-kdump @@ -1,3 +1,21 @@ +NOTE: all the ABIs listed in this file are deprecated and will be removed after 2028. + +Here are the alternative ABIs: ++------------------------------------+-----------------------------------------+ +| Deprecated | Alternative | ++------------------------------------+-----------------------------------------+ +| /sys/kernel/kexec_loaded | /sys/kernel/kexec/loaded | ++------------------------------------+-----------------------------------------+ +| /sys/kernel/kexec_crash_loaded | /sys/kernel/kexec/crash_loaded | ++------------------------------------+-----------------------------------------+ +| /sys/kernel/kexec_crash_size | /sys/kernel/kexec/crash_size | ++------------------------------------+-----------------------------------------+ +| /sys/kernel/crash_elfcorehdr_size | /sys/kernel/kexec/crash_elfcorehdr_size | ++------------------------------------+-----------------------------------------+ +| /sys/kernel/kexec_crash_cma_ranges | /sys/kernel/kexec/crash_cma_ranges | ++------------------------------------+-----------------------------------------+ + + What: /sys/kernel/kexec_loaded Date: Jun 2006 Contact: kexec@lists.infradead.org From fb5c3644278c169c0d03b904b2bdedcaea897df7 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 18 Nov 2025 17:15:07 +0530 Subject: [PATCH 131/139] Documentation/ABI: new kexec and kdump sysfs interface Add an ABI document for following kexec and kdump sysfs interface: - /sys/kernel/kexec/loaded - /sys/kernel/kexec/crash_loaded - /sys/kernel/kexec/crash_size - /sys/kernel/kexec/crash_elfcorehdr_size - /sys/kernel/kexec/crash_cma_ranges Link: https://lkml.kernel.org/r/20251118114507.1769455-4-sourabhjain@linux.ibm.com Signed-off-by: Sourabh Jain Acked-by: Baoquan He Cc: Aditya Gupta Cc: Dave Young Cc: Hari Bathini Cc: Jiri Bohac Cc: Madhavan Srinivasan Cc: Mahesh J Salgaonkar Cc: Pingfan Liu Cc: Ritesh Harjani (IBM) Cc: Shivang Upadhyay Cc: Vivek Goyal Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-kexec-kdump | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-kexec-kdump diff --git a/Documentation/ABI/testing/sysfs-kernel-kexec-kdump b/Documentation/ABI/testing/sysfs-kernel-kexec-kdump new file mode 100644 index 000000000000..f59051b5d96d --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-kexec-kdump @@ -0,0 +1,61 @@ +What: /sys/kernel/kexec/* +Date: Nov 2025 +Contact: kexec@lists.infradead.org +Description: + The /sys/kernel/kexec/* directory contains sysfs files + that provide information about the configuration status + of kexec and kdump. + +What: /sys/kernel/kexec/loaded +Date: Nov 2025 +Contact: kexec@lists.infradead.org +Description: read only + Indicates whether a new kernel image has been loaded + into memory using the kexec system call. It shows 1 if + a kexec image is present and ready to boot, or 0 if none + is loaded. +User: kexec tools, kdump service + +What: /sys/kernel/kexec/crash_loaded +Date: Nov 2025 +Contact: kexec@lists.infradead.org +Description: read only + Indicates whether a crash (kdump) kernel is currently + loaded into memory. It shows 1 if a crash kernel has been + successfully loaded for panic handling, or 0 if no crash + kernel is present. +User: Kexec tools, Kdump service + +What: /sys/kernel/kexec/crash_size +Date: Nov 2025 +Contact: kexec@lists.infradead.org +Description: read/write + Shows the amount of memory reserved for loading the crash + (kdump) kernel. It reports the size, in bytes, of the + crash kernel area defined by the crashkernel= parameter. + This interface also allows reducing the crashkernel + reservation by writing a smaller value, and the reclaimed + space is added back to the system RAM. +User: Kdump service + +What: /sys/kernel/kexec/crash_elfcorehdr_size +Date: Nov 2025 +Contact: kexec@lists.infradead.org +Description: read only + Indicates the preferred size of the memory buffer for the + ELF core header used by the crash (kdump) kernel. It defines + how much space is needed to hold metadata about the crashed + system, including CPU and memory information. This information + is used by the user space utility kexec to support updating the + in-kernel kdump image during hotplug operations. +User: Kexec tools + +What: /sys/kernel/kexec/crash_cma_ranges +Date: Nov 2025 +Contact: kexec@lists.infradead.org +Description: read only + Provides information about the memory ranges reserved from + the Contiguous Memory Allocator (CMA) area that are allocated + to the crash (kdump) kernel. It lists the start and end physical + addresses of CMA regions assigned for crashkernel use. +User: kdump service From 40cd0e8dd283b11aff9628fe7fd810ea7cc53e32 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Sat, 22 Nov 2025 18:29:29 +0000 Subject: [PATCH 132/139] KHO: fix boot failure due to kmemleak access to non-PRESENT pages When booting with debug_pagealloc=on while having: CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT=y CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=n the system fails to boot due to page faults during kmemleak scanning. This occurs because: With debug_pagealloc is enabled, __free_pages() invokes debug_pagealloc_unmap_pages(), clearing the _PAGE_PRESENT bit for freed pages in the kernel page table. KHO scratch areas are allocated from memblock and noted by kmemleak. But these areas don't remain reserved but released later to the page allocator using init_cma_reserved_pageblock(). This causes subsequent kmemleak scans access non-PRESENT pages, leading to fatal page faults. Mark scratch areas with kmemleak_ignore_phys() after they are allocated from memblock to exclude them from kmemleak scanning before they are released to buddy allocator to fix this. [ran.xiaokai@zte.com.cn: add comment] Link: https://lkml.kernel.org/r/20251127122700.103927-1-ranxiaokai627@163.com Link: https://lkml.kernel.org/r/20251122182929.92634-1-ranxiaokai627@163.com Signed-off-by: Ran Xiaokai Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Catalin Marinas Cc: Changyuan Lyu Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index ecc2058df1b6..f9b530606693 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -1369,6 +1370,15 @@ static __init int kho_init(void) unsigned long count = kho_scratch[i].size >> PAGE_SHIFT; unsigned long pfn; + /* + * When debug_pagealloc is enabled, __free_pages() clears the + * corresponding PRESENT bit in the kernel page table. + * Subsequent kmemleak scans of these pages cause the + * non-PRESENT page faults. + * Mark scratch areas with kmemleak_ignore_phys() to exclude + * them from kmemleak scanning. + */ + kmemleak_ignore_phys(kho_scratch[i].addr); for (pfn = base_pfn; pfn < base_pfn + count; pfn += pageblock_nr_pages) init_cma_reserved_pageblock(pfn_to_page(pfn)); From af06a40474793ad9677d1771c0624ae8191f0892 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 22 Nov 2025 12:45:37 +0100 Subject: [PATCH 133/139] init: replace simple_strtoul with kstrtoul to improve lpj_setup Replace simple_strtoul() with the recommended kstrtoul() for parsing the 'lpj=' boot parameter. Check the return value of kstrtoul() and reject invalid values. This adds error handling while preserving existing behavior for valid values, and removes use of the deprecated simple_strtoul() helper. Link: https://lkml.kernel.org/r/20251122114539.446937-2-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Cc: Christian Brauner Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- init/calibrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/init/calibrate.c b/init/calibrate.c index f3831272f113..09c2e6102110 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -14,10 +14,10 @@ unsigned long lpj_fine; unsigned long preset_lpj; + static int __init lpj_setup(char *str) { - preset_lpj = simple_strtoul(str,NULL,0); - return 1; + return kstrtoul(str, 0, &preset_lpj) == 0; } __setup("lpj=", lpj_setup); From c39eab75a57dbd6b4618fae88d36446e1cdffdde Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 25 Nov 2025 18:03:53 +0200 Subject: [PATCH 134/139] MAINTAINERS: TPM DEVICE DRIVER: update the W-tag I migrated test suite to git.kernel.org so that all my kernel stuff is consolidated to one single place: https://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd-test.git/about/ Link: https://lkml.kernel.org/r/20251125160353.2300402-1-jarkko@kernel.org Signed-off-by: Jarkko Sakkinen Signed-off-by: Andrew Morton --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 83bac6c48c98..656e29624d4f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26047,7 +26047,7 @@ M: Jarkko Sakkinen R: Jason Gunthorpe L: linux-integrity@vger.kernel.org S: Maintained -W: https://codeberg.org/jarkko/linux-tpmdd-test +W: https://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd-test.git/about/ Q: https://patchwork.kernel.org/project/linux-integrity/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git F: Documentation/devicetree/bindings/tpm/ From 4bc84cd539dff1a6346ffeb5f174bb79e238fa78 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 25 Nov 2025 13:09:16 +0200 Subject: [PATCH 135/139] kho: kho_restore_vmalloc: fix initialization of pages array Patch series "kho: fixes for vmalloc restoration". Pratyush reported off-list that when kho_restore_vmalloc() is used to restore a vmalloc_huge() allocation it hits VM_BUG_ON() when we reconstruct the struct pages in kho_restore_pages(). These patches fix the issue. This patch (of 2): In case a preserved vmalloc allocation was using huge pages, all pages in the array of pages added to vm_struct during kho_restore_vmalloc() are wrongly set to the same page. Fix the indexing when assigning pages to that array. Link: https://lkml.kernel.org/r/20251125110917.843744-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20251125110917.843744-2-rppt@kernel.org Fixes: a667300bd53f ("kho: add support for preserving vmalloc allocations") Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index f9b530606693..096b7db28baf 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1096,7 +1096,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) goto err_free_pages_array; for (int j = 0; j < contig_pages; j++) - pages[idx++] = page; + pages[idx++] = page + j; phys += contig_pages * PAGE_SIZE; } From 7b71205ae1120e90c7f6d41d282e26c00e9ee6a7 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 25 Nov 2025 13:09:17 +0200 Subject: [PATCH 136/139] kho: fix restoring of contiguous ranges of order-0 pages When contiguous ranges of order-0 pages are restored, kho_restore_page() calls prep_compound_page() with the first page in the range and order as parameters and then kho_restore_pages() calls split_page() to make sure all pages in the range are order-0. However, since split_page() is not intended to split compound pages and with VM_DEBUG enabled it will trigger a VM_BUG_ON_PAGE(). Update kho_restore_page() so that it will use prep_compound_page() when it restores a folio and make sure it properly sets page count for both large folios and ranges of order-0 pages. Link: https://lkml.kernel.org/r/20251125110917.843744-3-rppt@kernel.org Fixes: a667300bd53f ("kho: add support for preserving vmalloc allocations") Signed-off-by: Mike Rapoport (Microsoft) Reported-by: Pratyush Yadav Cc: Alexander Graf Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 096b7db28baf..9dc51fab604f 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -219,11 +219,11 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, return 0; } -static struct page *kho_restore_page(phys_addr_t phys) +static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) { struct page *page = pfn_to_online_page(PHYS_PFN(phys)); + unsigned int nr_pages, ref_cnt; union kho_page_info info; - unsigned int nr_pages; if (!page) return NULL; @@ -243,11 +243,16 @@ static struct page *kho_restore_page(phys_addr_t phys) /* Head page gets refcount of 1. */ set_page_count(page, 1); - /* For higher order folios, tail pages get a page count of zero. */ + /* + * For higher order folios, tail pages get a page count of zero. + * For physically contiguous order-0 pages every pages gets a page + * count of 1 + */ + ref_cnt = is_folio ? 0 : 1; for (unsigned int i = 1; i < nr_pages; i++) - set_page_count(page + i, 0); + set_page_count(page + i, ref_cnt); - if (info.order > 0) + if (is_folio && info.order) prep_compound_page(page, info.order); adjust_managed_page_count(page, nr_pages); @@ -262,7 +267,7 @@ static struct page *kho_restore_page(phys_addr_t phys) */ struct folio *kho_restore_folio(phys_addr_t phys) { - struct page *page = kho_restore_page(phys); + struct page *page = kho_restore_page(phys, true); return page ? page_folio(page) : NULL; } @@ -287,11 +292,10 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages) while (pfn < end_pfn) { const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - struct page *page = kho_restore_page(PFN_PHYS(pfn)); + struct page *page = kho_restore_page(PFN_PHYS(pfn), false); if (!page) return NULL; - split_page(page, order); pfn += 1 << order; } From 3fa805c37dd4d3e72ae5c58800f3f46ab3ca1f70 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 10 Oct 2025 03:36:50 -0700 Subject: [PATCH 137/139] vmcoreinfo: track and log recoverable hardware errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a generic infrastructure for tracking recoverable hardware errors (HW errors that are visible to the OS but does not cause a panic) and record them for vmcore consumption. This aids post-mortem crash analysis tools by preserving a count and timestamp for the last occurrence of such errors. On the other side, correctable errors, which the OS typically remains unaware of because the underlying hardware handles them transparently, are less relevant for crash dump and therefore are NOT tracked in this infrastructure. Add centralized logging for sources of recoverable hardware errors based on the subsystem it has been notified. hwerror_data is write-only at kernel runtime, and it is meant to be read from vmcore using tools like crash/drgn. For example, this is how it looks like when opening the crashdump from drgn. >>> prog['hwerror_data'] (struct hwerror_info[1]){ { .count = (int)844, .timestamp = (time64_t)1752852018, }, ... This helps fleet operators quickly triage whether a crash may be influenced by hardware recoverable errors (which executes a uncommon code path in the kernel), especially when recoverable errors occurred shortly before a panic, such as the bug fixed by commit ee62ce7a1d90 ("page_pool: Track DMA-mapped pages and unmap them when destroying the pool") This is not intended to replace full hardware diagnostics but provides a fast way to correlate hardware events with kernel panics quickly. Rare machine check exceptions—like those indicated by mce_flags.p5 or mce_flags.winchip—are not accounted for in this method, as they fall outside the intended usage scope for this feature's user base. [leitao@debian.org: add hw-recoverable-errors to toctree] Link: https://lkml.kernel.org/r/20251127-vmcoreinfo_fix-v1-1-26f5b1c43da9@debian.org Link: https://lkml.kernel.org/r/20251010-vmcore_hw_error-v5-1-636ede3efe44@debian.org Signed-off-by: Breno Leitao Suggested-by: Tony Luck Suggested-by: Shuai Xue Reviewed-by: Shuai Xue Reviewed-by: Hanjun Guo [APEI] Cc: Bjorn Helgaas Cc: Bob Moore Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Konrad Rzessutek Wilk Cc: Len Brown Cc: Mahesh Salgaonkar Cc: Mauro Carvalho Chehab Cc: "Oliver O'Halloran" Cc: Omar Sandoval Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- .../driver-api/hw-recoverable-errors.rst | 60 +++++++++++++++++++ Documentation/driver-api/index.rst | 1 + arch/x86/kernel/cpu/mce/core.c | 4 ++ drivers/acpi/apei/ghes.c | 36 +++++++++++ drivers/pci/pcie/aer.c | 2 + include/linux/vmcore_info.h | 8 +++ include/uapi/linux/vmcore.h | 9 +++ kernel/vmcore_info.c | 17 ++++++ 8 files changed, 137 insertions(+) create mode 100644 Documentation/driver-api/hw-recoverable-errors.rst diff --git a/Documentation/driver-api/hw-recoverable-errors.rst b/Documentation/driver-api/hw-recoverable-errors.rst new file mode 100644 index 000000000000..fc526c3454bd --- /dev/null +++ b/Documentation/driver-api/hw-recoverable-errors.rst @@ -0,0 +1,60 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================================= +Recoverable Hardware Error Tracking in vmcoreinfo +================================================= + +Overview +-------- + +This feature provides a generic infrastructure within the Linux kernel to track +and log recoverable hardware errors. These are hardware recoverable errors +visible that might not cause immediate panics but may influence health, mainly +because new code path will be executed in the kernel. + +By recording counts and timestamps of recoverable errors into the vmcoreinfo +crash dump notes, this infrastructure aids post-mortem crash analysis tools in +correlating hardware events with kernel failures. This enables faster triage +and better understanding of root causes, especially in large-scale cloud +environments where hardware issues are common. + +Benefits +-------- + +- Facilitates correlation of hardware recoverable errors with kernel panics or + unusual code paths that lead to system crashes. +- Provides operators and cloud providers quick insights, improving reliability + and reducing troubleshooting time. +- Complements existing full hardware diagnostics without replacing them. + +Data Exposure and Consumption +----------------------------- + +- The tracked error data consists of per-error-type counts and timestamps of + last occurrence. +- This data is stored in the `hwerror_data` array, categorized by error source + types like CPU, memory, PCI, CXL, and others. +- It is exposed via vmcoreinfo crash dump notes and can be read using tools + like `crash`, `drgn`, or other kernel crash analysis utilities. +- There is no other way to read these data other than from crash dumps. +- These errors are divided by area, which includes CPU, Memory, PCI, CXL and + others. + +Typical usage example (in drgn REPL): + +.. code-block:: python + + >>> prog['hwerror_data'] + (struct hwerror_info[HWERR_RECOV_MAX]){ + { + .count = (int)844, + .timestamp = (time64_t)1752852018, + }, + ... + } + +Enabling +-------- + +- This feature is enabled when CONFIG_VMCORE_INFO is set. + diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 3e2a270bd828..a35705b44799 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -96,6 +96,7 @@ Subsystem-specific APIs gpio/index hsi hte/index + hw-recoverable-errors i2c iio/index infiniband diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 460e90a1a0b1..08adbf4cd6ed 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -1700,6 +1701,9 @@ noinstr void do_machine_check(struct pt_regs *regs) } out: + /* Given it didn't panic, mark it as recoverable */ + hwerr_log_error_type(HWERR_RECOV_OTHERS); + instrumentation_end(); clear: diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 97ee19f2cae0..92b0e3c391b2 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -867,6 +868,40 @@ int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd) } EXPORT_SYMBOL_NS_GPL(cxl_cper_kfifo_get, "CXL"); +static void ghes_log_hwerr(int sev, guid_t *sec_type) +{ + if (sev != CPER_SEV_RECOVERABLE) + return; + + if (guid_equal(sec_type, &CPER_SEC_PROC_ARM) || + guid_equal(sec_type, &CPER_SEC_PROC_GENERIC) || + guid_equal(sec_type, &CPER_SEC_PROC_IA)) { + hwerr_log_error_type(HWERR_RECOV_CPU); + return; + } + + if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR) || + guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID) || + guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID) || + guid_equal(sec_type, &CPER_SEC_CXL_MEM_MODULE_GUID)) { + hwerr_log_error_type(HWERR_RECOV_CXL); + return; + } + + if (guid_equal(sec_type, &CPER_SEC_PCIE) || + guid_equal(sec_type, &CPER_SEC_PCI_X_BUS)) { + hwerr_log_error_type(HWERR_RECOV_PCI); + return; + } + + if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { + hwerr_log_error_type(HWERR_RECOV_MEMORY); + return; + } + + hwerr_log_error_type(HWERR_RECOV_OTHERS); +} + static void ghes_do_proc(struct ghes *ghes, const struct acpi_hest_generic_status *estatus) { @@ -888,6 +923,7 @@ static void ghes_do_proc(struct ghes *ghes, if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) fru_text = gdata->fru_text; + ghes_log_hwerr(sev, sec_type); if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 0b5ed4722ac3..e0bcaa896803 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -765,6 +766,7 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev, break; case AER_NONFATAL: aer_info->dev_total_nonfatal_errs++; + hwerr_log_error_type(HWERR_RECOV_PCI); counter = &aer_info->dev_nonfatal_errs[0]; max = AER_MAX_TYPEOF_UNCOR_ERRS; break; diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h index 37e003ae5262..e71518caacdf 100644 --- a/include/linux/vmcore_info.h +++ b/include/linux/vmcore_info.h @@ -5,6 +5,7 @@ #include #include #include +#include #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4) @@ -77,4 +78,11 @@ extern u32 *vmcoreinfo_note; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); + +#ifdef CONFIG_VMCORE_INFO +void hwerr_log_error_type(enum hwerr_error_type src); +#else +static inline void hwerr_log_error_type(enum hwerr_error_type src) {}; +#endif + #endif /* LINUX_VMCORE_INFO_H */ diff --git a/include/uapi/linux/vmcore.h b/include/uapi/linux/vmcore.h index 3e9da91866ff..2ba89fafa518 100644 --- a/include/uapi/linux/vmcore.h +++ b/include/uapi/linux/vmcore.h @@ -15,4 +15,13 @@ struct vmcoredd_header { __u8 dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Device dump's name */ }; +enum hwerr_error_type { + HWERR_RECOV_CPU, + HWERR_RECOV_MEMORY, + HWERR_RECOV_PCI, + HWERR_RECOV_CXL, + HWERR_RECOV_OTHERS, + HWERR_RECOV_MAX, +}; + #endif /* _UAPI_VMCORE_H */ diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index e066d31d08f8..fe9bf8db1922 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -31,6 +31,13 @@ u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; +struct hwerr_info { + atomic_t count; + time64_t timestamp; +}; + +static struct hwerr_info hwerr_data[HWERR_RECOV_MAX]; + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { @@ -118,6 +125,16 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) } EXPORT_SYMBOL(paddr_vmcoreinfo_note); +void hwerr_log_error_type(enum hwerr_error_type src) +{ + if (src < 0 || src >= HWERR_RECOV_MAX) + return; + + atomic_inc(&hwerr_data[src].count); + WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds()); +} +EXPORT_SYMBOL_GPL(hwerr_log_error_type); + static int __init crash_save_vmcoreinfo_init(void) { vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); From 6fb3acdebf65a72df0a95f9fd2c901ff2bc9a3a2 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Mon, 24 Nov 2025 16:53:49 +0000 Subject: [PATCH 138/139] Reinstate "resource: avoid unnecessary lookups in find_next_iomem_res()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 97523a4edb7b ("kernel/resource: remove first_lvl / siblings_only logic") removed an optimization introduced by commit 756398750e11 ("resource: avoid unnecessary lookups in find_next_iomem_res()"). That was not called out in the message of the first commit explicitly so it's not entirely clear whether removing the optimization happened inadvertently or not. As the original commit message of the optimization explains there is no point considering the children of a subtree in find_next_iomem_res() if the top level range does not match. Reinstating the optimization results in performance improvements in systems where /proc/iomem is ~5k lines long. Calling mmap() on /dev/mem in such platforms takes 700-1500μs without the optimisation and 10-50μs with the optimisation. Note that even though commit 97523a4edb7b removed the 'sibling_only' parameter from next_resource(), newer kernels have basically reinstated it under the name 'skip_children'. Link: https://lore.kernel.org/all/20251124165349.3377826-1-ilstam@amazon.com/T/#u Fixes: 97523a4edb7b ("kernel/resource: remove first_lvl / siblings_only logic") Signed-off-by: Ilias Stamatis Acked-by: David Hildenbrand (Red Hat) Cc: Andriy Shevchenko Cc: Baoquan He Cc: "Huang, Ying" Cc: Nadav Amit Signed-off-by: Andrew Morton --- kernel/resource.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/resource.c b/kernel/resource.c index b9fa2a4ce089..e4e9bac12e6e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -341,6 +341,8 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, struct resource *res) { + /* Skip children until we find a top level range that matches */ + bool skip_children = true; struct resource *p; if (!res) @@ -351,7 +353,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for_each_resource(&iomem_resource, p, false) { + for_each_resource(&iomem_resource, p, skip_children) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -362,6 +364,12 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p->end < start) continue; + /* + * We found a top level range that matches what we are looking + * for. Time to start checking children too. + */ + skip_children = false; + /* Found a match, break */ if (is_type_match(p, flags, desc)) break; From aa514a297a0c175239f24a2e582ebd37f0727494 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 25 Nov 2025 00:06:06 +0100 Subject: [PATCH 139/139] calibrate: update header inclusion While cleaning up some headers, I got a build error on this file: init/calibrate.c:20:9: error: call to undeclared function 'kstrtoul'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] Update header inclusions to follow IWYU (Include What You Use) principle. Link: https://lkml.kernel.org/r/20251124230607.1445421-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton --- init/calibrate.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/init/calibrate.c b/init/calibrate.c index 09c2e6102110..63be4c65bc52 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -5,12 +5,15 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#include #include #include -#include -#include +#include +#include #include +#include +#include +#include +#include unsigned long lpj_fine; unsigned long preset_lpj;