From ce3236a3c7d8e048e0bcc7f445f12f911dd9dc7d Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Fri, 10 Oct 2025 17:52:57 +0800 Subject: [PATCH 01/58] ext4: make error code in __ext4fs_dirhash() consistent. Currently __ext4fs_dirhash() returns -1 (-EPERM) if fscrypt doesn't have encryption key, which may confuse users. Make the error code here consistent with existing error code. Signed-off-by: Julian Sun Message-ID: <20251010095257.3008275-1-sunjunchao@bytedance.com> Signed-off-by: Theodore Ts'o --- fs/ext4/hash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 33cd5b6b02d5..48483cd015d3 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -268,7 +268,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, combined_hash = fscrypt_fname_siphash(dir, &qname); } else { ext4_warning_inode(dir, "Siphash requires key"); - return -1; + return -EINVAL; } hash = (__u32)(combined_hash >> 32); From 6640d552185f7c11235c64a832004db9af119b2d Mon Sep 17 00:00:00 2001 From: Ranganath V N Date: Sat, 11 Oct 2025 12:08:29 +0530 Subject: [PATCH 02/58] fs: ext4: fix uninitialized symbols Fix the issue detected by the smatch tool. fs/ext4/inode.c:3583 ext4_map_blocks_atomic_write_slow() error: uninitialized symbol 'next_pblk'. fs/ext4/namei.c:1776 ext4_lookup() error: uninitialized symbol 'de'. fs/ext4/namei.c:1829 ext4_get_parent() error: uninitialized symbol 'de'. fs/ext4/namei.c:3162 ext4_rmdir() error: uninitialized symbol 'de'. fs/ext4/namei.c:3242 __ext4_unlink() error: uninitialized symbol 'de'. fs/ext4/namei.c:3697 ext4_find_delete_entry() error: uninitialized symbol 'de'. These changes enhance code clarity, address static analysis tool errors. 
Signed-off-by: Ranganath V N Message-ID: <20251011063830.47485-1-vnranganath.20@gmail.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- fs/ext4/namei.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e99306a8f47c..6356340b768d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3544,7 +3544,7 @@ static int ext4_map_blocks_atomic_write_slow(handle_t *handle, ext4_lblk_t m_lblk = map->m_lblk; unsigned int m_len = map->m_len; unsigned int mapped_len = 0, m_flags = 0; - ext4_fsblk_t next_pblk; + ext4_fsblk_t next_pblk = 0; bool check_next_pblk = false; int ret = 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2cd36f59c9e3..045616033515 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1762,7 +1762,7 @@ success: static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; - struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_2 *de = NULL; struct buffer_head *bh; if (dentry->d_name.len > EXT4_NAME_LEN) @@ -1818,7 +1818,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; - struct ext4_dir_entry_2 * de; + struct ext4_dir_entry_2 * de = NULL; struct buffer_head *bh; bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL); @@ -3133,7 +3133,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) int retval; struct inode *inode; struct buffer_head *bh; - struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_2 *de = NULL; handle_t *handle = NULL; retval = ext4_emergency_state(dir->i_sb); @@ -3224,7 +3224,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, { int retval = -ENOENT; struct buffer_head *bh; - struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_2 *de = NULL; handle_t *handle; int skip_remove_dentry = 0; @@ -3688,7 +3688,7 @@ static int ext4_find_delete_entry(handle_t *handle, 
struct inode *dir, { int retval = -ENOENT; struct buffer_head *bh; - struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_2 *de = NULL; bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) From a2e5a3cea4b18f6e2575acc444a5e8cce1fc8260 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:17 +0800 Subject: [PATCH 03/58] ext4: correct the checking of quota files before moving extents The move extent operation should return -EOPNOTSUPP if any of the inodes is a quota inode, rather than requiring both to be quota inodes. Fixes: 02749a4c2082 ("ext4: add ext4_is_quota_file()") Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-2-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 4b091c21908f..0f4b7c89edd3 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -485,7 +485,7 @@ mext_check_arguments(struct inode *orig_inode, return -ETXTBSY; } - if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { + if (ext4_is_quota_file(orig_inode) || ext4_is_quota_file(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EOPNOTSUPP; From dd064d5101ea473d39c39ffaa8beeb8f47bbeb09 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:18 +0800 Subject: [PATCH 04/58] ext4: introduce seq counter for the extent status entry In the iomap_write_iter(), the iomap buffered write frame does not hold any locks between querying the inode extent mapping info and performing page cache writes. As a result, the extent mapping can be changed due to concurrent I/O in flight. Similarly, in the iomap_writepage_map(), the write-back process faces a similar problem: concurrent changes can invalidate the extent mapping before the I/O is submitted. 
Therefore, both of these processes must recheck the mapping info after acquiring the folio lock. To address this, similar to XFS, we propose introducing an extent sequence number to serve as a validity cookie for the extent. After commit 24b7a2331fcd ("ext4: clairfy the rules for modifying extents"), we can ensure the extent information should always be processed through the extent status tree, and the extent status tree is always uptodate under i_rwsem or invalidate_lock or folio lock, so it's safe to introduce this sequence number. The sequence number will be increased whenever the extent status tree changes, preparing for the buffered write iomap conversion. Besides, this mechanism is also applicable for the moving extents case. In move_extent_per_page(), it also needs to reacquire data_sem and check the mapping info again under the folio lock. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-3-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/extents_status.c | 25 +++++++++++++++++++++---- fs/ext4/super.c | 1 + include/trace/events/ext4.h | 23 +++++++++++++++-------- 4 files changed, 39 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 57087da6c7be..eff97b3a1093 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1138,6 +1138,8 @@ struct ext4_inode_info { ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for extents to shrink. Protected by i_es_lock */ + u64 i_es_seq; /* Change counter for extents. 
+ Protected by i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 31dc0496f8d0..c3daa57ecd35 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -235,6 +235,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es) return es->es_lblk + es->es_len - 1; } +static inline void ext4_es_inc_seq(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1); +} + /* * search through the tree for an delayed extent with a given offset. If * it can't be found, try to find next extent. @@ -906,7 +913,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); - trace_ext4_es_insert_extent(inode, &newes); ext4_es_insert_extent_check(inode, &newes); @@ -955,6 +961,11 @@ retry: } pending = err3; } + /* + * TODO: For cache on-disk extents, there is no need to increment + * the sequence counter, this requires future optimization. + */ + ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); /* @@ -981,6 +992,7 @@ error: if (err1 || err2 || err3 < 0) goto retry; + trace_ext4_es_insert_extent(inode, &newes); ext4_es_print_tree(inode); return; } @@ -1550,7 +1562,6 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", lblk, len, inode->i_ino); @@ -1570,16 +1581,21 @@ retry: */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end, &reserved, es); + if (err) + goto error; /* Free preallocated extent if it didn't get used. 
*/ if (es) { if (!es->es_len) __es_free_extent(es); es = NULL; } + ext4_es_inc_seq(inode); +error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry; + trace_ext4_es_remove_extent(inode, lblk, len); ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); } @@ -2140,8 +2156,6 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); - trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, - end_allocated); ext4_es_insert_extent_check(inode, &newes); @@ -2196,11 +2210,14 @@ retry: pr2 = NULL; } } + ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2 || err3 < 0) goto retry; + trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, + end_allocated); ext4_es_print_tree(inode); ext4_print_pending_tree(inode); return; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 33e7c08c9529..760c9d7588be 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1406,6 +1406,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; + ei->i_es_seq = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index a374e7ea7e57..6a0754d38acf 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2210,7 +2210,8 @@ DECLARE_EVENT_CLASS(ext4__es_extent, __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) - __field( char, status ) + __field( char, status ) + __field( u64, seq ) ), TP_fast_assign( @@ -2220,13 +2221,15 @@ DECLARE_EVENT_CLASS(ext4__es_extent, __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); + __entry->seq = EXT4_I(inode)->i_es_seq; ), - 
TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, - __entry->pblk, show_extent_status(__entry->status)) + __entry->pblk, show_extent_status(__entry->status), + __entry->seq) ); DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent, @@ -2251,6 +2254,7 @@ TRACE_EVENT(ext4_es_remove_extent, __field( ino_t, ino ) __field( loff_t, lblk ) __field( loff_t, len ) + __field( u64, seq ) ), TP_fast_assign( @@ -2258,12 +2262,13 @@ TRACE_EVENT(ext4_es_remove_extent, __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; + __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%lld/%lld)", + TP_printk("dev %d,%d ino %lu es [%lld/%lld) seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->lblk, __entry->len) + __entry->lblk, __entry->len, __entry->seq) ); TRACE_EVENT(ext4_es_find_extent_range_enter, @@ -2523,6 +2528,7 @@ TRACE_EVENT(ext4_es_insert_delayed_extent, __field( char, status ) __field( bool, lclu_allocated ) __field( bool, end_allocated ) + __field( u64, seq ) ), TP_fast_assign( @@ -2534,15 +2540,16 @@ TRACE_EVENT(ext4_es_insert_delayed_extent, __entry->status = ext4_es_status(es); __entry->lclu_allocated = lclu_allocated; __entry->end_allocated = end_allocated; + __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " - "allocated %d %d", + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s allocated %d %d seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), - __entry->lclu_allocated, __entry->end_allocated) + __entry->lclu_allocated, __entry->end_allocated, + __entry->seq) ); /* fsmap traces */ From 
7da5565cab4069b2b171dbfa7554b596a7fdf827 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:19 +0800 Subject: [PATCH 05/58] ext4: make ext4_es_lookup_extent() pass out the extent seq counter When querying extents in the extent status tree, we should hold the data_sem if we want to obtain the sequence number as a valid cookie simultaneously. However, currently, ext4_map_blocks() calls ext4_es_lookup_extent() without holding data_sem. Therefore, we should acquire i_es_lock instead, which also ensures that the sequence cookie and the extent remain consistent. Consequently, make ext4_es_lookup_extent() to pass out the sequence number when necessary. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-4-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 2 +- fs/ext4/extents_status.c | 6 ++++-- fs/ext4/extents_status.h | 2 +- fs/ext4/inode.c | 8 ++++---- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ca5499e9412b..c7d219e6c6d8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2213,7 +2213,7 @@ static int ext4_fill_es_cache_info(struct inode *inode, while (block <= end) { next = 0; flags = 0; - if (!ext4_es_lookup_extent(inode, block, &next, &es)) + if (!ext4_es_lookup_extent(inode, block, &next, &es, NULL)) break; if (ext4_es_is_unwritten(&es)) flags |= FIEMAP_EXTENT_UNWRITTEN; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c3daa57ecd35..e04fbf10fe4f 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1039,8 +1039,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, * Return: 1 on found, 0 on not */ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t *next_lblk, - struct extent_status *es) + ext4_lblk_t *next_lblk, struct extent_status *es, + u64 *pseq) { struct ext4_es_tree *tree; struct ext4_es_stats *stats; @@ -1099,6 +1099,8 @@ out: } 
else *next_lblk = 0; } + if (pseq) + *pseq = EXT4_I(inode)->i_es_seq; } else { percpu_counter_inc(&stats->es_stats_cache_misses); } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8f9c008d11e8..f3396cf32b44 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -148,7 +148,7 @@ extern void ext4_es_find_extent_range(struct inode *inode, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, - struct extent_status *es); + struct extent_status *es, u64 *pseq); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6356340b768d..b62c1a87ed6b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -649,7 +649,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, * extent status tree. */ if (flags & EXT4_GET_BLOCKS_PRE_IO && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { if (ext4_es_is_written(&es)) return retval; } @@ -723,7 +723,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -1908,7 +1908,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); @@ -1961,7 +1961,7 @@ add_delayed: * is 
held in write mode, before inserting a new da entry in * the extent status tree. */ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); From 07c440e8da8fee5b3512a5742ddc71776a0041ac Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:20 +0800 Subject: [PATCH 06/58] ext4: pass out extent seq counter when mapping blocks When creating or querying mapping blocks using the ext4_map_blocks() and ext4_map_{query|create}_blocks() helpers, also pass out the extent sequence number of the block mapping info through the ext4_map_blocks structure. This sequence number can later serve as a valid cookie within iomap infrastructure and the move extents procedure. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-5-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index eff97b3a1093..9f127aedbaee 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -260,6 +260,7 @@ struct ext4_map_blocks { ext4_lblk_t m_lblk; unsigned int m_len; unsigned int m_flags; + u64 m_seq; }; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b62c1a87ed6b..783c883d4d5e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -550,10 +550,13 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, retval = ext4_ext_map_blocks(handle, inode, map, flags); else retval = ext4_ind_map_blocks(handle, inode, map, flags); - - if (retval <= 0) + if (retval < 0) return retval; + /* A hole? 
*/ + if (retval == 0) + goto out; + if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " @@ -573,11 +576,13 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, false); - return retval; + } else { + retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); } - - return ext4_map_query_blocks_next_in_leaf(handle, inode, map, - orig_mlen); +out: + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); + return retval; } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, @@ -649,7 +654,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, * extent status tree. */ if (flags & EXT4_GET_BLOCKS_PRE_IO && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es)) return retval; } @@ -658,6 +663,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); return retval; } @@ -723,7 +729,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -1979,6 +1985,8 @@ add_delayed: map->m_flags |= EXT4_MAP_DELAYED; retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); + if (!retval) + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); 
up_write(&EXT4_I(inode)->i_data_sem); return retval; From c9570b6634243cc7a55307dffd1965a3b8798591 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:21 +0800 Subject: [PATCH 07/58] ext4: use EXT4_B_TO_LBLK() in mext_check_arguments() Switch to using EXT4_B_TO_LBLK() to calculate the EOF position of the origin and donor inodes, instead of using open-coded calculations. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-6-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 0f4b7c89edd3..6175906c7119 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -461,12 +461,6 @@ mext_check_arguments(struct inode *orig_inode, __u64 donor_start, __u64 *len) { __u64 orig_eof, donor_eof; - unsigned int blkbits = orig_inode->i_blkbits; - unsigned int blocksize = 1 << blkbits; - - orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; - donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; - if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ext4_debug("ext4 move extent: suid or sgid is set" @@ -526,6 +520,9 @@ mext_check_arguments(struct inode *orig_inode, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } + + orig_eof = EXT4_B_TO_LBLK(orig_inode, i_size_read(orig_inode)); + donor_eof = EXT4_B_TO_LBLK(donor_inode, i_size_read(donor_inode)); if (orig_eof <= orig_start) *len = 0; else if (orig_eof < orig_start + *len - 1) From 22218516e462d59b27ffcfc9dd75d4f98e482c51 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:22 +0800 Subject: [PATCH 08/58] ext4: add mext_check_validity() to do basic check Currently, the basic validation checks during the move extent operation are scattered across __ext4_ioctl() and ext4_move_extents(), which makes the code somewhat disorganized. 
Introduce a new helper, mext_check_validity(), to handle these checks. This change involves only code relocation without any logical modifications. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-7-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 10 ----- fs/ext4/move_extent.c | 102 +++++++++++++++++++++++++++--------------- 2 files changed, 65 insertions(+), 47 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a93a7baae990..366a9b615bf0 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1641,16 +1641,6 @@ group_extend_out: if (!(fd_file(donor)->f_mode & FMODE_WRITE)) return -EBADF; - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online defrag not supported with bigalloc"); - return -EOPNOTSUPP; - } else if (IS_DAX(inode)) { - ext4_msg(sb, KERN_ERR, - "Online defrag not supported with DAX"); - return -EOPNOTSUPP; - } - err = mnt_want_write_file(filp); if (err) return err; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 6175906c7119..cdd175d5c1f3 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -442,6 +442,68 @@ repair_branches: goto unlock_folios; } +/* + * Check the validity of the basic filesystem environment and the + * inodes' support status. 
+ */ +static int mext_check_validity(struct inode *orig_inode, + struct inode *donor_inode) +{ + struct super_block *sb = orig_inode->i_sb; + + /* origin and donor should be different inodes */ + if (orig_inode == donor_inode) { + ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* origin and donor should belone to the same filesystem */ + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (IS_DAX(orig_inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with DAX"); + return -EOPNOTSUPP; + } + + /* + * TODO: it's not obvious how to swap blocks for inodes with full + * journaling enabled. 
+ */ + if (ext4_should_journal_data(orig_inode) || + ext4_should_journal_data(donor_inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with data journaling"); + return -EOPNOTSUPP; + } + + if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported for encrypted files"); + return -EOPNOTSUPP; + } + + return 0; +} + /** * mext_check_arguments - Check whether move extent can be done * @@ -567,43 +629,9 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, ext4_lblk_t d_start = donor_blk; int ret; - if (orig_inode->i_sb != donor_inode->i_sb) { - ext4_debug("ext4 move extent: The argument files " - "should be in same FS [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* orig and donor should be different inodes */ - if (orig_inode == donor_inode) { - ext4_debug("ext4 move extent: The argument files should not " - "be same inode [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* Regular file check */ - if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { - ext4_debug("ext4 move extent: The argument files should be " - "regular file [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* TODO: it's not obvious how to swap blocks for inodes with full - journaling enabled */ - if (ext4_should_journal_data(orig_inode) || - ext4_should_journal_data(donor_inode)) { - ext4_msg(orig_inode->i_sb, KERN_ERR, - "Online defrag not supported with data journaling"); - return -EOPNOTSUPP; - } - - if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { - ext4_msg(orig_inode->i_sb, KERN_ERR, - "Online defrag not supported for encrypted files"); - return -EOPNOTSUPP; - } + ret = mext_check_validity(orig_inode, donor_inode); + if (ret) + return ret; /* Protect orig and donor inodes against a truncate */ 
lock_two_nondirectories(orig_inode, donor_inode); From 57c1df07f1ac2668a4e65796565adcbc6995f86c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:23 +0800 Subject: [PATCH 09/58] ext4: refactor mext_check_arguments() When moving extents, mext_check_validity() performs some basic file system and file checks. However, some essential checks need to be performed after acquiring the i_rwsem are still scattered in mext_check_arguments(). Move those checks into mext_check_validity() and make it executes entirely under the i_rwsem to make the checks clearer. Furthermore, rename mext_check_arguments() to mext_check_adjust_range(), as it only performs checks and length adjustments on the move extent range. Finally, also change the print message for the non-existent file check to be consistent with other unsupported checks. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-8-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 97 +++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 54 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index cdd175d5c1f3..0191a3c746db 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -501,60 +501,36 @@ static int mext_check_validity(struct inode *orig_inode, return -EOPNOTSUPP; } - return 0; -} - -/** - * mext_check_arguments - Check whether move extent can be done - * - * @orig_inode: original inode - * @donor_inode: donor inode - * @orig_start: logical start offset in block for orig - * @donor_start: logical start offset in block for donor - * @len: the number of blocks to be moved - * - * Check the arguments of ext4_move_extents() whether the files can be - * exchanged with each other. - * Return 0 on success, or a negative error value on failure. 
- */ -static int -mext_check_arguments(struct inode *orig_inode, - struct inode *donor_inode, __u64 orig_start, - __u64 donor_start, __u64 *len) -{ - __u64 orig_eof, donor_eof; + /* Ext4 move extent supports only extent based file */ + if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS)) || + !(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported for non-extent files"); + return -EOPNOTSUPP; + } if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { - ext4_debug("ext4 move extent: suid or sgid is set" - " to donor file [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) + if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) { + ext4_debug("ext4 move extent: donor should not be immutable or append file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); return -EPERM; + } /* Ext4 move extent does not support swap files */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); + orig_inode->i_ino, donor_inode->i_ino); return -ETXTBSY; } if (ext4_is_quota_file(orig_inode) || ext4_is_quota_file(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EOPNOTSUPP; - } - - /* Ext4 move extent supports only extent based file */ - if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { - ext4_debug("ext4 move extent: orig file is not extents " - "based file [ino:orig %lu]\n", orig_inode->i_ino); - return -EOPNOTSUPP; - } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { - ext4_debug("ext4 move extent: donor file is 
not extents " - "based file [ino:donor %lu]\n", donor_inode->i_ino); + orig_inode->i_ino, donor_inode->i_ino); return -EOPNOTSUPP; } @@ -563,12 +539,25 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } + return 0; +} + +/* + * Check the moving range of ext4_move_extents() whether the files can be + * exchanged with each other, and adjust the length to fit within the file + * size. Return 0 on success, or a negative error value on failure. + */ +static int mext_check_adjust_range(struct inode *orig_inode, + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) +{ + __u64 orig_eof, donor_eof; + /* Start offset should be same */ if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { - ext4_debug("ext4 move extent: orig and donor's start " - "offsets are not aligned [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); + ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -577,9 +566,9 @@ mext_check_arguments(struct inode *orig_inode, (*len > EXT_MAX_BLOCKS) || (donor_start + *len >= EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { - ext4_debug("ext4 move extent: Can't handle over [%u] blocks " - "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, - orig_inode->i_ino, donor_inode->i_ino); + ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %lu, donor %lu]\n", + EXT_MAX_BLOCKS, + orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -594,9 +583,8 @@ mext_check_arguments(struct inode *orig_inode, else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { - ext4_debug("ext4 move extent: len should not be 0 " - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, - donor_inode->i_ino); + ext4_debug("ext4 move extent: len should not be 0 [ino:orig %lu, donor %lu]\n", + 
orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -629,22 +617,22 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, ext4_lblk_t d_start = donor_blk; int ret; - ret = mext_check_validity(orig_inode, donor_inode); - if (ret) - return ret; - /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); + ret = mext_check_validity(orig_inode, donor_inode); + if (ret) + goto unlock; + /* Wait for all existing dio workers */ inode_dio_wait(orig_inode); inode_dio_wait(donor_inode); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(orig_inode, donor_inode); - /* Check the filesystem environment whether move_extent can be done */ - ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, - donor_blk, &len); + /* Check and adjust the specified move_extent range. */ + ret = mext_check_adjust_range(orig_inode, donor_inode, orig_blk, + donor_blk, &len); if (ret) goto out; o_end = o_start + len; @@ -725,6 +713,7 @@ out: ext4_free_ext_path(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); +unlock: unlock_two_nondirectories(orig_inode, donor_inode); return ret; From 37cb211f97f8a0d30d7195d6c427f3233fa0271f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:24 +0800 Subject: [PATCH 10/58] ext4: rename mext_page_mkuptodate() to mext_folio_mkuptodate() mext_page_mkuptodate() no longer works on a single page, so rename it to mext_folio_mkuptodate(). 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-9-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 0191a3c746db..2df6072b26c0 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -165,7 +165,7 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, } /* Force folio buffers uptodate w/o dropping folio's lock */ -static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) +static int mext_folio_mkuptodate(struct folio *folio, size_t from, size_t to) { struct inode *inode = folio->mapping->host; sector_t block; @@ -358,7 +358,7 @@ again: data_copy: from = offset_in_folio(folio[0], orig_blk_offset << orig_inode->i_blkbits); - *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); + *err = mext_folio_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; From 962e8a01eab95597bb571672f59ab2ec9fec342a Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:25 +0800 Subject: [PATCH 11/58] ext4: introduce mext_move_extent() When moving extents, the current move_extent_per_page() process can only move extents of length PAGE_SIZE at a time, which is highly inefficient, especially when the fragmentation of the file is not particularly severe, this will result in a large number of unnecessary extent split and merge operations. Moreover, since the ext4 file system now supports large folios, using PAGE_SIZE as the processing unit is no longer practical. Therefore, introduce a new move extents method, mext_move_extent(). It moves one extent of the origin inode at a time, but not exceeding the size of a folio. 
The parameters for the move are passed through the new mext_data data structure, which includes the origin inode, donor inode, the mapping extent of the origin inode to be moved, and the starting offset of the donor inode. The move process is similar to move_extent_per_page() and can be categorized into three types: MEXT_SKIP_EXTENT, MEXT_MOVE_EXTENT, and MEXT_COPY_DATA. MEXT_SKIP_EXTENT indicates that the corresponding area of the donor file is a hole, meaning no actual space is allocated, so the move is skipped. MEXT_MOVE_EXTENT indicates that the corresponding areas of both the origin and donor files are unwritten, so no data needs to be copied; only the extents are swapped. MEXT_COPY_DATA indicates that the corresponding areas of both the origin and donor files contain data, so data must be copied. The data copying is performed in three steps: first, the data from the original location is read into the page cache; then, the extents are swapped, and the page cache is rebuilt to reflect the index of the physical blocks; finally, the dirty page cache is marked and written back to ensure that the data is written to disk before the metadata is persisted. One important point to note is that the folio lock and i_data_sem are held only during the moving process. Therefore, before moving an extent, it is necessary to check whether the sequence cookie of the area to be moved has changed while holding the folio lock. If a change is detected, it indicates that concurrent write-back operations may have occurred during this period, and the type of the extent to be moved can no longer be considered reliable. For example, it may have changed from unwritten to written. In such cases, return -ESTALE, and the calling function should reacquire the move extent of the original file and retry the movement. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-10-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 224 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 224 insertions(+) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2df6072b26c0..92a716c56740 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -13,6 +13,13 @@ #include "ext4.h" #include "ext4_extents.h" +struct mext_data { + struct inode *orig_inode; /* Origin file inode */ + struct inode *donor_inode; /* Donor file inode */ + struct ext4_map_blocks orig_map;/* Origin file's move mapping */ + ext4_lblk_t donor_lblk; /* Start block of the donor file */ +}; + /** * get_ext_path() - Find an extent path for designated logical block number. * @inode: inode to be searched @@ -164,6 +171,14 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, return 0; } +static void mext_folio_double_unlock(struct folio *folio[2]) +{ + folio_unlock(folio[0]); + folio_put(folio[0]); + folio_unlock(folio[1]); + folio_put(folio[1]); +} + /* Force folio buffers uptodate w/o dropping folio's lock */ static int mext_folio_mkuptodate(struct folio *folio, size_t from, size_t to) { @@ -238,6 +253,215 @@ out: return 0; } +enum mext_move_type {MEXT_SKIP_EXTENT, MEXT_MOVE_EXTENT, MEXT_COPY_DATA}; + +/* + * Start to move extent between the origin inode and the donor inode, + * hold one folio for each inode and check the candidate moving extent + * mapping status again. 
+ */ +static int mext_move_begin(struct mext_data *mext, struct folio *folio[2], + enum mext_move_type *move_type) +{ + struct inode *orig_inode = mext->orig_inode; + struct inode *donor_inode = mext->donor_inode; + unsigned int blkbits = orig_inode->i_blkbits; + struct ext4_map_blocks donor_map = {0}; + loff_t orig_pos, donor_pos; + size_t move_len; + int ret; + + orig_pos = ((loff_t)mext->orig_map.m_lblk) << blkbits; + donor_pos = ((loff_t)mext->donor_lblk) << blkbits; + ret = mext_folio_double_lock(orig_inode, donor_inode, + orig_pos >> PAGE_SHIFT, donor_pos >> PAGE_SHIFT, folio); + if (ret) + return ret; + + /* + * Check the origin inode's mapping information again under the + * folio lock, as we do not hold the i_data_sem at all times, and + * it may change during the concurrent write-back operation. + */ + if (mext->orig_map.m_seq != READ_ONCE(EXT4_I(orig_inode)->i_es_seq)) { + ret = -ESTALE; + goto error; + } + + /* Adjust the moving length according to the length of shorter folio. */ + move_len = umin(folio_pos(folio[0]) + folio_size(folio[0]) - orig_pos, + folio_pos(folio[1]) + folio_size(folio[1]) - donor_pos); + move_len >>= blkbits; + if (move_len < mext->orig_map.m_len) + mext->orig_map.m_len = move_len; + + donor_map.m_lblk = mext->donor_lblk; + donor_map.m_len = mext->orig_map.m_len; + donor_map.m_flags = 0; + ret = ext4_map_blocks(NULL, donor_inode, &donor_map, 0); + if (ret < 0) + goto error; + + /* Adjust the moving length according to the donor mapping length. */ + mext->orig_map.m_len = donor_map.m_len; + + /* Skip moving if the donor range is a hole or a delalloc extent. */ + if (!(donor_map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) + *move_type = MEXT_SKIP_EXTENT; + /* If both mapping ranges are unwritten, no need to copy data. 
*/ + else if ((mext->orig_map.m_flags & EXT4_MAP_UNWRITTEN) && + (donor_map.m_flags & EXT4_MAP_UNWRITTEN)) + *move_type = MEXT_MOVE_EXTENT; + else + *move_type = MEXT_COPY_DATA; + + return 0; +error: + mext_folio_double_unlock(folio); + return ret; +} + +/* + * Re-create the new moved mapping buffers of the original inode and commit + * the entire written range. + */ +static int mext_folio_mkwrite(struct inode *inode, struct folio *folio, + size_t from, size_t to) +{ + unsigned int blocksize = i_blocksize(inode); + struct buffer_head *bh, *head; + size_t block_start, block_end; + sector_t block; + int ret; + + head = folio_buffers(folio); + if (!head) + head = create_empty_buffers(folio, blocksize, 0); + + block = folio_pos(folio) >> inode->i_blkbits; + block_end = 0; + bh = head; + do { + block_start = block_end; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) + continue; + + ret = ext4_get_block(inode, block, bh, 0); + if (ret) + return ret; + } while (block++, (bh = bh->b_this_page) != head); + + block_commit_write(folio, from, to); + return 0; +} + +/* + * Save the data in original inode extent blocks and replace one folio size + * aligned original inode extent with one or one partial donor inode extent, + * and then write out the saved data in new original inode blocks. Pass out + * the replaced block count through m_len. Return 0 on success, and an error + * code otherwise. 
+ */ +static __used int mext_move_extent(struct mext_data *mext, u64 *m_len) +{ + struct inode *orig_inode = mext->orig_inode; + struct inode *donor_inode = mext->donor_inode; + struct ext4_map_blocks *orig_map = &mext->orig_map; + unsigned int blkbits = orig_inode->i_blkbits; + struct folio *folio[2] = {NULL, NULL}; + loff_t from, length; + enum mext_move_type move_type = 0; + handle_t *handle; + u64 r_len = 0; + unsigned int credits; + int ret, ret2; + + *m_len = 0; + credits = ext4_chunk_trans_extent(orig_inode, 0) * 2; + handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ret = mext_move_begin(mext, folio, &move_type); + if (ret) + goto stop_handle; + + if (move_type == MEXT_SKIP_EXTENT) + goto unlock; + + /* + * Copy the data. First, read the original inode data into the page + * cache. Then, release the existing mapping relationships and swap + * the extent. Finally, re-establish the new mapping relationships + * and dirty the page cache. + */ + if (move_type == MEXT_COPY_DATA) { + from = offset_in_folio(folio[0], + ((loff_t)orig_map->m_lblk) << blkbits); + length = ((loff_t)orig_map->m_len) << blkbits; + + ret = mext_folio_mkuptodate(folio[0], from, from + length); + if (ret) + goto unlock; + } + + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { + ret = -EBUSY; + goto unlock; + } + + /* Move extent */ + ext4_double_down_write_data_sem(orig_inode, donor_inode); + *m_len = ext4_swap_extents(handle, orig_inode, donor_inode, + orig_map->m_lblk, mext->donor_lblk, + orig_map->m_len, 1, &ret); + ext4_double_up_write_data_sem(orig_inode, donor_inode); + + /* A short-length swap cannot occur after a successful swap extent. 
*/ + if (WARN_ON_ONCE(!ret && (*m_len != orig_map->m_len))) + ret = -EIO; + + if (!(*m_len) || (move_type == MEXT_MOVE_EXTENT)) + goto unlock; + + /* Copy data */ + length = (*m_len) << blkbits; + ret2 = mext_folio_mkwrite(orig_inode, folio[0], from, from + length); + if (ret2) { + if (!ret) + ret = ret2; + goto repair_branches; + } + /* + * Even in case of data=writeback it is reasonable to pin + * inode to transaction, to prevent unexpected data loss. + */ + ret2 = ext4_jbd2_inode_add_write(handle, orig_inode, + ((loff_t)orig_map->m_lblk) << blkbits, length); + if (!ret) + ret = ret2; +unlock: + mext_folio_double_unlock(folio); +stop_handle: + ext4_journal_stop(handle); + return ret; + +repair_branches: + ret2 = 0; + r_len = ext4_swap_extents(handle, donor_inode, orig_inode, + mext->donor_lblk, orig_map->m_lblk, + *m_len, 0, &ret2); + if (ret2 || r_len != *m_len) { + ext4_error_inode_block(orig_inode, (sector_t)(orig_map->m_lblk), + EIO, "Unable to copy data block, data will be lost!"); + ret = -EIO; + } + *m_len = 0; + goto unlock; +} + /** * move_extent_per_page - Move extent data per page * From 4589c4518f7c19e49ae2ba4767a86db881017793 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:26 +0800 Subject: [PATCH 12/58] ext4: switch to using the new extent movement method Now that we have mext_move_extent(), we can switch to this new interface and deprecate move_extent_per_page(). First, after acquiring the i_rwsem, we can directly use ext4_map_blocks() to obtain a contiguous extent from the original inode as the extent to be moved. It can and it's safe to get mapping information from the extent status tree without needing to access the ondisk extent tree, because ext4_move_extent() will check the sequence cookie under the folio lock. Then, after populating the mext_data structure, we call ext4_move_extent() to move the extent. 
Finally, the length of the extent will be adjusted in mext.orig_map.m_len and the actual length moved is returned through m_len. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-11-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 395 ++++++------------------------------------ 1 file changed, 51 insertions(+), 344 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 92a716c56740..933c2afed550 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -20,29 +20,6 @@ struct mext_data { ext4_lblk_t donor_lblk; /* Start block of the donor file */ }; -/** - * get_ext_path() - Find an extent path for designated logical block number. - * @inode: inode to be searched - * @lblock: logical block number to find an extent path - * @path: pointer to an extent path - * - * ext4_find_extent wrapper. Return an extent path pointer on success, - * or an error pointer on failure. - */ -static inline struct ext4_ext_path * -get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path *path) -{ - path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE); - if (IS_ERR(path)) - return path; - if (path[ext_depth(inode)].p_ext == NULL) { - ext4_free_ext_path(path); - return ERR_PTR(-ENODATA); - } - return path; -} - /** * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem * @first: inode to be locked @@ -59,7 +36,6 @@ ext4_double_down_write_data_sem(struct inode *first, struct inode *second) } else { down_write(&EXT4_I(second)->i_data_sem); down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); - } } @@ -78,42 +54,6 @@ ext4_double_up_write_data_sem(struct inode *orig_inode, up_write(&EXT4_I(donor_inode)->i_data_sem); } -/** - * mext_check_coverage - Check that all extents in range has the same type - * - * @inode: inode in question - * @from: block offset of inode - * @count: block count to be checked - * @unwritten: extents expected to be 
unwritten - * @err: pointer to save error value - * - * Return 1 if all extents in range has expected type, and zero otherwise. - */ -static int -mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, - int unwritten, int *err) -{ - struct ext4_ext_path *path = NULL; - struct ext4_extent *ext; - int ret = 0; - ext4_lblk_t last = from + count; - while (from < last) { - path = get_ext_path(inode, from, path); - if (IS_ERR(path)) { - *err = PTR_ERR(path); - return ret; - } - ext = path[ext_depth(inode)].p_ext; - if (unwritten != ext4_ext_is_unwritten(ext)) - goto out; - from += ext4_ext_get_actual_len(ext); - } - ret = 1; -out: - ext4_free_ext_path(path); - return ret; -} - /** * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2 * @@ -363,7 +303,7 @@ static int mext_folio_mkwrite(struct inode *inode, struct folio *folio, * the replaced block count through m_len. Return 0 on success, and an error * code otherwise. */ -static __used int mext_move_extent(struct mext_data *mext, u64 *m_len) +static int mext_move_extent(struct mext_data *mext, u64 *m_len) { struct inode *orig_inode = mext->orig_inode; struct inode *donor_inode = mext->donor_inode; @@ -462,210 +402,6 @@ repair_branches: goto unlock; } -/** - * move_extent_per_page - Move extent data per page - * - * @o_filp: file structure of original file - * @donor_inode: donor inode - * @orig_page_offset: page index on original file - * @donor_page_offset: page index on donor file - * @data_offset_in_page: block index where data swapping starts - * @block_len_in_page: the number of blocks to be swapped - * @unwritten: orig extent is unwritten or not - * @err: pointer to save return value - * - * Save the data in original inode blocks and replace original inode extents - * with donor inode extents by calling ext4_swap_extents(). - * Finally, write out the saved data in new original inode blocks. Return - * replaced block count. 
- */ -static int -move_extent_per_page(struct file *o_filp, struct inode *donor_inode, - pgoff_t orig_page_offset, pgoff_t donor_page_offset, - int data_offset_in_page, - int block_len_in_page, int unwritten, int *err) -{ - struct inode *orig_inode = file_inode(o_filp); - struct folio *folio[2] = {NULL, NULL}; - handle_t *handle; - ext4_lblk_t orig_blk_offset, donor_blk_offset; - unsigned long blocksize = orig_inode->i_sb->s_blocksize; - unsigned int tmp_data_size, data_size, replaced_size; - int i, err2, jblocks, retries = 0; - int replaced_count = 0; - int from; - int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; - struct super_block *sb = orig_inode->i_sb; - struct buffer_head *bh = NULL; - - /* - * It needs twice the amount of ordinary journal buffers because - * inode and donor_inode may change each different metadata blocks. - */ -again: - *err = 0; - jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page, - block_len_in_page) * 2; - handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); - if (IS_ERR(handle)) { - *err = PTR_ERR(handle); - return 0; - } - - orig_blk_offset = orig_page_offset * blocks_per_page + - data_offset_in_page; - - donor_blk_offset = donor_page_offset * blocks_per_page + - data_offset_in_page; - - /* Calculate data_size */ - if ((orig_blk_offset + block_len_in_page - 1) == - ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { - /* Replace the last block */ - tmp_data_size = orig_inode->i_size & (blocksize - 1); - /* - * If data_size equal zero, it shows data_size is multiples of - * blocksize. So we set appropriate value. 
- */ - if (tmp_data_size == 0) - tmp_data_size = blocksize; - - data_size = tmp_data_size + - ((block_len_in_page - 1) << orig_inode->i_blkbits); - } else - data_size = block_len_in_page << orig_inode->i_blkbits; - - replaced_size = data_size; - - *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset, - donor_page_offset, folio); - if (unlikely(*err < 0)) - goto stop_journal; - /* - * If orig extent was unwritten it can become initialized - * at any time after i_data_sem was dropped, in order to - * serialize with delalloc we have recheck extent while we - * hold page's lock, if it is still the case data copy is not - * necessary, just swap data blocks between orig and donor. - */ - if (unwritten) { - ext4_double_down_write_data_sem(orig_inode, donor_inode); - /* If any of extents in range became initialized we have to - * fallback to data copying */ - unwritten = mext_check_coverage(orig_inode, orig_blk_offset, - block_len_in_page, 1, err); - if (*err) - goto drop_data_sem; - - unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, - block_len_in_page, 1, err); - if (*err) - goto drop_data_sem; - - if (!unwritten) { - ext4_double_up_write_data_sem(orig_inode, donor_inode); - goto data_copy; - } - if (!filemap_release_folio(folio[0], 0) || - !filemap_release_folio(folio[1], 0)) { - *err = -EBUSY; - goto drop_data_sem; - } - replaced_count = ext4_swap_extents(handle, orig_inode, - donor_inode, orig_blk_offset, - donor_blk_offset, - block_len_in_page, 1, err); - drop_data_sem: - ext4_double_up_write_data_sem(orig_inode, donor_inode); - goto unlock_folios; - } -data_copy: - from = offset_in_folio(folio[0], - orig_blk_offset << orig_inode->i_blkbits); - *err = mext_folio_mkuptodate(folio[0], from, from + replaced_size); - if (*err) - goto unlock_folios; - - /* At this point all buffers in range are uptodate, old mapping layout - * is no longer required, try to drop it now. 
*/ - if (!filemap_release_folio(folio[0], 0) || - !filemap_release_folio(folio[1], 0)) { - *err = -EBUSY; - goto unlock_folios; - } - ext4_double_down_write_data_sem(orig_inode, donor_inode); - replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, - orig_blk_offset, donor_blk_offset, - block_len_in_page, 1, err); - ext4_double_up_write_data_sem(orig_inode, donor_inode); - if (*err) { - if (replaced_count) { - block_len_in_page = replaced_count; - replaced_size = - block_len_in_page << orig_inode->i_blkbits; - } else - goto unlock_folios; - } - /* Perform all necessary steps similar write_begin()/write_end() - * but keeping in mind that i_size will not change */ - bh = folio_buffers(folio[0]); - if (!bh) - bh = create_empty_buffers(folio[0], - 1 << orig_inode->i_blkbits, 0); - for (i = 0; i < from >> orig_inode->i_blkbits; i++) - bh = bh->b_this_page; - for (i = 0; i < block_len_in_page; i++) { - *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); - if (*err < 0) - goto repair_branches; - bh = bh->b_this_page; - } - - block_commit_write(folio[0], from, from + replaced_size); - - /* Even in case of data=writeback it is reasonable to pin - * inode to transaction, to prevent unexpected data loss */ - *err = ext4_jbd2_inode_add_write(handle, orig_inode, - (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); - -unlock_folios: - folio_unlock(folio[0]); - folio_put(folio[0]); - folio_unlock(folio[1]); - folio_put(folio[1]); -stop_journal: - ext4_journal_stop(handle); - if (*err == -ENOSPC && - ext4_should_retry_alloc(sb, &retries)) - goto again; - /* Buffer was busy because probably is pinned to journal transaction, - * force transaction commit may help to free it. */ - if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && - jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) - goto again; - return replaced_count; - -repair_branches: - /* - * This should never ever happen! 
- * Extents are swapped already, but we are not able to copy data. - * Try to swap extents to it's original places - */ - ext4_double_down_write_data_sem(orig_inode, donor_inode); - replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, - orig_blk_offset, donor_blk_offset, - block_len_in_page, 0, &err2); - ext4_double_up_write_data_sem(orig_inode, donor_inode); - if (replaced_count != block_len_in_page) { - ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset), - EIO, "Unable to copy data block," - " data will be lost."); - *err = -EIO; - } - replaced_count = 0; - goto unlock_folios; -} - /* * Check the validity of the basic filesystem environment and the * inodes' support status. @@ -827,106 +563,81 @@ static int mext_check_adjust_range(struct inode *orig_inode, * * This function returns 0 and moved block length is set in moved_len * if succeed, otherwise returns error value. - * */ -int -ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, - __u64 donor_blk, __u64 len, __u64 *moved_len) +int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, + __u64 donor_blk, __u64 len, __u64 *moved_len) { struct inode *orig_inode = file_inode(o_filp); struct inode *donor_inode = file_inode(d_filp); - struct ext4_ext_path *path = NULL; - int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; - ext4_lblk_t o_end, o_start = orig_blk; - ext4_lblk_t d_start = donor_blk; + struct mext_data mext; + struct super_block *sb = orig_inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int retries = 0; + u64 m_len; int ret; + *moved_len = 0; + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); ret = mext_check_validity(orig_inode, donor_inode); if (ret) - goto unlock; + goto out; /* Wait for all existing dio workers */ inode_dio_wait(orig_inode); inode_dio_wait(donor_inode); - /* Protect extent tree against block allocations via delalloc */ - 
ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check and adjust the specified move_extent range. */ ret = mext_check_adjust_range(orig_inode, donor_inode, orig_blk, donor_blk, &len); if (ret) goto out; - o_end = o_start + len; - *moved_len = 0; - while (o_start < o_end) { - struct ext4_extent *ex; - ext4_lblk_t cur_blk, next_blk; - pgoff_t orig_page_index, donor_page_index; - int offset_in_page; - int unwritten, cur_len; + mext.orig_inode = orig_inode; + mext.donor_inode = donor_inode; + while (len) { + mext.orig_map.m_lblk = orig_blk; + mext.orig_map.m_len = len; + mext.orig_map.m_flags = 0; + mext.donor_lblk = donor_blk; + + ret = ext4_map_blocks(NULL, orig_inode, &mext.orig_map, 0); + if (ret < 0) + goto out; + + /* Skip moving if it is a hole or a delalloc extent. */ + if (mext.orig_map.m_flags & + (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)) { + ret = mext_move_extent(&mext, &m_len); + *moved_len += m_len; + if (!ret) + goto next; + + /* Move failed or partially failed. */ + if (m_len) { + orig_blk += m_len; + donor_blk += m_len; + len -= m_len; + } + if (ret == -ESTALE) + continue; + if (ret == -ENOSPC && + ext4_should_retry_alloc(sb, &retries)) + continue; + if (ret == -EBUSY && + sbi->s_journal && retries++ < 4 && + jbd2_journal_force_commit_nested(sbi->s_journal)) + continue; - path = get_ext_path(orig_inode, o_start, path); - if (IS_ERR(path)) { - ret = PTR_ERR(path); goto out; } - ex = path[path->p_depth].p_ext; - cur_blk = le32_to_cpu(ex->ee_block); - cur_len = ext4_ext_get_actual_len(ex); - /* Check hole before the start pos */ - if (cur_blk + cur_len - 1 < o_start) { - next_blk = ext4_ext_next_allocated_block(path); - if (next_blk == EXT_MAX_BLOCKS) { - ret = -ENODATA; - goto out; - } - d_start += next_blk - o_start; - o_start = next_blk; - continue; - /* Check hole after the start pos */ - } else if (cur_blk > o_start) { - /* Skip hole */ - d_start += cur_blk - o_start; - o_start = cur_blk; - /* Extent inside requested range ?*/ - if 
(cur_blk >= o_end) - goto out; - } else { /* in_range(o_start, o_blk, o_len) */ - cur_len += cur_blk - o_start; - } - unwritten = ext4_ext_is_unwritten(ex); - if (o_end - o_start < cur_len) - cur_len = o_end - o_start; - - orig_page_index = o_start >> (PAGE_SHIFT - - orig_inode->i_blkbits); - donor_page_index = d_start >> (PAGE_SHIFT - - donor_inode->i_blkbits); - offset_in_page = o_start % blocks_per_page; - if (cur_len > blocks_per_page - offset_in_page) - cur_len = blocks_per_page - offset_in_page; - /* - * Up semaphore to avoid following problems: - * a. transaction deadlock among ext4_journal_start, - * ->write_begin via pagefault, and jbd2_journal_commit - * b. racing with ->read_folio, ->write_begin, and - * ext4_get_block in move_extent_per_page - */ - ext4_double_up_write_data_sem(orig_inode, donor_inode); - /* Swap original branches with new branches */ - *moved_len += move_extent_per_page(o_filp, donor_inode, - orig_page_index, donor_page_index, - offset_in_page, cur_len, - unwritten, &ret); - ext4_double_down_write_data_sem(orig_inode, donor_inode); - if (ret < 0) - break; - o_start += cur_len; - d_start += cur_len; +next: + orig_blk += mext.orig_map.m_len; + donor_blk += mext.orig_map.m_len; + len -= mext.orig_map.m_len; + retries = 0; } out: @@ -935,10 +646,6 @@ out: ext4_discard_preallocations(donor_inode); } - ext4_free_ext_path(path); - ext4_double_up_write_data_sem(orig_inode, donor_inode); -unlock: unlock_two_nondirectories(orig_inode, donor_inode); - return ret; } From 65097262f5ee786e649224ead2c08b7552376269 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:27 +0800 Subject: [PATCH 13/58] ext4: add large folios support for moving extents Pass the moving extent length into mext_folio_double_lock() so that it can acquire a higher-order folio if the length exceeds PAGE_SIZE. This can speed up extent moving when the extent is larger than one page. Additionally, remove the unnecessary comments from mext_folio_double_lock(). 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-12-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 933c2afed550..f04755c2165a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -54,23 +54,14 @@ ext4_double_up_write_data_sem(struct inode *orig_inode, up_write(&EXT4_I(donor_inode)->i_data_sem); } -/** - * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2 - * - * @inode1: the inode structure - * @inode2: the inode structure - * @index1: folio index - * @index2: folio index - * @folio: result folio vector - * - * Grab two locked folio for inode's by inode order - */ -static int -mext_folio_double_lock(struct inode *inode1, struct inode *inode2, - pgoff_t index1, pgoff_t index2, struct folio *folio[2]) +/* Grab and lock folio on both @inode1 and @inode2 by inode order. 
*/ +static int mext_folio_double_lock(struct inode *inode1, struct inode *inode2, + pgoff_t index1, pgoff_t index2, size_t len, + struct folio *folio[2]) { struct address_space *mapping[2]; unsigned int flags; + fgf_t fgp_flags = FGP_WRITEBEGIN; BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { @@ -83,14 +74,15 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, } flags = memalloc_nofs_save(); - folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, + fgp_flags |= fgf_set_order(len); + folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, mapping_gfp_mask(mapping[0])); if (IS_ERR(folio[0])) { memalloc_nofs_restore(flags); return PTR_ERR(folio[0]); } - folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, + folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); if (IS_ERR(folio[1])) { @@ -214,7 +206,8 @@ static int mext_move_begin(struct mext_data *mext, struct folio *folio[2], orig_pos = ((loff_t)mext->orig_map.m_lblk) << blkbits; donor_pos = ((loff_t)mext->donor_lblk) << blkbits; ret = mext_folio_double_lock(orig_inode, donor_inode, - orig_pos >> PAGE_SHIFT, donor_pos >> PAGE_SHIFT, folio); + orig_pos >> PAGE_SHIFT, donor_pos >> PAGE_SHIFT, + ((size_t)mext->orig_map.m_len) << blkbits, folio); if (ret) return ret; From 9dbf945320b11c5865d2f550f8e972566d04d181 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:28 +0800 Subject: [PATCH 14/58] ext4: add two trace points for moving extents To facilitate tracking the length, type, and outcome of the move extent, add a trace point at both the entry and exit of mext_move_extent(). 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-13-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 14 ++++++- include/trace/events/ext4.h | 74 +++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index f04755c2165a..0550fd30fd10 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -13,6 +13,8 @@ #include "ext4.h" #include "ext4_extents.h" +#include + struct mext_data { struct inode *orig_inode; /* Origin file inode */ struct inode *donor_inode; /* Donor file inode */ @@ -311,10 +313,14 @@ static int mext_move_extent(struct mext_data *mext, u64 *m_len) int ret, ret2; *m_len = 0; + trace_ext4_move_extent_enter(orig_inode, orig_map, donor_inode, + mext->donor_lblk); credits = ext4_chunk_trans_extent(orig_inode, 0) * 2; handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } ret = mext_move_begin(mext, folio, &move_type); if (ret) @@ -379,6 +385,10 @@ unlock: mext_folio_double_unlock(folio); stop_handle: ext4_journal_stop(handle); +out: + trace_ext4_move_extent_exit(orig_inode, orig_map->m_lblk, donor_inode, + mext->donor_lblk, orig_map->m_len, *m_len, + move_type, ret); return ret; repair_branches: diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6a0754d38acf..a05bdd48e16e 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -3016,6 +3016,80 @@ TRACE_EVENT(ext4_update_sb, __entry->fsblk, __entry->flags) ); +TRACE_EVENT(ext4_move_extent_enter, + TP_PROTO(struct inode *orig_inode, struct ext4_map_blocks *orig_map, + struct inode *donor_inode, ext4_lblk_t donor_lblk), + + TP_ARGS(orig_inode, orig_map, donor_inode, donor_lblk), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, orig_ino) + __field(ext4_lblk_t, 
orig_lblk) + __field(unsigned int, orig_flags) + __field(ino_t, donor_ino) + __field(ext4_lblk_t, donor_lblk) + __field(unsigned int, len) + ), + + TP_fast_assign( + __entry->dev = orig_inode->i_sb->s_dev; + __entry->orig_ino = orig_inode->i_ino; + __entry->orig_lblk = orig_map->m_lblk; + __entry->orig_flags = orig_map->m_flags; + __entry->donor_ino = donor_inode->i_ino; + __entry->donor_lblk = donor_lblk; + __entry->len = orig_map->m_len; + ), + + TP_printk("dev %d,%d origin ino %lu lblk %u flags %s donor ino %lu lblk %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->orig_ino, __entry->orig_lblk, + show_mflags(__entry->orig_flags), + (unsigned long) __entry->donor_ino, __entry->donor_lblk, + __entry->len) +); + +TRACE_EVENT(ext4_move_extent_exit, + TP_PROTO(struct inode *orig_inode, ext4_lblk_t orig_lblk, + struct inode *donor_inode, ext4_lblk_t donor_lblk, + unsigned int m_len, u64 move_len, int move_type, int ret), + + TP_ARGS(orig_inode, orig_lblk, donor_inode, donor_lblk, m_len, + move_len, move_type, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, orig_ino) + __field(ext4_lblk_t, orig_lblk) + __field(ino_t, donor_ino) + __field(ext4_lblk_t, donor_lblk) + __field(unsigned int, m_len) + __field(u64, move_len) + __field(int, move_type) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dev = orig_inode->i_sb->s_dev; + __entry->orig_ino = orig_inode->i_ino; + __entry->orig_lblk = orig_lblk; + __entry->donor_ino = donor_inode->i_ino; + __entry->donor_lblk = donor_lblk; + __entry->m_len = m_len; + __entry->move_len = move_len; + __entry->move_type = move_type; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d origin ino %lu lblk %u donor ino %lu lblk %u m_len %u, move_len %llu type %d ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->orig_ino, __entry->orig_lblk, + (unsigned long) __entry->donor_ino, __entry->donor_lblk, + __entry->m_len, __entry->move_len, __entry->move_type, 
+ __entry->ret) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ From 892e1cf17555735e9d021ab036c36bc7b58b0e3b Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Mon, 20 Oct 2025 11:39:36 +0530 Subject: [PATCH 15/58] ext4: refresh inline data size before write operations The cached ei->i_inline_size can become stale between the initial size check and when ext4_update_inline_data()/ext4_create_inline_data() use it. Although ext4_get_max_inline_size() reads the correct value at the time of the check, concurrent xattr operations can modify i_inline_size before ext4_write_lock_xattr() is acquired. This causes ext4_update_inline_data() and ext4_create_inline_data() to work with stale capacity values, leading to a BUG_ON() crash in ext4_write_inline_data(): kernel BUG at fs/ext4/inline.c:1331! BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); The race window: 1. ext4_get_max_inline_size() reads i_inline_size = 60 (correct) 2. Size check passes for 50-byte write 3. [Another thread adds xattr, i_inline_size changes to 40] 4. ext4_write_lock_xattr() acquires lock 5. ext4_update_inline_data() uses stale i_inline_size = 60 6. Attempts to write 50 bytes but only 40 bytes actually available 7. BUG_ON() triggers Fix this by recalculating i_inline_size via ext4_find_inline_data_nolock() immediately after acquiring xattr_sem. This ensures ext4_update_inline_data() and ext4_create_inline_data() work with current values that are protected from concurrent modifications. This is similar to commit a54c4613dac1 ("ext4: fix race writing to an inline_data file while its xattrs are changing") which fixed i_inline_off staleness. This patch addresses the related i_inline_size staleness issue. 
Reported-by: syzbot+f3185be57d7e8dda32b8@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=f3185be57d7e8dda32b8 Cc: stable@kernel.org Signed-off-by: Deepanshu Kartikey Message-ID: <20251020060936.474314-1-kartikey406@gmail.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1b094a4f3866..b48c7dbe76a2 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -418,7 +418,12 @@ static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, return -ENOSPC; ext4_write_lock_xattr(inode, &no_expand); - + /* + * ei->i_inline_size may have changed since the initial check + * if other xattrs were added. Recalculate to ensure + * ext4_update_inline_data() validates against current capacity. + */ + (void) ext4_find_inline_data_nolock(inode); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else From b97cb7d6a051aa6ebd57906df0e26e9e36c26d14 Mon Sep 17 00:00:00 2001 From: Karina Yankevich Date: Wed, 22 Oct 2025 12:32:53 +0300 Subject: [PATCH 16/58] ext4: xattr: fix null pointer deref in ext4_raw_inode() If ext4_get_inode_loc() fails (e.g. if it returns -EFSCORRUPTED), iloc.bh will remain set to NULL. Since ext4_xattr_inode_dec_ref_all() lacks error checking, this will lead to a null pointer dereference in ext4_raw_inode(), called right after ext4_get_inode_loc(). Found by Linux Verification Center (linuxtesting.org) with SVACE. 
Fixes: c8e008b60492 ("ext4: ignore xattrs past end") Cc: stable@kernel.org Signed-off-by: Karina Yankevich Reviewed-by: Sergey Shtylyov Reviewed-by: Baokun Li Message-ID: <20251022093253.3546296-1-k.yankevich@omp.ru> Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ce7253b3f549..2e02efbddaac 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1174,7 +1174,11 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, if (block_csum) end = (void *)bh->b_data + bh->b_size; else { - ext4_get_inode_loc(parent, &iloc); + err = ext4_get_inode_loc(parent, &iloc); + if (err) { + EXT4_ERROR_INODE(parent, "parent inode loc (error %d)", err); + return; + } end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size; } From 524c3853831cf4f7e1db579e487c757c3065165c Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 22 Oct 2025 20:11:37 +0900 Subject: [PATCH 17/58] jbd2: use a per-journal lock_class_key for jbd2_trans_commit_key syzbot is reporting possibility of deadlock due to sharing lock_class_key for jbd2_handle across ext4 and ocfs2. But this is a false positive, for one disk partition can't have two filesystems at the same time. 
Reported-by: syzbot+6e493c165d26d6fcbf72@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6e493c165d26d6fcbf72 Signed-off-by: Tetsuo Handa Tested-by: syzbot+6e493c165d26d6fcbf72@syzkaller.appspotmail.com Reviewed-by: Jan Kara Message-ID: <987110fc-5470-457a-a218-d286a09dd82f@I-love.SAKURA.ne.jp> Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/jbd2/journal.c | 6 ++++-- include/linux/jbd2.h | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d480b94117cd..f43474002f50 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1521,7 +1521,6 @@ static journal_t *journal_init_common(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int blocksize) { - static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; int n; @@ -1530,6 +1529,7 @@ static journal_t *journal_init_common(struct block_device *bdev, if (!journal) return ERR_PTR(-ENOMEM); + lockdep_register_key(&journal->jbd2_trans_commit_key); journal->j_blocksize = blocksize; journal->j_dev = bdev; journal->j_fs_dev = fs_dev; @@ -1560,7 +1560,7 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_max_batch_time = 15000; /* 15ms */ atomic_set(&journal->j_reserved_credits, 0); lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", - &jbd2_trans_commit_key, 0); + &journal->jbd2_trans_commit_key, 0); /* The journal is marked for error until we succeed with recovery! 
*/ journal->j_flags = JBD2_ABORT; @@ -1611,6 +1611,7 @@ err_cleanup: kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); journal_fail_superblock(journal); + lockdep_unregister_key(&journal->jbd2_trans_commit_key); kfree(journal); return ERR_PTR(err); } @@ -2187,6 +2188,7 @@ int jbd2_journal_destroy(journal_t *journal) jbd2_journal_destroy_revoke(journal); kfree(journal->j_fc_wbuf); kfree(journal->j_wbuf); + lockdep_unregister_key(&journal->jbd2_trans_commit_key); kfree(journal); return err; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 43b9297fe8a7..f5eaf76198f3 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1253,6 +1253,12 @@ struct journal_s */ struct lockdep_map j_trans_commit_map; #endif + /** + * @jbd2_trans_commit_key: + * + * "struct lock_class_key" for @j_trans_commit_map + */ + struct lock_class_key jbd2_trans_commit_key; /** * @j_fc_cleanup_callback: From 40a71b53d5a6d4ea17e4d54b99b2ac03a7f5e783 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 24 Oct 2025 16:39:40 +0900 Subject: [PATCH 18/58] jbd2: use a weaker annotation in journal handling jbd2 journal handling code doesn't want jbd2_might_wait_for_commit() to be placed between start_this_handle() and stop_this_handle(). So it marks the region with rwsem_acquire_read() and rwsem_release(). However, the annotation is too strong for that purpose. We don't have to use more than try lock annotation for that. rwsem_acquire_read() implies: 1. might be a waiter on contention of the lock. 2. enter to the critical section of the lock. All we need in here is to act 2, not 1. So trylock version of annotation is sufficient for that purpose. Now that dept partially relies on lockdep annotaions, dept interpets rwsem_acquire_read() as a potential wait and might report a deadlock by the wait. Replace it with trylock version of annotation. 
Signed-off-by: Byungchul Park Reviewed-by: Jan Kara Cc: stable@kernel.org Message-ID: <20251024073940.1063-1-byungchul@sk.com> Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3e510564de6e..3a866fc87e7a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -441,7 +441,7 @@ repeat: read_unlock(&journal->j_state_lock); current->journal_info = handle; - rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); + rwsem_acquire_read(&journal->j_trans_commit_map, 0, 1, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); /* * Ensure that no allocations done while the transaction is open are From 986835bf4d11032bba4ab8414d18fce038c61bb4 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 25 Oct 2025 15:26:57 +0800 Subject: [PATCH 19/58] jbd2: avoid bug_on in jbd2_journal_get_create_access() when file system corrupted There's issue when file system corrupted: ------------[ cut here ]------------ kernel BUG at fs/jbd2/transaction.c:1289! 
Oops: invalid opcode: 0000 [#1] SMP KASAN PTI CPU: 5 UID: 0 PID: 2031 Comm: mkdir Not tainted 6.18.0-rc1-next RIP: 0010:jbd2_journal_get_create_access+0x3b6/0x4d0 RSP: 0018:ffff888117aafa30 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff88811a86b000 RCX: ffffffff89a63534 RDX: 1ffff110200ec602 RSI: 0000000000000004 RDI: ffff888100763010 RBP: ffff888100763000 R08: 0000000000000001 R09: ffff888100763028 R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000000 R13: ffff88812c432000 R14: ffff88812c608000 R15: ffff888120bfc000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f91d6970c99 CR3: 00000001159c4000 CR4: 00000000000006f0 Call Trace: __ext4_journal_get_create_access+0x42/0x170 ext4_getblk+0x319/0x6f0 ext4_bread+0x11/0x100 ext4_append+0x1e6/0x4a0 ext4_init_new_dir+0x145/0x1d0 ext4_mkdir+0x326/0x920 vfs_mkdir+0x45c/0x740 do_mkdirat+0x234/0x2f0 __x64_sys_mkdir+0xd6/0x120 do_syscall_64+0x5f/0xfa0 entry_SYSCALL_64_after_hwframe+0x76/0x7e The above issue occurs with us in errors=continue mode when accompanied by storage failures. There have been many inconsistencies in the file system data. In the case of file system data inconsistency, for example, if the block bitmap of a referenced block is not set, it can lead to the situation where a block being committed is allocated and used again. As a result, the following condition will not be satisfied then trigger BUG_ON. Of course, it is entirely possible to construct a problematic image that can trigger this BUG_ON through specific operations. In fact, I have constructed such an image and easily reproduced this issue. Therefore, J_ASSERT() holds true only under ideal conditions, but it may not necessarily be satisfied in exceptional scenarios. Using J_ASSERT() directly in abnormal situations would cause the system to crash, which is clearly not what we want. So here we directly trigger a JBD abort instead of immediately invoking BUG_ON. 
Fixes: 470decc613ab ("[PATCH] jbd2: initial copy of files from jbd") Signed-off-by: Ye Bin Reviewed-by: Jan Kara Message-ID: <20251025072657.307851-1-yebin@huaweicloud.com> Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/jbd2/transaction.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3a866fc87e7a..653ee540c4a1 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1284,14 +1284,23 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * committing transaction's lists, but it HAS to be in Forget state in * that case: the transaction must have deleted the buffer for it to be * reused here. + * In the case of file system data inconsistency, for example, if the + * block bitmap of a referenced block is not set, it can lead to the + * situation where a block being committed is allocated and used again. + * As a result, the following condition will not be satisfied, so here + * we directly trigger a JBD abort instead of immediately invoking + * bugon. 
*/ spin_lock(&jh->b_state_lock); - J_ASSERT_JH(jh, (jh->b_transaction == transaction || - jh->b_transaction == NULL || - (jh->b_transaction == journal->j_committing_transaction && - jh->b_jlist == BJ_Forget))); + if (!(jh->b_transaction == transaction || jh->b_transaction == NULL || + (jh->b_transaction == journal->j_committing_transaction && + jh->b_jlist == BJ_Forget)) || jh->b_next_transaction != NULL) { + err = -EROFS; + spin_unlock(&jh->b_state_lock); + jbd2_journal_abort(journal, err); + goto out; + } - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { From 80d05f640a51f94b88640db7f1551f8e8fee44b9 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 31 Oct 2025 14:05:01 -0700 Subject: [PATCH 20/58] jbd2: store more accurate errno in superblock when possible When jbd2_journal_abort() is called, the provided error code is stored in the journal superblock. Some existing calls hard-code -EIO even when the actual failure is not I/O related. This patch updates those calls to pass more accurate error codes, allowing the superblock to record the true cause of failure. This helps improve diagnostics and debugging clarity when analyzing journal aborts. 
Signed-off-by: Wengang Wang Reviewed-by: Zhang Yi Message-ID: <20251031210501.7337-1-wen.gang.wang@oracle.com> Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 4 ++-- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/journal.c | 15 +++++++++------ fs/jbd2/transaction.c | 5 +++-- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 760c9d7588be..7de15249e826 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -698,7 +698,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, WARN_ON_ONCE(1); if (!continue_fs && !ext4_emergency_ro(sb) && journal) - jbd2_journal_abort(journal, -EIO); + jbd2_journal_abort(journal, -error); if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); @@ -5843,7 +5843,7 @@ static int ext4_journal_bmap(journal_t *journal, sector_t *block) ext4_msg(journal->j_inode->i_sb, KERN_CRIT, "journal bmap failed: block %llu ret %d\n", *block, ret); - jbd2_journal_abort(journal, ret ? ret : -EIO); + jbd2_journal_abort(journal, ret ? ret : -EFSCORRUPTED); return ret; } *block = map.m_pblk; diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 2d0719bf6d87..de89c5bef607 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -113,7 +113,7 @@ __releases(&journal->j_state_lock) "journal space in %s\n", __func__, journal->j_devname); WARN_ON(1); - jbd2_journal_abort(journal, -EIO); + jbd2_journal_abort(journal, -ENOSPC); } write_lock(&journal->j_state_lock); } else { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f43474002f50..2fe1786a8f1b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -937,8 +937,8 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, printk(KERN_ALERT "%s: journal block not found " "at offset %lu on %s\n", __func__, blocknr, journal->j_devname); + jbd2_journal_abort(journal, ret ? 
ret : -EFSCORRUPTED); err = -EIO; - jbd2_journal_abort(journal, err); } else { *retp = block; } @@ -1859,8 +1859,9 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, if (is_journal_aborted(journal)) return -EIO; - if (jbd2_check_fs_dev_write_error(journal)) { - jbd2_journal_abort(journal, -EIO); + ret = jbd2_check_fs_dev_write_error(journal); + if (ret) { + jbd2_journal_abort(journal, ret); return -EIO; } @@ -2157,9 +2158,11 @@ int jbd2_journal_destroy(journal_t *journal) * failed to write back to the original location, otherwise the * filesystem may become inconsistent. */ - if (!is_journal_aborted(journal) && - jbd2_check_fs_dev_write_error(journal)) - jbd2_journal_abort(journal, -EIO); + if (!is_journal_aborted(journal)) { + int ret = jbd2_check_fs_dev_write_error(journal); + if (ret) + jbd2_journal_abort(journal, ret); + } if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 653ee540c4a1..dca4b5d8aaaa 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1219,7 +1219,8 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) return -EROFS; journal = handle->h_transaction->t_journal; - if (jbd2_check_fs_dev_write_error(journal)) { + rc = jbd2_check_fs_dev_write_error(journal); + if (rc) { /* * If the fs dev has writeback errors, it may have failed * to async write out metadata buffers in the background. @@ -1227,7 +1228,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) * it out again, which may lead to on-disk filesystem * inconsistency. Aborting journal can avoid it happen. 
*/ - jbd2_journal_abort(journal, -EIO); + jbd2_journal_abort(journal, rc); return -EIO; } From ee5a977b4e771cc181f39d504426dbd31ed701cc Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sat, 1 Nov 2025 19:04:28 +0300 Subject: [PATCH 21/58] ext4: fix string copying in parse_apply_sb_mount_options() strscpy_pad() can't be used to copy a non-NUL-term string into a NUL-term string of possibly bigger size. Commit 0efc5990bca5 ("string.h: Introduce memtostr() and memtostr_pad()") provides additional information in that regard. So if this happens, the following warning is observed: strnlen: detected buffer overflow: 65 byte read of buffer size 64 WARNING: CPU: 0 PID: 28655 at lib/string_helpers.c:1032 __fortify_report+0x96/0xc0 lib/string_helpers.c:1032 Modules linked in: CPU: 0 UID: 0 PID: 28655 Comm: syz-executor.3 Not tainted 6.12.54-syzkaller-00144-g5f0270f1ba00 #0 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:__fortify_report+0x96/0xc0 lib/string_helpers.c:1032 Call Trace: __fortify_panic+0x1f/0x30 lib/string_helpers.c:1039 strnlen include/linux/fortify-string.h:235 [inline] sized_strscpy include/linux/fortify-string.h:309 [inline] parse_apply_sb_mount_options fs/ext4/super.c:2504 [inline] __ext4_fill_super fs/ext4/super.c:5261 [inline] ext4_fill_super+0x3c35/0xad00 fs/ext4/super.c:5706 get_tree_bdev_flags+0x387/0x620 fs/super.c:1636 vfs_get_tree+0x93/0x380 fs/super.c:1814 do_new_mount fs/namespace.c:3553 [inline] path_mount+0x6ae/0x1f70 fs/namespace.c:3880 do_mount fs/namespace.c:3893 [inline] __do_sys_mount fs/namespace.c:4103 [inline] __se_sys_mount fs/namespace.c:4080 [inline] __x64_sys_mount+0x280/0x300 fs/namespace.c:4080 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x64/0x140 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x76/0x7e Since userspace is expected to provide s_mount_opts field to be at most 63 characters long with the ending byte being NUL-term, use a 64-byte 
buffer which matches the size of s_mount_opts, so that strscpy_pad() does its job properly. Return with error if the user still managed to provide a non-NUL-term string here. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 8ecb790ea8c3 ("ext4: avoid potential buffer over-read in parse_apply_sb_mount_options()") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Reviewed-by: Baokun Li Reviewed-by: Jan Kara Message-ID: <20251101160430.222297-1-pchelkin@ispras.ru> Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7de15249e826..d1ba894c0e0a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2476,7 +2476,7 @@ static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char s_mount_opts[65]; + char s_mount_opts[64]; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; @@ -2484,7 +2484,8 @@ static int parse_apply_sb_mount_options(struct super_block *sb, if (!sbi->s_es->s_mount_opts[0]) return 0; - strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts); + if (strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts) < 0) + return -E2BIG; fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); if (!fc) From 3db63d2c2d1d1e78615dd742568c5a2d55291ad1 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sat, 1 Nov 2025 19:04:29 +0300 Subject: [PATCH 22/58] ext4: check if mount_opts is NUL-terminated in ext4_ioctl_set_tune_sb() params.mount_opts may come as potentially non-NUL-term string. Userspace is expected to pass a NUL-term string. Add an extra check to ensure this holds true. Note that further code utilizes strscpy_pad() so this is just for proper informing the user of incorrect data being provided. Found by Linux Verification Center (linuxtesting.org). 
Signed-off-by: Fedor Pchelkin Reviewed-by: Baokun Li Reviewed-by: Jan Kara Message-ID: <20251101160430.222297-2-pchelkin@ispras.ru> Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 366a9b615bf0..7ce0fc40aec2 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1394,6 +1394,10 @@ static int ext4_ioctl_set_tune_sb(struct file *filp, if (copy_from_user(¶ms, in, sizeof(params))) return -EFAULT; + if (strnlen(params.mount_opts, sizeof(params.mount_opts)) == + sizeof(params.mount_opts)) + return -E2BIG; + if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0) return -EOPNOTSUPP; From 6abfe107894af7e8ce3a2e120c619d81ee764ad5 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 3 Nov 2025 09:01:23 +0800 Subject: [PATCH 23/58] jbd2: fix the inconsistency between checksum and data in memory for journal sb Copying the file system while it is mounted as read-only results in a mount failure: [~]# mkfs.ext4 -F /dev/sdc [~]# mount /dev/sdc -o ro /mnt/test [~]# dd if=/dev/sdc of=/dev/sda bs=1M [~]# mount /dev/sda /mnt/test1 [ 1094.849826] JBD2: journal checksum error [ 1094.850927] EXT4-fs (sda): Could not load journal inode mount: mount /dev/sda on /mnt/test1 failed: Bad message The process described above is just an abstracted way I came up with to reproduce the issue. In the actual scenario, the file system was mounted read-only and then copied while it was still mounted. It was found that the mount operation failed. The user intended to verify the data or use it as a backup, and this action was performed during a version upgrade. 
Above issue may happen as follows: ext4_fill_super set_journal_csum_feature_set(sb) if (ext4_has_metadata_csum(sb)) incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; if (test_opt(sb, JOURNAL_CHECKSUM) jbd2_journal_set_features(sbi->s_journal, compat, 0, incompat); lock_buffer(journal->j_sb_buffer); sb->s_feature_incompat |= cpu_to_be32(incompat); //The data in the journal sb was modified, but the checksum was not updated, so the data remaining in memory has a mismatch between the data and the checksum. unlock_buffer(journal->j_sb_buffer); In this case, the journal sb copied over is in a state where the checksum and data are inconsistent, so mounting fails. To solve the above issue, update the checksum in memory after modifying the journal sb. Fixes: 4fd5ea43bc11 ("jbd2: checksum journal superblock") Signed-off-by: Ye Bin Reviewed-by: Baokun Li Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Message-ID: <20251103010123.3753631-1-yebin@huaweicloud.com> Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/jbd2/journal.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2fe1786a8f1b..c973162d5b31 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2354,6 +2354,12 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); + /* + * Update the checksum now so that it is valid even for read-only + * filesystems where jbd2_write_superblock() doesn't get called. 
+ */ + if (jbd2_journal_has_csum_v2or3(journal)) + sb->s_checksum = jbd2_superblock_csum(sb); unlock_buffer(journal->j_sb_buffer); jbd2_journal_init_transaction_limits(journal); @@ -2383,9 +2389,17 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, sb = journal->j_superblock; + lock_buffer(journal->j_sb_buffer); sb->s_feature_compat &= ~cpu_to_be32(compat); sb->s_feature_ro_compat &= ~cpu_to_be32(ro); sb->s_feature_incompat &= ~cpu_to_be32(incompat); + /* + * Update the checksum now so that it is valid even for read-only + * filesystems where jbd2_write_superblock() doesn't get called. + */ + if (jbd2_journal_has_csum_v2or3(journal)) + sb->s_checksum = jbd2_superblock_csum(sb); + unlock_buffer(journal->j_sb_buffer); jbd2_journal_init_transaction_limits(journal); } EXPORT_SYMBOL(jbd2_journal_clear_features); From 4091c8206cfd2e3bb529ef260887296b90d9b6a2 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Tue, 4 Nov 2025 16:12:24 +0800 Subject: [PATCH 24/58] ext4: clear i_state_flags when alloc inode i_state_flags used on 32-bit archs, need to clear this flag when alloc inode. Find this issue when umount ext4, sometimes track the inode as orphan accidently, cause ext4 mesg dump. 
Fixes: acf943e9768e ("ext4: fix checks for orphan inodes") Signed-off-by: Haibo Chen Reviewed-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251104-ext4-v1-1-73691a0800f9@nxp.com> Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ialloc.c | 1 - fs/ext4/inode.c | 1 - fs/ext4/super.c | 1 + 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ba4fd9aba1c1..b20a1bf866ab 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1293,7 +1293,6 @@ got: ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } - ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = sbi->s_want_extra_isize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 783c883d4d5e..32d9f0b36c33 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5296,7 +5296,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); - ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d1ba894c0e0a..707f890d674e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1396,6 +1396,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ spin_lock_init(&ei->i_raw_lock); ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); From 0cd8feea8777f8d9b9a862b89c688b049a5c8475 Mon Sep 17 00:00:00 2001 From: Alexey Nepomnyashih Date: Tue, 4 Nov 2025 09:33:25 +0000 Subject: [PATCH 25/58] ext4: add i_data_sem protection in ext4_destroy_inline_data_nolock() Fix a race between inline data destruction and block mapping. 
The function ext4_destroy_inline_data_nolock() changes the inode data layout by clearing EXT4_INODE_INLINE_DATA and setting EXT4_INODE_EXTENTS. At the same time, another thread may execute ext4_map_blocks(), which tests EXT4_INODE_EXTENTS to decide whether to call ext4_ext_map_blocks() or ext4_ind_map_blocks(). Without i_data_sem protection, ext4_ind_map_blocks() may receive inode with EXT4_INODE_EXTENTS flag and triggering assert. kernel BUG at fs/ext4/indirect.c:546! EXT4-fs (loop2): unmounting filesystem. invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 RIP: 0010:ext4_ind_map_blocks.cold+0x2b/0x5a fs/ext4/indirect.c:546 Call Trace: ext4_map_blocks+0xb9b/0x16f0 fs/ext4/inode.c:681 _ext4_get_block+0x242/0x590 fs/ext4/inode.c:822 ext4_block_write_begin+0x48b/0x12c0 fs/ext4/inode.c:1124 ext4_write_begin+0x598/0xef0 fs/ext4/inode.c:1255 ext4_da_write_begin+0x21e/0x9c0 fs/ext4/inode.c:3000 generic_perform_write+0x259/0x5d0 mm/filemap.c:3846 ext4_buffered_write_iter+0x15b/0x470 fs/ext4/file.c:285 ext4_file_write_iter+0x8e0/0x17f0 fs/ext4/file.c:679 call_write_iter include/linux/fs.h:2271 [inline] do_iter_readv_writev+0x212/0x3c0 fs/read_write.c:735 do_iter_write+0x186/0x710 fs/read_write.c:861 vfs_iter_write+0x70/0xa0 fs/read_write.c:902 iter_file_splice_write+0x73b/0xc90 fs/splice.c:685 do_splice_from fs/splice.c:763 [inline] direct_splice_actor+0x10f/0x170 fs/splice.c:950 splice_direct_to_actor+0x33a/0xa10 fs/splice.c:896 do_splice_direct+0x1a9/0x280 fs/splice.c:1002 do_sendfile+0xb13/0x12c0 fs/read_write.c:1255 __do_sys_sendfile64 fs/read_write.c:1323 [inline] __se_sys_sendfile64 fs/read_write.c:1309 [inline] __x64_sys_sendfile64+0x1cf/0x210 fs/read_write.c:1309 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x35/0x80 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Fixes: c755e251357a ("ext4: fix deadlock between inline_data and 
ext4_expand_extra_isize_ea()") Cc: stable@vger.kernel.org # v4.11+ Signed-off-by: Alexey Nepomnyashih Message-ID: <20251104093326.697381-1-sdl@nppct.ru> Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index b48c7dbe76a2..1f6bc05593df 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -451,9 +451,13 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, if (!ei->i_inline_off) return 0; + down_write(&ei->i_data_sem); + error = ext4_get_inode_loc(inode, &is.iloc); - if (error) + if (error) { + up_write(&ei->i_data_sem); return error; + } error = ext4_xattr_ibody_find(inode, &i, &is); if (error) @@ -492,6 +496,7 @@ out: brelse(is.iloc.bh); if (error == -ENODATA) error = 0; + up_write(&ei->i_data_sem); return error; } From 3f7a79d05c692c7cfec70bf104b1b3c3d0ce6247 Mon Sep 17 00:00:00 2001 From: Yongjian Sun Date: Thu, 6 Nov 2025 14:06:13 +0800 Subject: [PATCH 26/58] ext4: fix incorrect group number assertion in mb_check_buddy When the MB_CHECK_ASSERT macro is enabled, an assertion failure can occur in __mb_check_buddy when checking preallocated blocks (pa) in a block group: Assertion failure in mb_free_blocks() : "groupnr == e4b->bd_group" This happens when a pa at the very end of a block group (e.g., pa_pstart=32765, pa_len=3 in a group of 32768 blocks) becomes exhausted - its pa_pstart is advanced by pa_len to 32768, which lies in the next block group. If this exhausted pa (with pa_len == 0) is still in the bb_prealloc_list during the buddy check, the assertion incorrectly flags it as belonging to the wrong group. 
A possible sequence is as follows: ext4_mb_new_blocks ext4_mb_release_context pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len) pa->pa_len -= ac->ac_b_ex.fe_len __mb_check_buddy for each pa in group ext4_get_group_no_and_offset MB_CHECK_ASSERT(groupnr == e4b->bd_group) To fix this, we modify the check to skip block group validation for exhausted preallocations (where pa_len == 0). Such entries are in a transitional state and will be removed from the list soon, so they should not trigger an assertion. This change prevents the false positive while maintaining the integrity of the checks for active allocations. Fixes: c9de560ded61f ("ext4: Add multi block allocator for ext4") Signed-off-by: Yongjian Sun Reviewed-by: Baokun Li Reviewed-by: Jan Kara Message-ID: <20251106060614.631382-2-sunyongjian@huaweicloud.com> Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9087183602e4..194a9f995c36 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -768,6 +768,8 @@ static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, ext4_group_t groupnr; struct ext4_prealloc_space *pa; pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + if (!pa->pa_len) + continue; ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); for (i = 0; i < pa->pa_len; i++) From d9ee3ff810f1cc0e253c9f2b17b668b973cb0e06 Mon Sep 17 00:00:00 2001 From: Yongjian Sun Date: Thu, 6 Nov 2025 14:06:14 +0800 Subject: [PATCH 27/58] ext4: improve integrity checking in __mb_check_buddy by enhancing order-0 validation When the MB_CHECK_ASSERT macro is enabled, we found that the current validation logic in __mb_check_buddy has a gap in detecting certain invalid buddy states, particularly related to order-0 (bitmap) bits. The original logic consists of three steps: 1. 
Validates higher-order buddies: if a higher-order bit is set, at most one of the two corresponding lower-order bits may be free; if a higher-order bit is clear, both lower-order bits must be allocated (and their bitmap bits must be 0). 2. For any set bit in order-0, ensures all corresponding higher-order bits are not free. 3. Verifies that all preallocated blocks (pa) in the group have pa_pstart within bounds and their bitmap bits marked as allocated. However, this approach fails to properly validate cases where order-0 bits are incorrectly cleared (0), allowing some invalid configurations to pass: corrupt integral order 3 1 1 order 2 1 1 1 1 order 1 1 1 1 1 1 1 1 1 order 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 Here we get two adjacent free blocks at order-0 with inconsistent higher-order state, and the right one shows the correct scenario. The root cause is insufficient validation of order-0 zero bits. To fix this and improve completeness without significant performance cost, we refine the logic: 1. Maintain the top-down higher-order validation, but we no longer check the cases where the higher-order bit is 0, as this case will be covered in step 2. 2. Enhance order-0 checking by examining pairs of bits: - If either bit in a pair is set (1), all corresponding higher-order bits must not be free. - If both bits are clear (0), then exactly one of the corresponding higher-order bits must be free 3. Keep the preallocation (pa) validation unchanged. This change closes the validation gap, ensuring illegal buddy states involving order-0 are correctly detected, while removing redundant checks and maintaining efficiency. 
Fixes: c9de560ded61f ("ext4: Add multi block allocator for ext4") Suggested-by: Jan Kara Signed-off-by: Yongjian Sun Reviewed-by: Baokun Li Reviewed-by: Jan Kara Message-ID: <20251106060614.631382-3-sunyongjian@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 49 +++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 194a9f995c36..65335248825c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -682,6 +682,24 @@ do { \ } \ } while (0) +/* + * Perform buddy integrity check with the following steps: + * + * 1. Top-down validation (from highest order down to order 1, excluding order-0 bitmap): + * For each pair of adjacent orders, if a higher-order bit is set (indicating a free block), + * at most one of the two corresponding lower-order bits may be clear (free). + * + * 2. Order-0 (bitmap) validation, performed on bit pairs: + * - If either bit in a pair is set (1, allocated), then all corresponding higher-order bits + * must not be free (0). + * - If both bits in a pair are clear (0, free), then exactly one of the corresponding + * higher-order bits must be free (0). + * + * 3. Preallocation (pa) list validation: + * For each preallocated block (pa) in the group: + * - Verify that pa_pstart falls within the bounds of this block group. + * - Ensure the corresponding bit(s) in the order-0 bitmap are marked as allocated (1). 
+ */ static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { @@ -723,15 +741,6 @@ static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, continue; } - /* both bits in buddy2 must be 1 */ - MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); - MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); - - for (j = 0; j < (1 << order); j++) { - k = (i * (1 << order)) + j; - MB_CHECK_ASSERT( - !mb_test_bit(k, e4b->bd_bitmap)); - } count++; } MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); @@ -747,15 +756,21 @@ static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, fragments++; fstart = i; } - continue; + } else { + fstart = -1; } - fstart = -1; - /* check used bits only */ - for (j = 0; j < e4b->bd_blkbits + 1; j++) { - buddy2 = mb_find_buddy(e4b, j, &max2); - k = i >> j; - MB_CHECK_ASSERT(k < max2); - MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); + if (!(i & 1)) { + int in_use, zero_bit_count = 0; + + in_use = mb_test_bit(i, buddy) || mb_test_bit(i + 1, buddy); + for (j = 1; j < e4b->bd_blkbits + 2; j++) { + buddy2 = mb_find_buddy(e4b, j, &max2); + k = i >> j; + MB_CHECK_ASSERT(k < max2); + if (!mb_test_bit(k, buddy2)) + zero_bit_count++; + } + MB_CHECK_ASSERT(zero_bit_count == !in_use); } } MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); From dac092195b6a35bc7c9f11e2884cfecb1b25e20c Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Wed, 12 Nov 2025 16:45:36 +0800 Subject: [PATCH 28/58] ext4: rename EXT4_GET_BLOCKS_PRE_IO This flag has been generalized to split an unwritten extent when we do dio or dioread_nolock writeback, or to avoid merge new extents which was created by extents split. Update some related comments too. 
Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Signed-off-by: Yang Erkun Message-ID: <20251112084538.1658232-2-yangerkun@huawei.com> Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 21 +++++++++++++++------ fs/ext4/extents.c | 16 ++++++++-------- fs/ext4/inode.c | 2 +- include/trace/events/ext4.h | 2 +- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9f127aedbaee..9df4f3ddfe42 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -695,13 +695,22 @@ enum { /* Caller is from the delayed allocation writeout path * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 - /* caller is from the direct IO path, request to creation of an - unwritten extents if not allocated, split the unwritten - extent if blocks has been preallocated already*/ -#define EXT4_GET_BLOCKS_PRE_IO 0x0008 -#define EXT4_GET_BLOCKS_CONVERT 0x0010 -#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + /* + * This means that we cannot merge newly allocated extents, and if we + * found an unwritten extent, we need to split it. + */ +#define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008 + /* + * Caller is from the dio or dioread_nolock buffered IO, reqest to + * create an unwritten extent if it does not exist or split the + * found unwritten extent. Also do not merge the newly created + * unwritten extent, io end will convert unwritten to written, + * and try to merge the written extent. + */ +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_SPLIT_NOMERGE|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Convert unwritten extent to initialized. 
*/ +#define EXT4_GET_BLOCKS_CONVERT 0x0010 /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c7d219e6c6d8..b9be5d8320de 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -333,7 +333,7 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode, int nofail) { int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); - int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; + int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; @@ -2002,7 +2002,7 @@ ext4_ext_insert_extent(handle_t *handle, struct inode *inode, } /* try to insert block into found extent and return */ - if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { + if (ex && !(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) { /* * Try to see whether we should rather test the extent on @@ -2181,7 +2181,7 @@ has_space: merge: /* try to merge extents */ - if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) + if (!(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ @@ -3224,7 +3224,7 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, else ext4_ext_mark_initialized(ex); - if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) + if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); @@ -3368,7 +3368,7 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; - flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; + flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (unwritten) split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; @@ -3739,7 +3739,7 @@ static struct 
ext4_ext_path *ext4_split_convert_extents(handle_t *handle, EXT4_EXT_MAY_ZEROOUT : 0; split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } - flags |= EXT4_GET_BLOCKS_PRE_IO; + flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE; return ext4_split_extent(handle, inode, path, map, split_flag, flags, allocated); } @@ -3911,7 +3911,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, *allocated, newblock); /* get_block() before submitting IO, split the extent */ - if (flags & EXT4_GET_BLOCKS_PRE_IO) { + if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) { path = ext4_split_convert_extents(handle, inode, map, path, flags | EXT4_GET_BLOCKS_CONVERT, allocated); if (IS_ERR(path)) @@ -5618,7 +5618,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) path = ext4_split_extent_at(handle, inode, path, start_lblk, split_flag, EXT4_EX_NOCACHE | - EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_SPLIT_NOMERGE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 32d9f0b36c33..3883793425cb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -653,7 +653,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, * If the extent has been zeroed out, we don't need to update * extent status tree. 
*/ - if (flags & EXT4_GET_BLOCKS_PRE_IO && + if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es)) return retval; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index a05bdd48e16e..fd76d14c2776 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -39,7 +39,7 @@ struct partial_cluster; { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ { EXT4_GET_BLOCKS_UNWRIT_EXT, "UNWRIT" }, \ { EXT4_GET_BLOCKS_DELALLOC_RESERVE, "DELALLOC" }, \ - { EXT4_GET_BLOCKS_PRE_IO, "PRE_IO" }, \ + { EXT4_GET_BLOCKS_SPLIT_NOMERGE, "SPLIT_NOMERGE" }, \ { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ From a9272422316f6c0ddbdfd03e695079e2b3655995 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Wed, 12 Nov 2025 16:45:37 +0800 Subject: [PATCH 29/58] ext4: cleanup for ext4_map_blocks Retval from ext4_map_create_blocks means we really create some blocks, which cannot happen with m_flags without EXT4_MAP_UNWRITTEN and EXT4_MAP_MAPPED. 
Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Signed-off-by: Yang Erkun Message-ID: <20251112084538.1658232-3-yangerkun@huawei.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3883793425cb..8e694c56d3b6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -816,7 +816,13 @@ found: down_write(&EXT4_I(inode)->i_data_sem); retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + + if (retval < 0) + ext_debug(inode, "failed with err %d\n", retval); + if (retval <= 0) + return retval; + + if (map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; @@ -845,12 +851,8 @@ found: return ret; } } - if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || - map->m_flags & EXT4_MAP_MAPPED)) - ext4_fc_track_range(handle, inode, map->m_lblk, - map->m_lblk + map->m_len - 1); - if (retval < 0) - ext_debug(inode, "failed with err %d\n", retval); + ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + + map->m_len - 1); return retval; } From cc742fd1d184bb2a11bacf50587d2c85290622e4 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Wed, 12 Nov 2025 16:45:38 +0800 Subject: [PATCH 30/58] ext4: correct the comments place for EXT4_EXT_MAY_ZEROOUT Move the comments just before we set EXT4_EXT_MAY_ZEROOUT in ext4_split_convert_extents. 
Signed-off-by: Yang Erkun Message-ID: <20251112084538.1658232-4-yangerkun@huawei.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b9be5d8320de..885e933b6803 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3721,10 +3721,6 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; - /* - * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully inside i_size or new_size. - */ depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); @@ -3735,6 +3731,10 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, split_flag |= EXT4_EXT_DATA_VALID1; /* Convert to initialized */ } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully inside i_size or new_size. + */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); From 4ada1e4f8937f1ae6b620e7b1c76e02b9b3a15be Mon Sep 17 00:00:00 2001 From: Haodong Tian Date: Wed, 12 Nov 2025 23:59:16 +0800 Subject: [PATCH 31/58] fs/ext4: fix typo in comment Correct 'metdata' -> 'metadata' in comment. Signed-off-by: Haodong Tian Reviewed-by: Darrick J. Wong Message-ID: <20251112155916.3007639-1-tianhd25@mails.tsinghua.edu.cn> Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index c9329ed5c094..8040c731b3e4 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -752,7 +752,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, *count = ar.len; /* * Account for the allocated meta blocks. 
We will never - * fail EDQUOT for metdata, but we do account for it. + * fail EDQUOT for metadata, but we do account for it. */ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { dquot_alloc_block_nofail(inode, From 39fc6d4d3527d790f090dcb10bdb82fd1a1d925a Mon Sep 17 00:00:00 2001 From: Daniel Tang Date: Wed, 19 Nov 2025 09:32:13 -0500 Subject: [PATCH 32/58] Documentation: ext4: Document casefold and encrypt flags Based on ext4(5) and fs/ext4/ext4.h. For INCOMPAT_ENCRYPT, it's possible to create a new filesystem with that flag without creating any encrypted inodes. ext4(5) says it adds "support" but doesn't say whether anything's actually present like COMPAT_RESIZE_INODE does. Signed-off-by: Daniel Tang Message-ID: <4506189.9SDvczpPoe@daniel-desktop3> Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4/inodes.rst | 2 ++ Documentation/filesystems/ext4/super.rst | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/ext4/inodes.rst b/Documentation/filesystems/ext4/inodes.rst index cfc6c1659931..55cd5c380e92 100644 --- a/Documentation/filesystems/ext4/inodes.rst +++ b/Documentation/filesystems/ext4/inodes.rst @@ -297,6 +297,8 @@ The ``i_flags`` field is a combination of these values: - Inode has inline data (EXT4_INLINE_DATA_FL). * - 0x20000000 - Create children with the same project ID (EXT4_PROJINHERIT_FL). + * - 0x40000000 + - Use case-insensitive lookups for directory contents (EXT4_CASEFOLD_FL). * - 0x80000000 - Reserved for ext4 library (EXT4_RESERVED_FL). * - diff --git a/Documentation/filesystems/ext4/super.rst b/Documentation/filesystems/ext4/super.rst index 1b240661bfa3..9a59cded9bd7 100644 --- a/Documentation/filesystems/ext4/super.rst +++ b/Documentation/filesystems/ext4/super.rst @@ -671,7 +671,9 @@ following: * - 0x8000 - Data in inode (INCOMPAT_INLINE_DATA). * - 0x10000 - - Encrypted inodes are present on the filesystem. (INCOMPAT_ENCRYPT). + - Encrypted inodes can be present. (INCOMPAT_ENCRYPT). 
+ * - 0x20000 + - Directories can be marked case-insensitive. (INCOMPAT_CASEFOLD). .. _super_rocompat: From 7c11c56eb32eae96893eebafdbe3decadefe88ad Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 20 Nov 2025 21:42:33 +0800 Subject: [PATCH 33/58] ext4: align max orphan file size with e2fsprogs limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kernel commit 0a6ce20c1564 ("ext4: verify orphan file size is not too big") limits the maximum supported orphan file size to 8 << 20. However, in e2fsprogs, the orphan file size is set to 32–512 filesystem blocks when creating a filesystem. With 64k block size, formatting an ext4 fs >32G gives an orphan file bigger than the kernel allows, so mount prints an error and fails: EXT4-fs (vdb): orphan file too big: 8650752 EXT4-fs (vdb): mount failed To prevent this issue and allow previously created 64KB filesystems to mount, we updates the maximum allowed orphan file size in the kernel to 512 filesystem blocks. Fixes: 0a6ce20c1564 ("ext4: verify orphan file size is not too big") Signed-off-by: Baokun Li Reviewed-by: Jan Kara Message-ID: <20251120134233.2994147-1-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/orphan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 82d5e7501455..fb57bba0d19d 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -8,6 +8,8 @@ #include "ext4.h" #include "ext4_jbd2.h" +#define EXT4_MAX_ORPHAN_FILE_BLOCKS 512 + static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) { int i, j, start; @@ -588,7 +590,7 @@ int ext4_init_orphan_info(struct super_block *sb) * consuming absurd amounts of memory when pinning blocks of orphan * file in memory. 
*/ - if (inode->i_size > 8 << 20) { + if (inode->i_size > (EXT4_MAX_ORPHAN_FILE_BLOCKS << inode->i_blkbits)) { ext4_msg(sb, KERN_ERR, "orphan file too big: %llu", (unsigned long long)inode->i_size); ret = -EFSCORRUPTED; From 5835b1339e33549d9e7342fae56243b4fcd758c9 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 21 Nov 2025 17:06:31 +0800 Subject: [PATCH 34/58] ext4: remove page offset calculation in ext4_block_zero_page_range() For bs <= ps scenarios, calculating the offset within the block is sufficient. For bs > ps, an initial page offset calculation can lead to incorrect behavior. Thus this redundant calculation has been removed. Signed-off-by: Zhihao Cheng Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-2-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8e694c56d3b6..4afe227fd03f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4167,9 +4167,8 @@ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; - unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; - unsigned max = blocksize - (offset & (blocksize - 1)); + unsigned int max = blocksize - (from & (blocksize - 1)); /* * correct length if it does not fall between From b73f45a32420a8393e92fb2dec3b7d109e565127 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:32 +0800 Subject: [PATCH 35/58] ext4: remove page offset calculation in ext4_block_truncate_page() For bs <= ps scenarios, calculating the offset within the block is sufficient. For bs > ps, an initial page offset calculation can lead to incorrect behavior. Thus this redundant calculation has been removed. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-3-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4afe227fd03f..d232154cc14d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4193,7 +4193,6 @@ static int ext4_block_zero_page_range(handle_t *handle, static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { - unsigned offset = from & (PAGE_SIZE-1); unsigned length; unsigned blocksize; struct inode *inode = mapping->host; @@ -4202,8 +4201,8 @@ static int ext4_block_truncate_page(handle_t *handle, if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); + blocksize = i_blocksize(inode); + length = blocksize - (from & (blocksize - 1)); return ext4_block_zero_page_range(handle, mapping, from, length); } From afa6d5a16bf2e354e183a4fcbcdb8578798e9942 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:33 +0800 Subject: [PATCH 36/58] ext4: remove PAGE_SIZE checks for rec_len conversion Previously, ext4_rec_len_(to|from)_disk only performed complex rec_len conversions when PAGE_SIZE >= 65536 to reduce complexity. However, we are soon to support file system block sizes greater than page size, which makes these conditional checks unnecessary. Thus, these checks are now removed. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-4-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9df4f3ddfe42..6c2540e94c78 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2484,28 +2484,19 @@ static inline unsigned int ext4_dir_rec_len(__u8 name_len, return (rec_len & ~EXT4_DIR_ROUND); } -/* - * If we ever get support for fs block sizes > page_size, we'll need - * to remove the #if statements in the next two functions... - */ static inline unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) { unsigned len = le16_to_cpu(dlen); -#if (PAGE_SIZE >= 65536) if (len == EXT4_MAX_REC_LEN || len == 0) return blocksize; return (len & 65532) | ((len & 3) << 16); -#else - return len; -#endif } static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); -#if (PAGE_SIZE >= 65536) if (len < 65536) return cpu_to_le16(len); if (len == blocksize) { @@ -2515,9 +2506,6 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) return cpu_to_le16(0); } return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); -#else - return cpu_to_le16(len); -#endif } /* From d37a7ddd3a384bd34f985273d6e776d3d50b0edd Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:34 +0800 Subject: [PATCH 37/58] ext4: make ext4_punch_hole() support large block size When preparing for bs > ps support, clean up unnecessary PAGE_SIZE references in ext4_punch_hole(). Previously, when a hole extended beyond i_size, we aligned the hole end upwards to PAGE_SIZE to handle partial folio invalidation. Now that truncate_inode_pages_range() already handles partial folio invalidation correctly, this alignment is no longer required. 
However, to save pointless tail block zeroing, we still keep rounding up to the block size here. In addition, as Honza pointed out, when the hole end equals i_size, it should also be rounded up to the block size. This patch fixes that as well. Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-5-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d232154cc14d..5cf392142c8c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4408,10 +4408,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) /* * If the hole extends beyond i_size, set the hole to end after - * the page that contains i_size. + * the block that contains i_size to save pointless tail block zeroing. */ - if (end > inode->i_size) - end = round_up(inode->i_size, PAGE_SIZE); + if (end >= inode->i_size) + end = round_up(inode->i_size, sb->s_blocksize); if (end > max_end) end = max_end; length = end - offset; From 58297412edf077870eedce2481db5755b4e98474 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:35 +0800 Subject: [PATCH 38/58] ext4: enable DIOREAD_NOLOCK by default for BS > PS as well The dioread_nolock related processes already support large folio, so dioread_nolock is enabled by default regardless of whether the blocksize is less than, equal to, or greater than PAGE_SIZE. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-6-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 707f890d674e..f39679892e71 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4392,8 +4392,7 @@ static void ext4_set_def_opts(struct super_block *sb, ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); - if (sb->s_blocksize <= PAGE_SIZE) - set_opt(sb, DIOREAD_NOLOCK); + set_opt(sb, DIOREAD_NOLOCK); } static int ext4_handle_clustersize(struct super_block *sb) From 8611e608a8fa01e8b82c9008b4dac9f24531ae0f Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:36 +0800 Subject: [PATCH 39/58] ext4: introduce s_min_folio_order for future BS > PS support This commit introduces the s_min_folio_order field to the ext4_sb_info structure. This field will store the minimum folio order required by the current filesystem, laying groundwork for future support of block sizes greater than PAGE_SIZE. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Pankaj Raghav Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-7-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 +++ fs/ext4/inode.c | 3 ++- fs/ext4/super.c | 10 +++++----- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6c2540e94c78..0502afa6f6e5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1697,6 +1697,9 @@ struct ext4_sb_info { /* record the last minlen when FITRIM is called. 
*/ unsigned long s_last_trim_minblks; + /* minimum folio order of a page cache allocation */ + unsigned int s_min_folio_order; + /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5cf392142c8c..ff697adab5f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5183,7 +5183,8 @@ void ext4_set_inode_mapping_order(struct inode *inode) if (!ext4_should_enable_large_folio(inode)) return; - mapping_set_folio_order_range(inode->i_mapping, 0, + mapping_set_folio_order_range(inode->i_mapping, + EXT4_SB(inode->i_sb)->s_min_folio_order, EXT4_MAX_PAGECACHE_ORDER(inode)); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f39679892e71..2286cd1e3cb8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5109,11 +5109,8 @@ static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, * If the default block size is not the same as the real block size, * we need to reload it. */ - if (sb->s_blocksize == blocksize) { - *lsb = logical_sb_block; - sbi->s_sbh = bh; - return 0; - } + if (sb->s_blocksize == blocksize) + goto success; /* * bh must be released before kill_bdev(), otherwise @@ -5144,6 +5141,9 @@ static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); goto out; } + +success: + sbi->s_min_folio_order = get_order(blocksize); *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; From 6a28b5c9908d6e7ea13eae7a5872e8e081a397c4 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:37 +0800 Subject: [PATCH 40/58] ext4: support large block size in ext4_calculate_overhead() ext4_calculate_overhead() used a single page for its bitmap buffer, which worked fine when PAGE_SIZE >= block size. However, with block size greater than page size (BS > PS) support, the bitmap can exceed a single page. To address this, we now use kvmalloc() to allocate memory of the filesystem block size, to properly support BS > PS. 
Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-8-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2286cd1e3cb8..998ddce49093 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4191,7 +4191,7 @@ int ext4_calculate_overhead(struct super_block *sb) unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; - char *buf = (char *) get_zeroed_page(GFP_NOFS); + char *buf = kvmalloc(sb->s_blocksize, GFP_NOFS | __GFP_ZERO); if (!buf) return -ENOMEM; @@ -4216,7 +4216,7 @@ int ext4_calculate_overhead(struct super_block *sb) blks = count_overhead(sb, i, buf); overhead += blks; if (blks) - memset(buf, 0, PAGE_SIZE); + memset(buf, 0, sb->s_blocksize); cond_resched(); } @@ -4239,7 +4239,7 @@ int ext4_calculate_overhead(struct super_block *sb) } sbi->s_overhead = overhead; smp_wmb(); - free_page((unsigned long) buf); + kvfree(buf); return 0; } From 609c5e0081b432caaa557ffcf1318aefe1187c4e Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:38 +0800 Subject: [PATCH 41/58] ext4: support large block size in ext4_readdir() In ext4_readdir(), page_cache_sync_readahead() is used to readahead mapped physical blocks. With LBS support, this can lead to a negative right shift. To fix this, the page index is now calculated by first converting the physical block number (pblk) to a file position (pos) before converting it to a page index. Also, the correct number of pages to readahead is now passed. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Pankaj Raghav Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-9-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d4164c507a90..256fe2c1d4c1 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -192,13 +192,13 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) continue; } if (err > 0) { - pgoff_t index = map.m_pblk >> - (PAGE_SHIFT - inode->i_blkbits); + pgoff_t index = map.m_pblk << inode->i_blkbits >> + PAGE_SHIFT; if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_mapping, - &file->f_ra, file, - index, 1); + &file->f_ra, file, index, + 1 << EXT4_SB(sb)->s_min_folio_order); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); if (IS_ERR(bh)) { From 125d1f6a5a77ed6a1af3eb0957240f54e4124af2 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:39 +0800 Subject: [PATCH 42/58] ext4: add EXT4_LBLK_TO_B macro for logical block to bytes conversion No functional changes. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-10-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 20 +++++++++----------- fs/ext4/namei.c | 8 +++----- fs/ext4/verity.c | 2 +- 5 files changed, 15 insertions(+), 18 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0502afa6f6e5..4581ff8aef19 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -368,6 +368,7 @@ struct ext4_io_submit { blkbits)) #define EXT4_B_TO_LBLK(inode, offset) \ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) +#define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 885e933b6803..2cf5759ba689 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4562,7 +4562,7 @@ retry: * allow a full retry cycle for any remaining allocations */ retries = 0; - epos = (loff_t)(map.m_lblk + ret) << blkbits; + epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret); inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ff697adab5f6..29259e10d78e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -837,9 +837,8 @@ found: !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { - loff_t start_byte = - (loff_t)map->m_lblk << inode->i_blkbits; - loff_t length = (loff_t)map->m_len << inode->i_blkbits; + loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk); + loff_t length = EXT4_LBLK_TO_B(inode, map->m_len); if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, @@ -2235,7 +2234,6 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; 
int err = 0; - int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); @@ -2261,7 +2259,8 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, err = PTR_ERR(io_end_vec); goto out; } - io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; + io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode, + mpd->map.m_lblk); } *map_bh = true; goto out; @@ -2271,7 +2270,7 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); - io_end_size += (1 << blkbits); + io_end_size += i_blocksize(mpd->inode); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; @@ -2473,7 +2472,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); - io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; + io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { @@ -3513,8 +3512,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; - iomap->offset = (u64) map->m_lblk << blkbits; - iomap->length = (u64) map->m_len << blkbits; + iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); + iomap->length = EXT4_LBLK_TO_B(inode, map->m_len); if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -3688,7 +3687,6 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; - u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; bool force_commit = false; @@ -3747,7 +3745,7 @@ retry: * i_disksize out to i_size. 
This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ - else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) + else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 045616033515..c4b5e252af0e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1076,7 +1076,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, - (block<i_sb)) + EXT4_LBLK_TO_B(dir, block) + ((char *)de - bh->b_data))) { /* silently ignore the rest of the block */ break; @@ -1630,7 +1630,7 @@ restart: } set_buffer_verified(bh); i = search_dirblock(bh, dir, fname, - block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); + EXT4_LBLK_TO_B(dir, block), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; ret = bh; @@ -1710,7 +1710,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir) { - struct super_block * sb = dir->i_sb; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; @@ -1729,8 +1728,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, goto errout; retval = search_dirblock(bh, dir, fname, - block << EXT4_BLOCK_SIZE_BITS(sb), - res_dir); + EXT4_LBLK_TO_B(dir, block), res_dir); if (retval == 1) goto success; brelse(bh); diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index b0acb0c50313..415d9c4d8a32 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -302,7 +302,7 @@ static int ext4_get_verity_descriptor_location(struct inode *inode, end_lblk = le32_to_cpu(last_extent->ee_block) + ext4_ext_get_actual_len(last_extent); - desc_size_pos = 
(u64)end_lblk << inode->i_blkbits; + desc_size_pos = EXT4_LBLK_TO_B(inode, end_lblk); ext4_free_ext_path(path); if (desc_size_pos < sizeof(desc_size_disk)) From 2a8de76b2b0f84333a2778db04ce51811c260d9d Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:40 +0800 Subject: [PATCH 43/58] ext4: add EXT4_LBLK_TO_PG and EXT4_PG_TO_LBLK for block/page conversion As BS > PS support is coming, all block number to page index (and vice-versa) conversions must now go via bytes. Added EXT4_LBLK_TO_PG() and EXT4_PG_TO_LBLK() macros to simplify these conversions and handle both BS <= PS and BS > PS scenarios cleanly. Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-11-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4581ff8aef19..5ade4e22dcff 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -370,6 +370,12 @@ struct ext4_io_submit { (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) #define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits) +/* Translate a block number to a page index */ +#define EXT4_LBLK_TO_PG(inode, lblk) (EXT4_LBLK_TO_B((inode), (lblk)) >> \ + PAGE_SHIFT) +/* Translate a page index to a block number */ +#define EXT4_PG_TO_LBLK(inode, pnum) (((loff_t)(pnum) << PAGE_SHIFT) >> \ + (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) /* Translate a cluster number to a block number */ From 6117f1806a7328e8d316eeeac66b2ba4e4539ba5 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:41 +0800 Subject: [PATCH 44/58] ext4: support large block size in ext4_mb_load_buddy_gfp() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, ext4_mb_load_buddy_gfp() uses 
blocks_per_page to calculate the folio index and offset. However, when blocksize is larger than PAGE_SIZE, blocks_per_page becomes zero, leading to a potential division-by-zero bug. To support BS > PS, use bytes to compute folio index and offset within folio to get rid of blocks_per_page. Also, if buddy and bitmap land in the same folio, we get that folio’s ref instead of looking it up again before updating the buddy. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-12-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 65335248825c..38af89f5da56 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1659,17 +1659,15 @@ err: /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the - * block group lock of all groups for this page; do not hold the BG lock when + * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { - int blocks_per_page; int block; int pnum; - int poff; struct folio *folio; int ret; struct ext4_group_info *grp; @@ -1679,7 +1677,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, might_sleep(); mb_debug(sb, "load group %u\n", group); - blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); if (!grp) return -EFSCORRUPTED; @@ -1707,8 +1704,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, * So for each group we need two blocks. */ block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; + pnum = EXT4_LBLK_TO_PG(inode, block); /* Avoid locking the folio in the fast path ... 
*/ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); @@ -1740,7 +1736,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, goto err; } mb_cmp_bitmaps(e4b, folio_address(folio) + - (poff * sb->s_blocksize)); + offset_in_folio(folio, + EXT4_LBLK_TO_B(inode, block))); } folio_unlock(folio); } @@ -1756,12 +1753,18 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, /* Folios marked accessed already */ e4b->bd_bitmap_folio = folio; - e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); + e4b->bd_bitmap = folio_address(folio) + + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; + pnum = EXT4_LBLK_TO_PG(inode, block); + /* buddy and bitmap are on the same folio? */ + if (folio_contains(folio, pnum)) { + folio_get(folio); + goto update_buddy; + } + /* we need another folio for the buddy */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) @@ -1796,9 +1799,11 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, goto err; } +update_buddy: /* Folios marked accessed already */ e4b->bd_buddy_folio = folio; - e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); + e4b->bd_buddy = folio_address(folio) + + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); return 0; From 3938fc29f89fff132929b2fea2fe00b0f43617ca Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:42 +0800 Subject: [PATCH 45/58] ext4: support large block size in ext4_mb_get_buddy_page_lock() Currently, ext4_mb_get_buddy_page_lock() uses blocks_per_page to calculate folio index and offset. However, when blocksize is larger than PAGE_SIZE, blocks_per_page becomes zero, leading to a potential division-by-zero bug. To support BS > PS, use bytes to compute folio index and offset within folio to get rid of blocks_per_page. 
Also, since ext4_mb_get_buddy_page_lock() already fully supports folio, rename it to ext4_mb_get_buddy_folio_lock(). Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-13-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 38af89f5da56..c1941afce6e2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1527,50 +1527,52 @@ out: } /* - * Lock the buddy and bitmap pages. This make sure other parallel init_group - * on the same buddy page doesn't happen whild holding the buddy page lock. - * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap - * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. + * Lock the buddy and bitmap folios. This makes sure other parallel init_group + * on the same buddy folio doesn't happen while holding the buddy folio lock. + * Return locked buddy and bitmap folios on e4b struct. If buddy and bitmap + * are on the same folio e4b->bd_buddy_folio is NULL and return value is 0. */ -static int ext4_mb_get_buddy_page_lock(struct super_block *sb, +static int ext4_mb_get_buddy_folio_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; - int block, pnum, poff; - int blocks_per_page; + int block, pnum; struct folio *folio; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; - blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. 
*/ block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; + pnum = EXT4_LBLK_TO_PG(inode, block); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); + WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); e4b->bd_bitmap_folio = folio; - e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); + e4b->bd_bitmap = folio_address(folio) + + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); - if (blocks_per_page >= 2) { - /* buddy and bitmap are on the same page */ + block++; + pnum = EXT4_LBLK_TO_PG(inode, block); + if (folio_contains(folio, pnum)) { + /* buddy and bitmap are on the same folio */ return 0; } - /* blocks_per_page == 1, hence we need another page for the buddy */ - folio = __filemap_get_folio(inode->i_mapping, block + 1, + /* we need another folio for the buddy */ + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); + WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); e4b->bd_buddy_folio = folio; return 0; } @@ -1609,14 +1611,14 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) /* * This ensures that we don't reinit the buddy cache - * page which map to the group from which we are already + * folio which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that - * would have pinned buddy page to page cache. - * The call to ext4_mb_get_buddy_page_lock will mark the - * page accessed. + * would have pinned buddy folio to page cache. + * The call to ext4_mb_get_buddy_folio_lock will mark the + * folio accessed. 
*/ - ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); + ret = ext4_mb_get_buddy_folio_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group From 0ad55fa104a2aa5980c12f88546fab328d34644b Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:43 +0800 Subject: [PATCH 46/58] ext4: support large block size in ext4_mb_init_cache() Currently, ext4_mb_init_cache() uses blocks_per_page to calculate the folio index and offset. However, when blocksize is larger than PAGE_SIZE, blocks_per_page becomes zero, leading to a potential division-by-zero bug. Since we now have the folio, we know its exact size. This allows us to convert {blocks, groups}_per_page to {blocks, groups}_per_folio, thus supporting block sizes greater than page size. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-14-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c1941afce6e2..4801cea6badd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1346,26 +1346,25 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b) * block bitmap and buddy information. The information are * stored in the inode as * - * { page } + * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. - * So for each group we take up 2 blocks. A page can - * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. - * So it can have information regarding groups_per_page which - * is blocks_per_page/2 + * So for each group we take up 2 blocks. A folio can + * contain blocks_per_folio (folio_size / blocksize) blocks. 
+ * So it can have information regarding groups_per_folio which + * is blocks_per_folio/2 * * Locking note: This routine takes the block group lock of all groups - * for this page; do not hold this lock when calling this routine! + * for this folio; do not hold this lock when calling this routine! */ - static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; - int blocks_per_page; - int groups_per_page; + int blocks_per_folio; + int groups_per_folio; int err = 0; int i; ext4_group_t first_group, group; @@ -1382,27 +1381,24 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); - blocks_per_page = PAGE_SIZE / blocksize; + blocks_per_folio = folio_size(folio) / blocksize; + WARN_ON_ONCE(!blocks_per_folio); + groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2); mb_debug(sb, "init folio %lu\n", folio->index); - groups_per_page = blocks_per_page >> 1; - if (groups_per_page == 0) - groups_per_page = 1; - /* allocate buffer_heads to read bitmaps */ - if (groups_per_page > 1) { - i = sizeof(struct buffer_head *) * groups_per_page; + if (groups_per_folio > 1) { + i = sizeof(struct buffer_head *) * groups_per_folio; bh = kzalloc(i, gfp); if (bh == NULL) return -ENOMEM; } else bh = &bhs; - first_group = folio->index * blocks_per_page / 2; - /* read all groups the folio covers into the cache */ - for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + first_group = EXT4_PG_TO_LBLK(inode, folio->index) / 2; + for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { if (group >= ngroups) break; @@ -1410,7 +1406,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) if (!grinfo) continue; /* - * If page is uptodate then we came here after online resize + * If folio is uptodate then we came here after online resize * which added some new 
uninitialized group info structs, so * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. @@ -1430,7 +1426,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) } /* wait for I/O completion */ - for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { int err2; if (!bh[i]) @@ -1440,8 +1436,8 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) err = err2; } - first_block = folio->index * blocks_per_page; - for (i = 0; i < blocks_per_page; i++) { + first_block = EXT4_PG_TO_LBLK(inode, folio->index); + for (i = 0; i < blocks_per_folio; i++) { group = (first_block + i) >> 1; if (group >= ngroups) break; @@ -1518,7 +1514,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) out: if (bh) { - for (i = 0; i < groups_per_page; i++) + for (i = 0; i < groups_per_folio; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); From 31daa8261c54404513cf7ac81d1f79ff1cdbc36e Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:44 +0800 Subject: [PATCH 47/58] ext4: prepare buddy cache inode for BS > PS with large folios We use EXT4_BAD_INO for the buddy cache inode number. This inode is not accessed via __ext4_new_inode() or __ext4_iget(), meaning ext4_set_inode_mapping_order() is not called to set its folio order range. However, future block size greater than page size support requires this inode to support large folios, and the buddy cache code already handles BS > PS. Therefore, ext4_set_inode_mapping_order() is now explicitly called for this specific inode to set its folio order range. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-15-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4801cea6badd..c4ac3142c0d7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3510,6 +3510,8 @@ static int ext4_mb_init_backend(struct super_block *sb) * this will avoid confusion if it ever shows up during debugging. */ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + ext4_set_inode_mapping_order(sbi->s_buddy_cache); + for (i = 0; i < ngroups; i++) { cond_resched(); desc = ext4_get_group_desc(sb, i, NULL); From 65c39954bb92b3f2ab5fb179225b8c8788c79afd Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 21 Nov 2025 17:06:45 +0800 Subject: [PATCH 48/58] ext4: rename 'page' references to 'folio' in multi-block allocator The ext4 multi-block allocator now fully supports folio objects. Update all variable names, function names, and comments to replace legacy 'page' terminology with 'folio', improving clarity and consistency. No functional changes. Signed-off-by: Zhihao Cheng Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-16-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c4ac3142c0d7..56d50fd3310b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -98,14 +98,14 @@ * block bitmap and buddy information. The information are stored in the * inode as: * - * { page } + * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we - * take up 2 blocks. 
A page can contain blocks_per_page (PAGE_SIZE / - * blocksize) blocks. So it can have information regarding groups_per_page - * which is blocks_per_page/2 + * take up 2 blocks. A folio can contain blocks_per_folio (folio_size / + * blocksize) blocks. So it can have information regarding groups_per_folio + * which is blocks_per_folio/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted. @@ -1573,7 +1573,7 @@ static int ext4_mb_get_buddy_folio_lock(struct super_block *sb, return 0; } -static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) { folio_unlock(e4b->bd_bitmap_folio); @@ -1587,7 +1587,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the - * block group lock of all groups for this page; do not hold the BG lock when + * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack @@ -1635,7 +1635,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in - * the same page we don't need to force + * the same folio we don't need to force * init the buddy */ ret = 0; @@ -1651,7 +1651,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } err: - ext4_mb_put_buddy_page_lock(&e4b); + ext4_mb_put_buddy_folio_lock(&e4b); return ret; } @@ -2244,7 +2244,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ac->ac_buddy = ret >> 16; /* - * take the page reference. We want the page to be pinned + * take the folio reference. We want the folio to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. 
The reference is dropped @@ -2950,7 +2950,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac, if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) return 0; - /* This now checks without needing the buddy page */ + /* This now checks without needing the buddy folio */ ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { if (!ac->ac_first_err) @@ -4742,7 +4742,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) "ext4: mb_load_buddy failed (%d)", err)) /* * This should never happen since we pin the - * pages in the ext4_allocation_context so + * folios in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. */ return; From a6d73242b8b5caa9f9a529eab49cc1e85ace9890 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:46 +0800 Subject: [PATCH 49/58] ext4: support large block size in ext4_mpage_readpages() Use the EXT4_PG_TO_LBLK() macro to convert folio indexes to blocks to avoid negative left shifts after supporting blocksize greater than PAGE_SIZE. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-17-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/readpage.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index f329daf6e5c7..e7f2350c725b 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -213,9 +213,7 @@ int ext4_mpage_readpages(struct inode *inode, { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t next_block; sector_t block_in_file; @@ -251,9 +249,8 @@ int ext4_mpage_readpages(struct inode *inode, blocks_per_folio = folio_size(folio) >> blkbits; first_hole = blocks_per_folio; - block_in_file = next_block = - (sector_t)folio->index << (PAGE_SHIFT - blkbits); - last_block = block_in_file + nr_pages * blocks_per_page; + block_in_file = next_block = EXT4_PG_TO_LBLK(inode, folio->index); + last_block = EXT4_PG_TO_LBLK(inode, folio->index + nr_pages); last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) From bff6235d623a022260b8af5559ced3534fb7fc2e Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:47 +0800 Subject: [PATCH 50/58] ext4: support large block size in ext4_block_write_begin() Use the EXT4_PG_TO_LBLK() macro to convert folio indexes to blocks to avoid negative left shifts after supporting blocksize greater than PAGE_SIZE. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-18-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 29259e10d78e..269c1ded169b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1170,8 +1170,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, unsigned block_start, block_end; sector_t block; int err = 0; - unsigned blocksize = inode->i_sb->s_blocksize; - unsigned bbits; + unsigned int blocksize = i_blocksize(inode); struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; @@ -1180,12 +1179,12 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, BUG_ON(!folio_test_locked(folio)); BUG_ON(to > folio_size(folio)); BUG_ON(from > to); + WARN_ON_ONCE(blocksize > folio_size(folio)); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); - bbits = ilog2(blocksize); - block = (sector_t)folio->index << (PAGE_SHIFT - bbits); + block = EXT4_PG_TO_LBLK(inode, folio->index); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { From b967ab748765bf2cf9512efaa8aa987ab4482c7d Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:48 +0800 Subject: [PATCH 51/58] ext4: support large block size in mpage_map_and_submit_buffers() Use the EXT4_PG_TO_LBLK/EXT4_LBLK_TO_PG macros to complete the conversion between folio indexes and blocks to avoid negative left/right shifts after supporting blocksize greater than PAGE_SIZE. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-19-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 269c1ded169b..847770f57bfc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2299,15 +2299,14 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; - int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; - start = mpd->map.m_lblk >> bpp_bits; - end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; + start = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk); + end = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk + mpd->map.m_len - 1); pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2318,7 +2317,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; - lblk = folio->index << bpp_bits; + lblk = EXT4_PG_TO_LBLK(inode, folio->index); err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* From 8e50e23b769ace4885fc132e6fca2b4343c27fb1 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:49 +0800 Subject: [PATCH 52/58] ext4: support large block size in mpage_prepare_extent_to_map() Use the EXT4_PG_TO_LBLK/EXT4_LBLK_TO_PG macros to complete the conversion between folio indexes and blocks to avoid negative left/right shifts after supporting blocksize greater than PAGE_SIZE. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-20-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 847770f57bfc..8cf33a58d1a6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2620,7 +2620,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; - int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; @@ -2659,7 +2658,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= - mpd->map.m_len >> (PAGE_SHIFT - blkbits)) + EXT4_LBLK_TO_PG(mpd->inode, mpd->map.m_len)) goto out; /* If we can't merge this page, we are done. */ @@ -2737,8 +2736,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)folio->index) << - (PAGE_SHIFT - blkbits); + lblk = EXT4_PG_TO_LBLK(mpd->inode, folio->index); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); From c00a6292d0616c304cb712d823370f1a82f899b2 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 21 Nov 2025 17:06:50 +0800 Subject: [PATCH 53/58] ext4: support large block size in __ext4_block_zero_page_range() Use the EXT4_PG_TO_LBLK() macro to convert folio indexes to blocks to avoid negative left shifts after supporting blocksize greater than PAGE_SIZE. 
Signed-off-by: Zhihao Cheng Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-21-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8cf33a58d1a6..3b6f66463add 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4076,7 +4076,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, blocksize = inode->i_sb->s_blocksize; - iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = EXT4_PG_TO_LBLK(inode, folio->index); bh = folio_buffers(folio); if (!bh) From 58fd191f99f3791c6687e98041c89a6477d9f64d Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:51 +0800 Subject: [PATCH 54/58] ext4: make data=journal support large block size Currently, ext4_set_inode_mapping_order() does not set max folio order for files with the data journalling flag. For files that already have large folios enabled, ext4_inode_journal_mode() ignores the data journalling flag once max folio order is set. This is not because data journalling cannot work with large folios, but because credit estimates will go through the roof if there are too many blocks per folio. Since the real constraint is blocks-per-folio, to support data=journal under LBS, we now set max folio order to be equal to min folio order for files with the journalling flag. When LBS is disabled, the max folio order remains unset as before. Therefore, before ext4_change_inode_journal_flag() switches the journalling mode, we call truncate_pagecache() to drop all page cache for that inode, and filemap_write_and_wait() is called unconditionally. After that, once the journalling mode has been switched, we can safely reset the inode mapping order, and the mapping_large_folio_support() check in ext4_inode_journal_mode() can be removed. 
Suggested-by: Jan Kara Suggested-by: Dan Carpenter Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-22-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.c | 3 +-- fs/ext4/inode.c | 33 +++++++++++++++++++-------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index a0e66bc10093..05e5946ed9b3 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -16,8 +16,7 @@ int ext4_inode_journal_mode(struct inode *inode) ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC) && - !mapping_large_folio_support(inode->i_mapping))) { + !test_opt(inode->i_sb, DELALLOC))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3b6f66463add..1eab837e47c5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5154,9 +5154,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode) if (!S_ISREG(inode->i_mode)) return false; - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return false; if (ext4_has_feature_verity(sb)) return false; if (ext4_has_feature_encrypt(sb)) @@ -5174,12 +5171,20 @@ static bool ext4_should_enable_large_folio(struct inode *inode) umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) void ext4_set_inode_mapping_order(struct inode *inode) { + u32 max_order; + if (!ext4_should_enable_large_folio(inode)) return; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + max_order = EXT4_SB(inode->i_sb)->s_min_folio_order; + else + 
max_order = EXT4_MAX_PAGECACHE_ORDER(inode); + mapping_set_folio_order_range(inode->i_mapping, EXT4_SB(inode->i_sb)->s_min_folio_order, - EXT4_MAX_PAGECACHE_ORDER(inode)); + max_order); } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -6554,14 +6559,14 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) * dirty data which can be converted only after flushing the dirty * data (and journalled aops don't know how to handle these cases). */ - if (val) { - filemap_invalidate_lock(inode->i_mapping); - err = filemap_write_and_wait(inode->i_mapping); - if (err < 0) { - filemap_invalidate_unlock(inode->i_mapping); - return err; - } + filemap_invalidate_lock(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); + if (err < 0) { + filemap_invalidate_unlock(inode->i_mapping); + return err; } + /* Before switch the inode journalling mode evict all the page cache. */ + truncate_pagecache(inode, 0); alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); @@ -6581,17 +6586,17 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (err < 0) { jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); + filemap_invalidate_unlock(inode->i_mapping); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); + ext4_set_inode_mapping_order(inode); jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); - - if (val) - filemap_invalidate_unlock(inode->i_mapping); + filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. 
*/ From 1a3e9e8aa4f72440b00ef6171b4198f82822d679 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:52 +0800 Subject: [PATCH 55/58] ext4: support verifying data from large folios with fs-verity Eric Biggers already added support for verifying data from large folios several years ago in commit 5d0f0e57ed90 ("fsverity: support verifying data from large folios"). With ext4 now supporting large block sizes, the fs-verity tests `kvm-xfstests -c ext4/64k -g verity -x encrypt` pass without issues. Therefore, remove the restriction and allow large folios to be enabled together with fs-verity. Cc: Eric Biggers Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-23-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1eab837e47c5..a566469ae07a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5154,8 +5154,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode) if (!S_ISREG(inode->i_mode)) return false; - if (ext4_has_feature_verity(sb)) - return false; if (ext4_has_feature_encrypt(sb)) return false; From 709f0f1f1bf5ca62a000084e5446ca6b57c8678c Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:53 +0800 Subject: [PATCH 56/58] ext4: add checks for large folio incompatibilities when BS > PS Supporting a block size greater than the page size (BS > PS) requires support for large folios. However, several features (e.g., encrypt) do not yet support large folios. To prevent conflicts, this patch adds checks at mount time to prohibit these features from being used when BS > PS. Since these features cannot be changed on remount, there is no need to check on remount. This patch adds s_max_folio_order, initialized during mount according to filesystem features and mount options. If s_max_folio_order is 0, large folios are disabled. 
With this in place, ext4_set_inode_mapping_order() can be simplified by checking s_max_folio_order, avoiding redundant checks. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-24-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +++- fs/ext4/inode.c | 40 +++++++++++----------------------------- fs/ext4/super.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 30 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5ade4e22dcff..56112f201cac 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1705,7 +1705,9 @@ struct ext4_sb_info { unsigned long s_last_trim_minblks; /* minimum folio order of a page cache allocation */ - unsigned int s_min_folio_order; + u16 s_min_folio_order; + /* supported maximum folio order, 0 means not supported */ + u16 s_max_folio_order; /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a566469ae07a..7510fce3d0f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5148,41 +5148,23 @@ error: return -EFSCORRUPTED; } -static bool ext4_should_enable_large_folio(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - - if (!S_ISREG(inode->i_mode)) - return false; - if (ext4_has_feature_encrypt(sb)) - return false; - - return true; -} - -/* - * Limit the maximum folio order to 2048 blocks to prevent overestimation - * of reserve handle credits during the folio writeback in environments - * where the PAGE_SIZE exceeds 4KB. 
- */ -#define EXT4_MAX_PAGECACHE_ORDER(i) \ - umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) void ext4_set_inode_mapping_order(struct inode *inode) { - u32 max_order; + struct super_block *sb = inode->i_sb; + u16 min_order, max_order; - if (!ext4_should_enable_large_folio(inode)) + max_order = EXT4_SB(sb)->s_max_folio_order; + if (!max_order) return; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - max_order = EXT4_SB(inode->i_sb)->s_min_folio_order; - else - max_order = EXT4_MAX_PAGECACHE_ORDER(inode); + min_order = EXT4_SB(sb)->s_min_folio_order; + if (!min_order && !S_ISREG(inode->i_mode)) + return; - mapping_set_folio_order_range(inode->i_mapping, - EXT4_SB(inode->i_sb)->s_min_folio_order, - max_order); + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + max_order = min_order; + + mapping_set_folio_order_range(inode->i_mapping, min_order, max_order); } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 998ddce49093..3d03a6eb0ea0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5042,6 +5042,41 @@ static const char *ext4_has_journal_option(struct super_block *sb) return NULL; } +/* + * Limit the maximum folio order to 2048 blocks to prevent overestimation + * of reserve handle credits during the folio writeback in environments + * where the PAGE_SIZE exceeds 4KB. 
+ */
+#define EXT4_MAX_PAGECACHE_ORDER(sb) \ + umin(MAX_PAGECACHE_ORDER, (11 + (sb)->s_blocksize_bits - PAGE_SHIFT)) +static void ext4_set_max_mapping_order(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + sbi->s_max_folio_order = sbi->s_min_folio_order; + else + sbi->s_max_folio_order = EXT4_MAX_PAGECACHE_ORDER(sb); +} + +static int ext4_check_large_folio(struct super_block *sb) +{ + const char *err_str = NULL; + + if (ext4_has_feature_encrypt(sb)) + err_str = "encrypt"; + + if (!err_str) { + ext4_set_max_mapping_order(sb); + } else if (sb->s_blocksize > PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "bs(%lu) > ps(%lu) unsupported for %s", + sb->s_blocksize, PAGE_SIZE, err_str); + return -EINVAL; + } + + return 0; +} + static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { @@ -5318,6 +5353,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_apply_options(fc, sb); + err = ext4_check_large_folio(sb); + if (err < 0) + goto failed_mount; + err = ext4_encoding_init(sb, es); if (err) goto failed_mount; From cab8cbcb923a89cb583c9088fa50431eb2feded5 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:54 +0800 Subject: [PATCH 57/58] ext4: enable block size larger than page size Since block device (See commit 3c20917120ce ("block/bdev: enable large folio support for large logical block sizes")) and page cache (See commit ab95d23bab220ef8 ("filemap: allocate mapping_min_order folios in the page cache")) have the ability to have a minimum order when allocating folio, and ext4 has supported large folio in commit 7ac67301e82f ("ext4: enable large folio for regular file"), now add support for block_size > PAGE_SIZE in ext4. set_blocksize() -> bdev_validate_blocksize() already validates the block size, so ext4_load_super() does not need to perform additional checks. Here we only need to add the FS_LBS bit to fs_flags. 
In addition, block sizes larger than the page size are currently supported only when CONFIG_TRANSPARENT_HUGEPAGE is enabled. To make this explicit, a blocksize_gt_pagesize entry has been added under /sys/fs/ext4/feature/, indicating whether bs > ps is supported. This allows mke2fs to check the interface and determine whether a warning should be issued when formatting a filesystem with block size larger than the page size. Suggested-by: Theodore Ts'o Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Pankaj Raghav Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-25-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 3 ++- fs/ext4/sysfs.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3d03a6eb0ea0..87205660c5d0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -7453,7 +7453,8 @@ static struct file_system_type ext4_fs_type = { .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME | + FS_LBS, }; MODULE_ALIAS_FS("ext4"); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 987bd00f916a..0018e09b867e 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -332,6 +332,9 @@ EXT4_ATTR_FEATURE(fast_commit); #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) EXT4_ATTR_FEATURE(encrypted_casefold); #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +EXT4_ATTR_FEATURE(blocksize_gt_pagesize); +#endif static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), @@ -351,6 +354,9 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(fast_commit), #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) ATTR_LIST(encrypted_casefold), +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ATTR_LIST(blocksize_gt_pagesize), #endif NULL, }; From 
91ef18b567dae84c0cea9b996d933c856e366f52 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Nov 2025 11:13:41 +0100 Subject: [PATCH 58/58] ext4: mark inodes without acls in __ext4_iget() Mark inodes without acls with cache_no_acl() in __ext4_iget() so that path lookup can run in RCU mode from the start. This is interesting in particular for the case where the file owner does the lookup because in that case we end up constantly hitting the slow path otherwise. We drop out from the fast path (because ACL state is unknown) but never end up calling check_acl() to cache ACL state. The problem was originally analyzed by Linus and the fix tested by Mateusz, I'm just putting it into mergeable form :). Link: https://lore.kernel.org/all/CAHk-=whSzc75TLLPWskV0xuaHR4tpWBr=LduqhcCFr4kCmme_w@mail.gmail.com Reported-by: Mateusz Guzik Reported-by: Linus Torvalds Signed-off-by: Jan Kara Reviewed-by: Baokun Li Message-ID: <20251125101340.24276-2-jack@suse.cz> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7510fce3d0f0..eeb3ec4c2a9a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5508,7 +5508,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (ret) goto bad_inode; brelse(iloc.bh); - + /* Initialize the "no ACL's" state for the simple cases */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) && !ei->i_file_acl) + cache_no_acl(inode); unlock_new_inode(inode); return inode;