From 0ec73eb3f12350799c4b3fb764225f6e38b42d1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Nov 2025 05:14:09 -0500 Subject: [PATCH 01/33] xfs: add a xfs_groups_to_rfsbs helper Plus a rtgroup wrapper and use that to avoid overflows when converting zone/rtg counts to block counts. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_group.h | 9 +++++++++ fs/xfs/libxfs/xfs_rtgroup.h | 8 ++++++++ fs/xfs/xfs_zone_gc.c | 3 +-- fs/xfs/xfs_zone_space_resv.c | 8 +++----- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h index 4423932a2313..4ae638f1c2c5 100644 --- a/fs/xfs/libxfs/xfs_group.h +++ b/fs/xfs/libxfs/xfs_group.h @@ -98,6 +98,15 @@ xfs_group_max_blocks( return xg->xg_mount->m_groups[xg->xg_type].blocks; } +static inline xfs_rfsblock_t +xfs_groups_to_rfsbs( + struct xfs_mount *mp, + uint32_t nr_groups, + enum xfs_group_type type) +{ + return (xfs_rfsblock_t)mp->m_groups[type].blocks * nr_groups; +} + static inline xfs_fsblock_t xfs_group_start_fsb( struct xfs_group *xg) diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index d4fcf591e63d..a94e925ae67c 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -371,4 +371,12 @@ static inline int xfs_initialize_rtgroups(struct xfs_mount *mp, # define xfs_rtgroup_get_geometry(rtg, rgeo) (-EOPNOTSUPP) #endif /* CONFIG_XFS_RT */ +static inline xfs_rfsblock_t +xfs_rtgs_to_rfsbs( + struct xfs_mount *mp, + uint32_t nr_groups) +{ + return xfs_groups_to_rfsbs(mp, nr_groups, XG_TYPE_RTG); +} + #endif /* __LIBXFS_RTGROUP_H */ diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 4ade54445532..a98939aba7b9 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -181,8 +181,7 @@ xfs_zoned_need_gc( available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); if (available < - mp->m_groups[XG_TYPE_RTG].blocks * - (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) + xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) return true; free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c index 9cd38716fd25..0e54e557a585 100644 --- a/fs/xfs/xfs_zone_space_resv.c +++ b/fs/xfs/xfs_zone_space_resv.c @@ -54,12 +54,10 @@ xfs_zoned_default_resblks( { switch (ctr) { case XC_FREE_RTEXTENTS: - return (uint64_t)XFS_RESERVED_ZONES * - mp->m_groups[XG_TYPE_RTG].blocks + - mp->m_sb.sb_rtreserved; + return xfs_rtgs_to_rfsbs(mp, XFS_RESERVED_ZONES) + + mp->m_sb.sb_rtreserved; case XC_FREE_RTAVAILABLE: - return (uint64_t)XFS_GC_ZONES * - mp->m_groups[XG_TYPE_RTG].blocks; + return xfs_rtgs_to_rfsbs(mp, XFS_GC_ZONES); default: ASSERT(0); return 0; From 204c8f77e8d4a3006f8abe40331f221a597ce608 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:22:53 +0100 Subject: [PATCH 02/33] xfs: don't leak a locked dquot when xfs_dquot_attach_buf fails xfs_qm_quotacheck_dqadjust acquired the dquot through xfs_qm_dqget, which means it owns a reference and holds q_qlock. Both need to be dropped on an error exit. Cc: # v6.13 Fixes: ca378189fdfa ("xfs: convert quotacheck to attach dquot buffers") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_qm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 23ba84ec919a..18a19947bbdb 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1318,7 +1318,7 @@ xfs_qm_quotacheck_dqadjust( error = xfs_dquot_attach_buf(NULL, dqp); if (error) - return error; + goto out_unlock; trace_xfs_dqadjust(dqp); @@ -1348,8 +1348,9 @@ xfs_qm_quotacheck_dqadjust( } dqp->q_flags |= XFS_DQFLAG_DIRTY; +out_unlock: xfs_qm_dqput(dqp); - return 0; + return error; } /* From 005d5ae0c585e11d31df1e721c04f113a8281443 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:22:54 +0100 Subject: [PATCH 03/33] xfs: make qi_dquots a 64-bit value qi_dquots counts all quotas in the file system, which can be up to 3 * UINT_MAX and overflow a 32-bit counter, but can't be negative. Make qi_dquots a uint64_t, and saturate the value to UINT_MAX for userspace reporting. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_qm.h | 2 +- fs/xfs/xfs_quotaops.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 35b64bc3a7a8..e88ed6ad0e65 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -57,7 +57,7 @@ struct xfs_quotainfo { struct xfs_inode *qi_pquotaip; /* project quota inode */ struct xfs_inode *qi_dirip; /* quota metadir */ struct list_lru qi_lru; - int qi_dquots; + uint64_t qi_dquots; struct mutex qi_quotaofflock;/* to serialize quotaoff */ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ uint qi_dqperchunk; /* # ondisk dq in above chunk */ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 4c7f7ce4fd2f..94fbe3d99ec7 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -65,7 +65,7 @@ xfs_fs_get_quota_state( memset(state, 0, sizeof(*state)); if (!XFS_IS_QUOTA_ON(mp)) return 0; - state->s_incoredqs = q->qi_dquots; + state->s_incoredqs = min_t(uint64_t, q->qi_dquots, UINT_MAX); if (XFS_IS_UQUOTA_ON(mp)) state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_UQUOTA_ENFORCED(mp)) From 36cebabde7866c30b71ecd074e3773dbd768a1d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:22:55 +0100 Subject: [PATCH 04/33] xfs: don't treat all radix_tree_insert errors as -EEXIST Return other errors to the caller instead. Note that there really shouldn't be any other errors because the entry is preallocated, but if there were, we'd better return them instead of retrying forever. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_dquot.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 0bd8022e47b4..79e14ee1d7a0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -860,7 +860,6 @@ xfs_qm_dqget_cache_insert( mutex_lock(&qi->qi_tree_lock); error = radix_tree_insert(tree, id, dqp); if (unlikely(error)) { - /* Duplicate found! Caller must try again. */ trace_xfs_dqget_dup(dqp); goto out_unlock; } @@ -935,13 +934,16 @@ restart: error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); if (error) { - /* - * Duplicate found. Just throw away the new dquot and start - * over. - */ xfs_qm_dqdestroy(dqp); - XFS_STATS_INC(mp, xs_qm_dquot_dups); - goto restart; + if (error == -EEXIST) { + /* + * Duplicate found. Just throw away the new dquot and + * start over. + */ + XFS_STATS_INC(mp, xs_qm_dquot_dups); + goto restart; + } + return error; } trace_xfs_dqget_miss(dqp); @@ -1060,13 +1062,16 @@ restart: error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); if (error) { - /* - * Duplicate found. Just throw away the new dquot and start - * over. - */ xfs_qm_dqdestroy(dqp); - XFS_STATS_INC(mp, xs_qm_dquot_dups); - goto restart; + if (error == -EEXIST) { + /* + * Duplicate found. Just throw away the new dquot and + * start over. + */ + XFS_STATS_INC(mp, xs_qm_dquot_dups); + goto restart; + } + return error; } dqret: From 6129b088e1f10938b86f44948ad698b39dd19faa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:22:56 +0100 Subject: [PATCH 05/33] xfs: remove xfs_dqunlock and friends There's really no point in wrapping the basic mutex operations. Remove the wrapper to ease lock analysis annotations and make the code a litte easier to read. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/quota.c | 4 ++-- fs/xfs/scrub/quota_repair.c | 6 +++--- fs/xfs/scrub/quotacheck_repair.c | 8 ++++---- fs/xfs/xfs_dquot.c | 14 +++++++------- fs/xfs/xfs_dquot.h | 19 ++----------------- fs/xfs/xfs_dquot_item.c | 6 +++--- fs/xfs/xfs_qm.c | 30 +++++++++++++++--------------- fs/xfs/xfs_qm_syscalls.c | 4 ++-- fs/xfs/xfs_trans_dquot.c | 18 +++++++++--------- 9 files changed, 47 insertions(+), 62 deletions(-) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 58d6d4ed2853..c78cf9f96cf6 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -158,9 +158,9 @@ xchk_quota_item( * However, dqiterate gave us a locked dquot, so drop the dquot lock to * get the ILOCK. */ - xfs_dqunlock(dq); + mutex_unlock(&dq->q_qlock); xchk_ilock(sc, XFS_ILOCK_SHARED); - xfs_dqlock(dq); + mutex_lock(&dq->q_qlock); /* * Except for the root dquot, the actual dquot we got must either have diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index 8f4c8d41f308..8c89c6cc2950 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -187,9 +187,9 @@ xrep_quota_item( * dqiterate gave us a locked dquot, so drop the dquot lock to get the * ILOCK_EXCL. */ - xfs_dqunlock(dq); + mutex_unlock(&dq->q_qlock); xchk_ilock(sc, XFS_ILOCK_EXCL); - xfs_dqlock(dq); + mutex_lock(&dq->q_qlock); error = xrep_quota_item_bmap(sc, dq, &dirty); xchk_iunlock(sc, XFS_ILOCK_EXCL); @@ -258,7 +258,7 @@ xrep_quota_item( } xfs_trans_log_dquot(sc->tp, dq); error = xfs_trans_roll(&sc->tp); - xfs_dqlock(dq); + mutex_lock(&dq->q_qlock); return error; } diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c index dd8554c755b5..415314911499 100644 --- a/fs/xfs/scrub/quotacheck_repair.c +++ b/fs/xfs/scrub/quotacheck_repair.c @@ -53,9 +53,9 @@ xqcheck_commit_dquot( int error = 0; /* Unlock the dquot just long enough to allocate a transaction. */ - xfs_dqunlock(dq); + mutex_unlock(&dq->q_qlock); error = xchk_trans_alloc(xqc->sc, 0); - xfs_dqlock(dq); + mutex_lock(&dq->q_qlock); if (error) return error; @@ -122,7 +122,7 @@ xqcheck_commit_dquot( * dquot). */ error = xrep_trans_commit(xqc->sc); - xfs_dqlock(dq); + mutex_lock(&dq->q_qlock); return error; out_unlock: @@ -131,7 +131,7 @@ out_cancel: xchk_trans_cancel(xqc->sc); /* Re-lock the dquot so the caller can put the reference. */ - xfs_dqlock(dq); + mutex_lock(&dq->q_qlock); return error; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 79e14ee1d7a0..c2326cee7fae 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -31,7 +31,7 @@ * * ip->i_lock * qi->qi_tree_lock - * dquot->q_qlock (xfs_dqlock() and friends) + * dquot->q_qlock * dquot->q_flush (xfs_dqflock() and friends) * qi->qi_lru_lock * @@ -816,9 +816,9 @@ restart: return NULL; } - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); if (dqp->q_flags & XFS_DQFLAG_FREEING) { - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_freeing(dqp); delay(1); @@ -865,7 +865,7 @@ xfs_qm_dqget_cache_insert( } /* Return a locked dquot to the caller, with a reference taken. */ - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); dqp->q_nrefs = 1; qi->qi_dquots++; @@ -1051,7 +1051,7 @@ restart: if (dqp1) { xfs_qm_dqdestroy(dqp); dqp = dqp1; - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); goto dqret; } } else { @@ -1136,7 +1136,7 @@ xfs_qm_dqput( if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); } /* @@ -1152,7 +1152,7 @@ xfs_qm_dqrele( trace_xfs_dqrele(dqp); - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); /* * We don't care to flush it if the dquot is dirty here. * That will create stutters that we want to avoid. diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 61217adf5ba5..10c39b8cdd03 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -121,21 +121,6 @@ static inline void xfs_dqfunlock(struct xfs_dquot *dqp) complete(&dqp->q_flush); } -static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp) -{ - return mutex_trylock(&dqp->q_qlock); -} - -static inline void xfs_dqlock(struct xfs_dquot *dqp) -{ - mutex_lock(&dqp->q_qlock); -} - -static inline void xfs_dqunlock(struct xfs_dquot *dqp) -{ - mutex_unlock(&dqp->q_qlock); -} - static inline int xfs_dquot_type(const struct xfs_dquot *dqp) { @@ -246,9 +231,9 @@ void xfs_dquot_detach_buf(struct xfs_dquot *dqp); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); dqp->q_nrefs++; - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); return dqp; } diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 271b195ebb93..b374cd9f1900 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -132,7 +132,7 @@ xfs_qm_dquot_logitem_push( if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - if (!xfs_dqlock_nowait(dqp)) + if (!mutex_trylock(&dqp->q_qlock)) return XFS_ITEM_LOCKED; /* @@ -177,7 +177,7 @@ xfs_qm_dquot_logitem_push( out_relock_ail: spin_lock(&lip->li_ailp->ail_lock); out_unlock: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); return rval; } @@ -195,7 +195,7 @@ xfs_qm_dquot_logitem_release( * transaction layer, within trans_commit. Hence, no LI_HOLD flag * for the logitem. */ - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); } STATIC void diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 18a19947bbdb..3e88bea9a465 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -128,7 +128,7 @@ xfs_qm_dqpurge( struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; int error = -EAGAIN; - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0) goto out_unlock; @@ -177,7 +177,7 @@ out_funlock: !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id); qi->qi_dquots--; @@ -194,7 +194,7 @@ out_funlock: return 0; out_unlock: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); return error; } @@ -329,7 +329,7 @@ xfs_qm_dqattach_one( * that the dquot returned is the one that should go in the inode. */ *IO_idqpp = dqp; - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); return 0; } @@ -468,7 +468,7 @@ xfs_qm_dquot_isolate( struct xfs_qm_isolate *isol = arg; enum lru_status ret = LRU_SKIP; - if (!xfs_dqlock_nowait(dqp)) + if (!mutex_trylock(&dqp->q_qlock)) goto out_miss_busy; /* @@ -494,7 +494,7 @@ xfs_qm_dquot_isolate( * the freelist and try again. */ if (dqp->q_nrefs) { - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); @@ -519,7 +519,7 @@ xfs_qm_dquot_isolate( * Prevent lookups now that we are past the point of no return. */ dqp->q_flags |= XFS_DQFLAG_FREEING; - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); ASSERT(dqp->q_nrefs == 0); list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); @@ -529,7 +529,7 @@ xfs_qm_dquot_isolate( return LRU_REMOVED; out_miss_unlock: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); @@ -1467,7 +1467,7 @@ xfs_qm_flush_one( struct xfs_buf *bp = NULL; int error = 0; - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); if (dqp->q_flags & XFS_DQFLAG_FREEING) goto out_unlock; if (!XFS_DQ_IS_DIRTY(dqp)) @@ -1489,7 +1489,7 @@ xfs_qm_flush_one( xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); out_unlock: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); return error; } @@ -1952,7 +1952,7 @@ xfs_qm_vop_dqalloc( /* * Get the ilock in the right order. */ - xfs_dqunlock(uq); + mutex_unlock(&uq->q_qlock); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { @@ -1974,7 +1974,7 @@ xfs_qm_vop_dqalloc( ASSERT(error != -ENOENT); goto error_rele; } - xfs_dqunlock(gq); + mutex_unlock(&gq->q_qlock); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { @@ -1992,7 +1992,7 @@ xfs_qm_vop_dqalloc( ASSERT(error != -ENOENT); goto error_rele; } - xfs_dqunlock(pq); + mutex_unlock(&pq->q_qlock); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { @@ -2079,7 +2079,7 @@ xfs_qm_vop_chown( * back now. */ tp->t_flags |= XFS_TRANS_DIRTY; - xfs_dqlock(prevdq); + mutex_lock(&prevdq->q_qlock); if (isrt) { ASSERT(prevdq->q_rtb.reserved >= ip->i_delayed_blks); prevdq->q_rtb.reserved -= ip->i_delayed_blks; @@ -2087,7 +2087,7 @@ xfs_qm_vop_chown( ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); prevdq->q_blk.reserved -= ip->i_delayed_blks; } - xfs_dqunlock(prevdq); + mutex_unlock(&prevdq->q_qlock); /* * Take an extra reference, because the inode is going to keep diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 0c78f30fa4a3..59ef382900fe 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -303,13 +303,13 @@ xfs_qm_scall_setqlim( } defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp); if (error) goto out_rele; - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); xfs_trans_dqjoin(tp, dqp); /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 765456bf3428..c842ce06acd6 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -393,7 +393,7 @@ xfs_trans_dqlockedjoin( unsigned int i; ASSERT(q[0].qt_dquot != NULL); if (q[1].qt_dquot == NULL) { - xfs_dqlock(q[0].qt_dquot); + mutex_lock(&q[0].qt_dquot->q_qlock); xfs_trans_dqjoin(tp, q[0].qt_dquot); } else if (q[2].qt_dquot == NULL) { xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot); @@ -693,7 +693,7 @@ xfs_trans_unreserve_and_mod_dquots( locked = already_locked; if (qtrx->qt_blk_res) { if (!locked) { - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); locked = true; } dqp->q_blk.reserved -= @@ -701,7 +701,7 @@ xfs_trans_unreserve_and_mod_dquots( } if (qtrx->qt_ino_res) { if (!locked) { - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); locked = true; } dqp->q_ino.reserved -= @@ -710,14 +710,14 @@ xfs_trans_unreserve_and_mod_dquots( if (qtrx->qt_rtblk_res) { if (!locked) { - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); locked = true; } dqp->q_rtb.reserved -= (xfs_qcnt_t)qtrx->qt_rtblk_res; } if (locked && !already_locked) - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); } } @@ -820,7 +820,7 @@ xfs_trans_dqresv( struct xfs_dquot_res *blkres; struct xfs_quota_limits *qlim; - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); @@ -887,16 +887,16 @@ xfs_trans_dqresv( XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count)) goto error_corrupt; - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); return 0; error_return: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ) return -ENOSPC; return -EDQUOT; error_corrupt: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK); return -EFSCORRUPTED; From 0c5e80bd579f7bec3704bad6c1f72b13b0d73b53 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:22:57 +0100 Subject: [PATCH 06/33] xfs: use a lockref for the xfs_dquot reference count The xfs_dquot structure currently uses the anti-pattern of using the in-object lock that protects the content to also serialize reference count updates for the structure, leading to a cumbersome free path. This is partially papered over by the fact that we never free the dquot directly but always through the LRU. Switch to use a lockref instead and move the reference counter manipulations out of q_qlock. To make this work, xfs_qm_flush_one and xfs_qm_flush_one are converted to acquire a dquot reference while flushing to integrate with the lockref "get if not dead" scheme. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_quota_defs.h | 4 +-- fs/xfs/xfs_dquot.c | 17 ++++++------ fs/xfs/xfs_dquot.h | 6 ++-- fs/xfs/xfs_qm.c | 50 ++++++++++++++++------------------ fs/xfs/xfs_trace.h | 2 +- 5 files changed, 37 insertions(+), 42 deletions(-) diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 763d941a8420..551d7ae46c5c 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -29,11 +29,9 @@ typedef uint8_t xfs_dqtype_t; * flags for q_flags field in the dquot. */ #define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */ -#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */ #define XFS_DQFLAG_STRINGS \ - { XFS_DQFLAG_DIRTY, "DIRTY" }, \ - { XFS_DQFLAG_FREEING, "FREEING" } + { XFS_DQFLAG_DIRTY, "DIRTY" } /* * We have the possibility of all three quota types being active at once, and diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c2326cee7fae..34c325524ab9 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -816,20 +816,17 @@ restart: return NULL; } - mutex_lock(&dqp->q_qlock); - if (dqp->q_flags & XFS_DQFLAG_FREEING) { - mutex_unlock(&dqp->q_qlock); + if (!lockref_get_not_dead(&dqp->q_lockref)) { mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_freeing(dqp); delay(1); goto restart; } - - dqp->q_nrefs++; mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_hit(dqp); XFS_STATS_INC(mp, xs_qm_dqcachehits); + mutex_lock(&dqp->q_qlock); return dqp; } @@ -866,7 +863,7 @@ xfs_qm_dqget_cache_insert( /* Return a locked dquot to the caller, with a reference taken. */ mutex_lock(&dqp->q_qlock); - dqp->q_nrefs = 1; + lockref_init(&dqp->q_lockref); qi->qi_dquots++; out_unlock: @@ -1124,18 +1121,22 @@ void xfs_qm_dqput( struct xfs_dquot *dqp) { - ASSERT(dqp->q_nrefs > 0); ASSERT(XFS_DQ_IS_LOCKED(dqp)); trace_xfs_dqput(dqp); - if (--dqp->q_nrefs == 0) { + if (lockref_put_or_lock(&dqp->q_lockref)) + goto out_unlock; + + if (!--dqp->q_lockref.count) { struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; trace_xfs_dqput_free(dqp); if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } + spin_unlock(&dqp->q_lockref.lock); +out_unlock: mutex_unlock(&dqp->q_qlock); } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 10c39b8cdd03..c56fbc39d089 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -71,7 +71,7 @@ struct xfs_dquot { xfs_dqtype_t q_type; uint16_t q_flags; xfs_dqid_t q_id; - uint q_nrefs; + struct lockref q_lockref; int q_bufoffset; xfs_daddr_t q_blkno; xfs_fileoff_t q_fileoffset; @@ -231,9 +231,7 @@ void xfs_dquot_detach_buf(struct xfs_dquot *dqp); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { - mutex_lock(&dqp->q_qlock); - dqp->q_nrefs++; - mutex_unlock(&dqp->q_qlock); + lockref_get(&dqp->q_lockref); return dqp; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 3e88bea9a465..f98f9fdac0b5 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -126,14 +126,16 @@ xfs_qm_dqpurge( void *data) { struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; - int error = -EAGAIN; + + spin_lock(&dqp->q_lockref.lock); + if (dqp->q_lockref.count > 0 || __lockref_is_dead(&dqp->q_lockref)) { + spin_unlock(&dqp->q_lockref.lock); + return -EAGAIN; + } + lockref_mark_dead(&dqp->q_lockref); + spin_unlock(&dqp->q_lockref.lock); mutex_lock(&dqp->q_qlock); - if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0) - goto out_unlock; - - dqp->q_flags |= XFS_DQFLAG_FREEING; - xfs_qm_dqunpin_wait(dqp); xfs_dqflock(dqp); @@ -144,6 +146,7 @@ xfs_qm_dqpurge( */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; + int error; /* * We don't care about getting disk errors here. We need @@ -151,9 +154,9 @@ xfs_qm_dqpurge( */ error = xfs_dquot_use_attached_buf(dqp, &bp); if (error == -EAGAIN) { - xfs_dqfunlock(dqp); - dqp->q_flags &= ~XFS_DQFLAG_FREEING; - goto out_unlock; + /* resurrect the refcount from the dead. */ + dqp->q_lockref.count = 0; + goto out_funlock; } if (!bp) goto out_funlock; @@ -192,10 +195,6 @@ out_funlock: xfs_qm_dqdestroy(dqp); return 0; - -out_unlock: - mutex_unlock(&dqp->q_qlock); - return error; } /* @@ -468,7 +467,7 @@ xfs_qm_dquot_isolate( struct xfs_qm_isolate *isol = arg; enum lru_status ret = LRU_SKIP; - if (!mutex_trylock(&dqp->q_qlock)) + if (!spin_trylock(&dqp->q_lockref.lock)) goto out_miss_busy; /* @@ -476,7 +475,7 @@ xfs_qm_dquot_isolate( * from the LRU, leave it for the freeing task to complete the freeing * process rather than risk it being free from under us here. */ - if (dqp->q_flags & XFS_DQFLAG_FREEING) + if (__lockref_is_dead(&dqp->q_lockref)) goto out_miss_unlock; /* @@ -485,16 +484,15 @@ xfs_qm_dquot_isolate( * again. */ ret = LRU_ROTATE; - if (XFS_DQ_IS_DIRTY(dqp) || atomic_read(&dqp->q_pincount) > 0) { + if (XFS_DQ_IS_DIRTY(dqp) || atomic_read(&dqp->q_pincount) > 0) goto out_miss_unlock; - } /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. */ - if (dqp->q_nrefs) { - mutex_unlock(&dqp->q_qlock); + if (dqp->q_lockref.count) { + spin_unlock(&dqp->q_lockref.lock); XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); @@ -518,10 +516,9 @@ xfs_qm_dquot_isolate( /* * Prevent lookups now that we are past the point of no return. */ - dqp->q_flags |= XFS_DQFLAG_FREEING; - mutex_unlock(&dqp->q_qlock); + lockref_mark_dead(&dqp->q_lockref); + spin_unlock(&dqp->q_lockref.lock); - ASSERT(dqp->q_nrefs == 0); list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); @@ -529,7 +526,7 @@ xfs_qm_dquot_isolate( return LRU_REMOVED; out_miss_unlock: - mutex_unlock(&dqp->q_qlock); + spin_unlock(&dqp->q_lockref.lock); out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); @@ -1467,9 +1464,10 @@ xfs_qm_flush_one( struct xfs_buf *bp = NULL; int error = 0; + if (!lockref_get_not_dead(&dqp->q_lockref)) + return 0; + mutex_lock(&dqp->q_qlock); - if (dqp->q_flags & XFS_DQFLAG_FREEING) - goto out_unlock; if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; @@ -1489,7 +1487,7 @@ xfs_qm_flush_one( xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); out_unlock: - mutex_unlock(&dqp->q_qlock); + xfs_qm_dqput(dqp); return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 79b8641880ab..46d21eb11ccb 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1350,7 +1350,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, __entry->id = dqp->q_id; __entry->type = dqp->q_type; __entry->flags = dqp->q_flags; - __entry->nrefs = dqp->q_nrefs; + __entry->nrefs = data_race(dqp->q_lockref.count); __entry->res_bcount = dqp->q_blk.reserved; __entry->res_rtbcount = dqp->q_rtb.reserved; From 6b6e6e75211687c61c5660f65b4155cd0eb7e187 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:22:58 +0100 Subject: [PATCH 07/33] xfs: remove xfs_qm_dqput and optimize dropping dquot references With the new lockref-based dquot reference counting, there is no need to hold q_qlock for dropping the reference. Make xfs_qm_dqrele the main function to drop dquot references without taking q_qlock and convert all callers of xfs_qm_dqput to unlock q_qlock and call xfs_qm_dqrele instead. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/quota.c | 3 +- fs/xfs/scrub/quota_repair.c | 3 +- fs/xfs/scrub/quotacheck.c | 6 ++-- fs/xfs/scrub/quotacheck_repair.c | 6 ++-- fs/xfs/xfs_dquot.c | 53 ++++++++------------------------ fs/xfs/xfs_dquot.h | 1 - fs/xfs/xfs_qm.c | 6 ++-- fs/xfs/xfs_qm_bhv.c | 3 +- fs/xfs/xfs_qm_syscalls.c | 6 ++-- fs/xfs/xfs_trace.h | 3 +- 10 files changed, 36 insertions(+), 54 deletions(-) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index c78cf9f96cf6..cfcd0fb66845 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -330,7 +330,8 @@ xchk_quota( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xchk_quota_item(&sqi, dq); - xfs_qm_dqput(dq); + mutex_unlock(&dq->q_qlock); + xfs_qm_dqrele(dq); if (error) break; } diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index 8c89c6cc2950..d4ce9e56d3ef 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -513,7 +513,8 @@ xrep_quota_problems( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xrep_quota_item(&rqi, dq); - xfs_qm_dqput(dq); + mutex_unlock(&dq->q_qlock); + xfs_qm_dqrele(dq); if (error) break; } diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c index e4105aaafe84..180449f654f6 100644 --- a/fs/xfs/scrub/quotacheck.c +++ b/fs/xfs/scrub/quotacheck.c @@ -636,7 +636,8 @@ xqcheck_walk_observations( return error; error = xqcheck_compare_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + mutex_unlock(&dq->q_qlock); + xfs_qm_dqrele(dq); if (error) return error; @@ -674,7 +675,8 @@ xqcheck_compare_dqtype( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xqcheck_compare_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + mutex_unlock(&dq->q_qlock); + xfs_qm_dqrele(dq); if (error) break; } diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c index 415314911499..11153e24b565 100644 --- a/fs/xfs/scrub/quotacheck_repair.c +++ b/fs/xfs/scrub/quotacheck_repair.c @@ -156,7 +156,8 @@ xqcheck_commit_dqtype( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xqcheck_commit_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + mutex_unlock(&dq->q_qlock); + xfs_qm_dqrele(dq); if (error) break; } @@ -187,7 +188,8 @@ xqcheck_commit_dqtype( return error; error = xqcheck_commit_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + mutex_unlock(&dq->q_qlock); + xfs_qm_dqrele(dq); if (error) return error; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 34c325524ab9..52c521a1402d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1105,44 +1105,15 @@ xfs_qm_dqget_next( return 0; } - xfs_qm_dqput(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); } return error; } /* - * Release a reference to the dquot (decrement ref-count) and unlock it. - * - * If there is a group quota attached to this dquot, carefully release that - * too without tripping over deadlocks'n'stuff. - */ -void -xfs_qm_dqput( - struct xfs_dquot *dqp) -{ - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - - trace_xfs_dqput(dqp); - - if (lockref_put_or_lock(&dqp->q_lockref)) - goto out_unlock; - - if (!--dqp->q_lockref.count) { - struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; - trace_xfs_dqput_free(dqp); - - if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) - XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); - } - spin_unlock(&dqp->q_lockref.lock); -out_unlock: - mutex_unlock(&dqp->q_qlock); -} - -/* - * Release a dquot. Flush it if dirty, then dqput() it. - * dquot must not be locked. + * Release a reference to the dquot. */ void xfs_qm_dqrele( @@ -1153,14 +1124,16 @@ xfs_qm_dqrele( trace_xfs_dqrele(dqp); - mutex_lock(&dqp->q_qlock); - /* - * We don't care to flush it if the dquot is dirty here. - * That will create stutters that we want to avoid. - * Instead we do a delayed write when we try to reclaim - * a dirty dquot. Also xfs_sync will take part of the burden... - */ - xfs_qm_dqput(dqp); + if (lockref_put_or_lock(&dqp->q_lockref)) + return; + if (!--dqp->q_lockref.count) { + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; + + trace_xfs_dqrele_free(dqp); + if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) + XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); + } + spin_unlock(&dqp->q_lockref.lock); } /* diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index c56fbc39d089..bbb824adca82 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -218,7 +218,6 @@ int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, int xfs_qm_dqget_uncached(struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_dquot **dqpp); -void xfs_qm_dqput(struct xfs_dquot *dqp); void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); void xfs_dqlockn(struct xfs_dqtrx *q); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f98f9fdac0b5..5e6aefb17f19 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1346,7 +1346,8 @@ xfs_qm_quotacheck_dqadjust( dqp->q_flags |= XFS_DQFLAG_DIRTY; out_unlock: - xfs_qm_dqput(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); return error; } @@ -1487,7 +1488,8 @@ xfs_qm_flush_one( xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); out_unlock: - xfs_qm_dqput(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); return error; } diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 245d754f382a..e5a30b12253c 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -74,7 +74,8 @@ xfs_qm_statvfs( if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) { xfs_fill_statvfs_from_dquot(statp, ip, dqp); - xfs_qm_dqput(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); } } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 59ef382900fe..441f9806cddb 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -467,7 +467,8 @@ xfs_qm_scall_getquota( xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); out_put: - xfs_qm_dqput(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); return error; } @@ -497,7 +498,8 @@ xfs_qm_scall_getquota_next( *id = dqp->q_id; xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); + mutex_unlock(&dqp->q_qlock); - xfs_qm_dqput(dqp); + xfs_qm_dqrele(dqp); return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 46d21eb11ccb..fccc032b3c6c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1409,9 +1409,8 @@ DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); DEFINE_DQUOT_EVENT(xfs_dqget_freeing); DEFINE_DQUOT_EVENT(xfs_dqget_dup); -DEFINE_DQUOT_EVENT(xfs_dqput); -DEFINE_DQUOT_EVENT(xfs_dqput_free); DEFINE_DQUOT_EVENT(xfs_dqrele); +DEFINE_DQUOT_EVENT(xfs_dqrele_free); DEFINE_DQUOT_EVENT(xfs_dqflush); DEFINE_DQUOT_EVENT(xfs_dqflush_force); DEFINE_DQUOT_EVENT(xfs_dqflush_done); From 0494f04643de72e13acd556e402cc4edc6169950 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:22:59 +0100 Subject: [PATCH 08/33] xfs: consolidate q_qlock locking in xfs_qm_dqget and xfs_qm_dqget_inode Move taking q_qlock from the cache lookup / insert helpers into the main functions and do it just before returning to the caller. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_dquot.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 52c521a1402d..8b4434e6df09 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -826,15 +826,13 @@ restart: trace_xfs_dqget_hit(dqp); XFS_STATS_INC(mp, xs_qm_dqcachehits); - mutex_lock(&dqp->q_qlock); return dqp; } /* * Try to insert a new dquot into the in-core cache. If an error occurs the * caller should throw away the dquot and start over. Otherwise, the dquot - * is returned locked (and held by the cache) as if there had been a cache - * hit. + * is returned (and held by the cache) as if there had been a cache hit. * * The insert needs to be done under memalloc_nofs context because the radix * tree can do memory allocation during insert. The qi->qi_tree_lock is taken in @@ -861,8 +859,6 @@ xfs_qm_dqget_cache_insert( goto out_unlock; } - /* Return a locked dquot to the caller, with a reference taken. */ - mutex_lock(&dqp->q_qlock); lockref_init(&dqp->q_lockref); qi->qi_dquots++; @@ -920,10 +916,8 @@ xfs_qm_dqget( restart: dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); - if (dqp) { - *O_dqpp = dqp; - return 0; - } + if (dqp) + goto found; error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); if (error) @@ -944,7 +938,9 @@ restart: } trace_xfs_dqget_miss(dqp); +found: *O_dqpp = dqp; + mutex_lock(&dqp->q_qlock); return 0; } @@ -1019,10 +1015,8 @@ xfs_qm_dqget_inode( restart: dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); - if (dqp) { - *O_dqpp = dqp; - return 0; - } + if (dqp) + goto found; /* * Dquot cache miss. We don't want to keep the inode lock across @@ -1048,7 +1042,6 @@ restart: if (dqp1) { xfs_qm_dqdestroy(dqp); dqp = dqp1; - mutex_lock(&dqp->q_qlock); goto dqret; } } else { @@ -1074,7 +1067,9 @@ restart: dqret: xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); trace_xfs_dqget_miss(dqp); +found: *O_dqpp = dqp; + mutex_lock(&dqp->q_qlock); return 0; } From d0f93c0d7c9dc8f7fdbd1ce3f5d3bfd8e109da65 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:23:00 +0100 Subject: [PATCH 09/33] xfs: xfs_qm_dqattach_one is never called with a non-NULL *IO_idqpp The caller already checks that, so replace the handling of this case with an assert that it does not happen. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_qm.c | 13 +------------ fs/xfs/xfs_trace.h | 1 - 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5e6aefb17f19..b571eff51694 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -297,19 +297,8 @@ xfs_qm_dqattach_one( struct xfs_dquot *dqp; int error; + ASSERT(!*IO_idqpp); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - error = 0; - - /* - * See if we already have it in the inode itself. IO_idqpp is &i_udquot - * or &i_gdquot. This made the code look weird, but made the logic a lot - * simpler. - */ - dqp = *IO_idqpp; - if (dqp) { - trace_xfs_dqattach_found(dqp); - return 0; - } /* * Find the dquot from somewhere. This bumps the reference count of diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index fccc032b3c6c..90582ff7c2cf 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1399,7 +1399,6 @@ DEFINE_DQUOT_EVENT(xfs_dqadjust); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); -DEFINE_DQUOT_EVENT(xfs_dqattach_found); DEFINE_DQUOT_EVENT(xfs_dqattach_get); DEFINE_DQUOT_EVENT(xfs_dqalloc); DEFINE_DQUOT_EVENT(xfs_dqtobp_read); From bf5066e169eed0b7b705e3261a05db80f1b8358e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:23:01 +0100 Subject: [PATCH 10/33] xfs: fold xfs_qm_dqattach_one into xfs_qm_dqget_inode xfs_qm_dqattach_one is a thin wrapper around xfs_qm_dqget_inode. Move the extra asserts into xfs_qm_dqget_inode, drop the unneeded q_qlock roundtrip and merge the two functions. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_dquot.c | 9 ++++++--- fs/xfs/xfs_qm.c | 40 +++------------------------------------- 2 files changed, 9 insertions(+), 40 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 8b4434e6df09..862fec529512 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -994,7 +994,7 @@ xfs_qm_dqget_inode( struct xfs_inode *ip, xfs_dqtype_t type, bool can_alloc, - struct xfs_dquot **O_dqpp) + struct xfs_dquot **dqpp) { struct xfs_mount *mp = ip->i_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -1003,6 +1003,9 @@ xfs_qm_dqget_inode( xfs_dqid_t id; int error; + ASSERT(!*dqpp); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqget_checks(mp, type); if (error) return error; @@ -1068,8 +1071,8 @@ dqret: xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); trace_xfs_dqget_miss(dqp); found: - *O_dqpp = dqp; - mutex_lock(&dqp->q_qlock); + trace_xfs_dqattach_get(dqp); + *dqpp = dqp; return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index b571eff51694..f3f7947bc4ed 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -287,40 +287,6 @@ xfs_qm_unmount_quotas( xfs_qm_destroy_quotainos(mp->m_quotainfo); } -STATIC int -xfs_qm_dqattach_one( - struct xfs_inode *ip, - xfs_dqtype_t type, - bool doalloc, - struct xfs_dquot **IO_idqpp) -{ - struct xfs_dquot *dqp; - int error; - - ASSERT(!*IO_idqpp); - xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - - /* - * Find the dquot from somewhere. This bumps the reference count of - * dquot and returns it locked. This can return ENOENT if dquot didn't - * exist on disk and we didn't ask it to allocate; ESRCH if quotas got - * turned off suddenly. - */ - error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp); - if (error) - return error; - - trace_xfs_dqattach_get(dqp); - - /* - * dqget may have dropped and re-acquired the ilock, but it guarantees - * that the dquot returned is the one that should go in the inode. - */ - *IO_idqpp = dqp; - mutex_unlock(&dqp->q_qlock); - return 0; -} - static bool xfs_qm_need_dqattach( struct xfs_inode *ip) @@ -360,7 +326,7 @@ xfs_qm_dqattach_locked( ASSERT(!xfs_is_metadir_inode(ip)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { - error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER, + error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_USER, doalloc, &ip->i_udquot); if (error) goto done; @@ -368,7 +334,7 @@ xfs_qm_dqattach_locked( } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { - error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP, + error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; @@ -376,7 +342,7 @@ xfs_qm_dqattach_locked( } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ, + error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; From 55c1bc3eb9d0f39ea4c078b339a6228f5f62584b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:23:02 +0100 Subject: [PATCH 11/33] xfs: return the dquot unlocked from xfs_qm_dqget There is no reason to lock the dquot in xfs_qm_dqget, which just acquires a reference. Move the locking to the callers, or remove it in cases where the caller instantly unlocks the dquot. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/dqiterate.c | 1 + fs/xfs/scrub/quotacheck.c | 1 + fs/xfs/scrub/quotacheck_repair.c | 1 + fs/xfs/xfs_dquot.c | 4 ++-- fs/xfs/xfs_qm.c | 4 +--- fs/xfs/xfs_qm_bhv.c | 1 + fs/xfs/xfs_qm_syscalls.c | 2 ++ 7 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c index 20c4daedd48d..6f1185afbf39 100644 --- a/fs/xfs/scrub/dqiterate.c +++ b/fs/xfs/scrub/dqiterate.c @@ -205,6 +205,7 @@ xchk_dquot_iter( if (error) return error; + mutex_lock(&dq->q_qlock); cursor->id = dq->q_id + 1; *dqpp = dq; return 1; diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c index 180449f654f6..bef63f19cd87 100644 --- a/fs/xfs/scrub/quotacheck.c +++ b/fs/xfs/scrub/quotacheck.c @@ -635,6 +635,7 @@ xqcheck_walk_observations( if (error) return error; + mutex_lock(&dq->q_qlock); error = xqcheck_compare_dquot(xqc, dqtype, dq); mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c index 11153e24b565..3b23219d43ed 100644 --- a/fs/xfs/scrub/quotacheck_repair.c +++ b/fs/xfs/scrub/quotacheck_repair.c @@ -187,6 +187,7 @@ xqcheck_commit_dqtype( if (error) return error; + mutex_lock(&dq->q_qlock); error = xqcheck_commit_dquot(xqc, dqtype, dq); mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 862fec529512..1c9c17892874 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -895,7 +895,7 @@ xfs_qm_dqget_checks( /* * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a - * locked dquot, doing an allocation (if requested) as needed. + * dquot, doing an allocation (if requested) as needed. */ int xfs_qm_dqget( @@ -940,7 +940,6 @@ restart: trace_xfs_dqget_miss(dqp); found: *O_dqpp = dqp; - mutex_lock(&dqp->q_qlock); return 0; } @@ -1098,6 +1097,7 @@ xfs_qm_dqget_next( else if (error != 0) break; + mutex_lock(&dqp->q_qlock); if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) { *dqpp = dqp; return 0; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f3f7947bc4ed..a81b8b7a4e4f 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1268,6 +1268,7 @@ xfs_qm_quotacheck_dqadjust( return error; } + mutex_lock(&dqp->q_qlock); error = xfs_dquot_attach_buf(NULL, dqp); if (error) goto out_unlock; @@ -1907,7 +1908,6 @@ xfs_qm_vop_dqalloc( /* * Get the ilock in the right order. */ - mutex_unlock(&uq->q_qlock); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { @@ -1929,7 +1929,6 @@ xfs_qm_vop_dqalloc( ASSERT(error != -ENOENT); goto error_rele; } - mutex_unlock(&gq->q_qlock); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { @@ -1947,7 +1946,6 @@ xfs_qm_vop_dqalloc( ASSERT(error != -ENOENT); goto error_rele; } - mutex_unlock(&pq->q_qlock); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index e5a30b12253c..edc0aef3cf34 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -73,6 +73,7 @@ xfs_qm_statvfs( struct xfs_dquot *dqp; if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) { + mutex_lock(&dqp->q_qlock); xfs_fill_statvfs_from_dquot(statp, ip, dqp); mutex_unlock(&dqp->q_qlock); xfs_qm_dqrele(dqp); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 441f9806cddb..6c8924780d7a 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -302,6 +302,7 @@ xfs_qm_scall_setqlim( return error; } + mutex_lock(&dqp->q_qlock); defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); mutex_unlock(&dqp->q_qlock); @@ -459,6 +460,7 @@ xfs_qm_scall_getquota( * If everything's NULL, this dquot doesn't quite exist as far as * our utility programs are concerned. */ + mutex_lock(&dqp->q_qlock); if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { error = -ENOENT; goto out_put; From e85e74e4c9a64993ec5f296719705a32feca93c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:23:03 +0100 Subject: [PATCH 12/33] xfs: remove q_qlock locking in xfs_qm_scall_setqlim q_type can't change for an existing dquot, so there is no need for the locking here. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_qm_syscalls.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 6c8924780d7a..022e2179c06b 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -302,9 +302,7 @@ xfs_qm_scall_setqlim( return error; } - mutex_lock(&dqp->q_qlock); defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); - mutex_unlock(&dqp->q_qlock); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp); if (error) From a536bf9bec6ac461ec48bc8627545d56e4e71e9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:23:04 +0100 Subject: [PATCH 13/33] xfs: push q_qlock acquisition from xchk_dquot_iter to the callers. There is no good reason to take q_qlock in xchk_dquot_iter, which just provides a reference to the dquot. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/dqiterate.c | 1 - fs/xfs/scrub/quota.c | 1 + fs/xfs/scrub/quota_repair.c | 1 + fs/xfs/scrub/quotacheck.c | 1 + fs/xfs/scrub/quotacheck_repair.c | 1 + 5 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c index 6f1185afbf39..20c4daedd48d 100644 --- a/fs/xfs/scrub/dqiterate.c +++ b/fs/xfs/scrub/dqiterate.c @@ -205,7 +205,6 @@ xchk_dquot_iter( if (error) return error; - mutex_lock(&dq->q_qlock); cursor->id = dq->q_id + 1; *dqpp = dq; return 1; diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index cfcd0fb66845..b711d36c5ec9 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -329,6 +329,7 @@ xchk_quota( /* Now look for things that the quota verifiers won't complain about. */ xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + mutex_lock(&dq->q_qlock); error = xchk_quota_item(&sqi, dq); mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index d4ce9e56d3ef..dae4889bdc84 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -512,6 +512,7 @@ xrep_quota_problems( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + mutex_lock(&dq->q_qlock); error = xrep_quota_item(&rqi, dq); mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c index bef63f19cd87..20220afd90f1 100644 --- a/fs/xfs/scrub/quotacheck.c +++ b/fs/xfs/scrub/quotacheck.c @@ -675,6 +675,7 @@ xqcheck_compare_dqtype( /* Compare what we observed against the actual dquots. */ xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + mutex_lock(&dq->q_qlock); error = xqcheck_compare_dquot(xqc, dqtype, dq); mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c index 3b23219d43ed..3013211fa6c1 100644 --- a/fs/xfs/scrub/quotacheck_repair.c +++ b/fs/xfs/scrub/quotacheck_repair.c @@ -155,6 +155,7 @@ xqcheck_commit_dqtype( */ xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + mutex_lock(&dq->q_qlock); error = xqcheck_commit_dquot(xqc, dqtype, dq); mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); From bfca8760f47ecda61441950babbea6f79a51b377 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:23:05 +0100 Subject: [PATCH 14/33] xfs: move q_qlock locking into xchk_quota_item This avoids a pointless roundtrip because ilock needs to be taken first. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/quota.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index b711d36c5ec9..5c5374c44c5a 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -155,10 +155,7 @@ xchk_quota_item( * We want to validate the bmap record for the storage backing this * dquot, so we need to lock the dquot and the quota file. For quota * operations, the locking order is first the ILOCK and then the dquot. - * However, dqiterate gave us a locked dquot, so drop the dquot lock to - * get the ILOCK. */ - mutex_unlock(&dq->q_qlock); xchk_ilock(sc, XFS_ILOCK_SHARED); mutex_lock(&dq->q_qlock); @@ -251,6 +248,7 @@ xchk_quota_item( xchk_quota_item_timer(sc, offset, &dq->q_rtb); out: + mutex_unlock(&dq->q_qlock); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -ECANCELED; @@ -329,9 +327,7 @@ xchk_quota( /* Now look for things that the quota verifiers won't complain about. */ xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { - mutex_lock(&dq->q_qlock); error = xchk_quota_item(&sqi, dq); - mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); if (error) break; From 7dd30acb4b3724ec4ecad1a6e2e19a33c0f0ace4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:23:06 +0100 Subject: [PATCH 15/33] xfs: move q_qlock locking into xqcheck_compare_dquot Instead of having both callers do it. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/quotacheck.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c index 20220afd90f1..d412a8359784 100644 --- a/fs/xfs/scrub/quotacheck.c +++ b/fs/xfs/scrub/quotacheck.c @@ -563,6 +563,7 @@ xqcheck_compare_dquot( return -ECANCELED; } + mutex_lock(&dq->q_qlock); mutex_lock(&xqc->lock); error = xfarray_load_sparse(counts, dq->q_id, &xcdq); if (error) @@ -589,7 +590,9 @@ xqcheck_compare_dquot( xchk_set_incomplete(xqc->sc); error = -ECANCELED; } +out_unlock: mutex_unlock(&xqc->lock); + mutex_unlock(&dq->q_qlock); if (error) return error; @@ -597,10 +600,6 @@ xqcheck_compare_dquot( return -ECANCELED; return 0; - -out_unlock: - mutex_unlock(&xqc->lock); - return error; } /* @@ -635,9 +634,7 @@ xqcheck_walk_observations( if (error) return error; - mutex_lock(&dq->q_qlock); error = xqcheck_compare_dquot(xqc, dqtype, dq); - mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); if (error) return error; @@ -675,9 +672,7 @@ xqcheck_compare_dqtype( /* Compare what we observed against the actual dquots. */ xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { - mutex_lock(&dq->q_qlock); error = xqcheck_compare_dquot(xqc, dqtype, dq); - mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); if (error) break; From a2ebb21f8ae1a8cc9414677ac7ddbf5c7cc6f48d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:23:07 +0100 Subject: [PATCH 16/33] xfs: move quota locking into xqcheck_commit_dquot Drop two redundant lock roundtrips by not requiring q_lock to be held on entry and return. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/quotacheck_repair.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c index 3013211fa6c1..51be8d8d261b 100644 --- a/fs/xfs/scrub/quotacheck_repair.c +++ b/fs/xfs/scrub/quotacheck_repair.c @@ -52,13 +52,11 @@ xqcheck_commit_dquot( bool dirty = false; int error = 0; - /* Unlock the dquot just long enough to allocate a transaction. */ - mutex_unlock(&dq->q_qlock); error = xchk_trans_alloc(xqc->sc, 0); - mutex_lock(&dq->q_qlock); if (error) return error; + mutex_lock(&dq->q_qlock); xfs_trans_dqjoin(xqc->sc->tp, dq); if (xchk_iscan_aborted(&xqc->iscan)) { @@ -115,23 +113,12 @@ xqcheck_commit_dquot( if (dq->q_id) xfs_qm_adjust_dqtimers(dq); xfs_trans_log_dquot(xqc->sc->tp, dq); - - /* - * Transaction commit unlocks the dquot, so we must re-lock it so that - * the caller can put the reference (which apparently requires a locked - * dquot). - */ - error = xrep_trans_commit(xqc->sc); - mutex_lock(&dq->q_qlock); - return error; + return xrep_trans_commit(xqc->sc); out_unlock: mutex_unlock(&xqc->lock); out_cancel: xchk_trans_cancel(xqc->sc); - - /* Re-lock the dquot so the caller can put the reference. */ - mutex_lock(&dq->q_qlock); return error; } @@ -155,9 +142,7 @@ xqcheck_commit_dqtype( */ xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { - mutex_lock(&dq->q_qlock); error = xqcheck_commit_dquot(xqc, dqtype, dq); - mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); if (error) break; @@ -188,9 +173,7 @@ xqcheck_commit_dqtype( if (error) return error; - mutex_lock(&dq->q_qlock); error = xqcheck_commit_dquot(xqc, dqtype, dq); - mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); if (error) return error; From b6d2ab27cc84b19afdf72eac1361fb343c4e0186 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:23:08 +0100 Subject: [PATCH 17/33] xfs: move quota locking into xrep_quota_item Drop two redundant lock roundtrips by not requiring q_lock to be held on entry and return. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/quota_repair.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index dae4889bdc84..b1d661aa5f06 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -184,17 +184,13 @@ xrep_quota_item( /* * We might need to fix holes in the bmap record for the storage * backing this dquot, so we need to lock the dquot and the quota file. - * dqiterate gave us a locked dquot, so drop the dquot lock to get the - * ILOCK_EXCL. */ - mutex_unlock(&dq->q_qlock); xchk_ilock(sc, XFS_ILOCK_EXCL); mutex_lock(&dq->q_qlock); - error = xrep_quota_item_bmap(sc, dq, &dirty); xchk_iunlock(sc, XFS_ILOCK_EXCL); if (error) - return error; + goto out_unlock_dquot; /* Check the limits. */ if (dq->q_blk.softlimit > dq->q_blk.hardlimit) { @@ -246,7 +242,7 @@ xrep_quota_item( xrep_quota_item_timer(sc, &dq->q_rtb, &dirty); if (!dirty) - return 0; + goto out_unlock_dquot; trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id); @@ -257,8 +253,10 @@ xrep_quota_item( xfs_qm_adjust_dqtimers(dq); } xfs_trans_log_dquot(sc->tp, dq); - error = xfs_trans_roll(&sc->tp); - mutex_lock(&dq->q_qlock); + return xfs_trans_roll(&sc->tp); + +out_unlock_dquot: + mutex_unlock(&dq->q_qlock); return error; } @@ -512,9 +510,7 @@ xrep_quota_problems( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { - mutex_lock(&dq->q_qlock); error = xrep_quota_item(&rqi, dq); - mutex_unlock(&dq->q_qlock); xfs_qm_dqrele(dq); if (error) break; From 13d3c1a045628e8453c31bd49578053c093e7a02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:23:09 +0100 Subject: [PATCH 18/33] xfs: move xfs_dquot_tree calls into xfs_qm_dqget_cache_{lookup,insert} These are the low-level functions that needs them, so localize the (trivial) calculation of the radix tree root there. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_dquot.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1c9c17892874..612ca682a513 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -801,10 +801,11 @@ xfs_dq_get_next_id( static struct xfs_dquot * xfs_qm_dqget_cache_lookup( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct radix_tree_root *tree, - xfs_dqid_t id) + xfs_dqid_t id, + xfs_dqtype_t type) { + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; restart: @@ -843,11 +844,12 @@ restart: static int xfs_qm_dqget_cache_insert( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct radix_tree_root *tree, xfs_dqid_t id, + xfs_dqtype_t type, struct xfs_dquot *dqp) { + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); unsigned int nofs_flags; int error; @@ -905,8 +907,6 @@ xfs_qm_dqget( bool can_alloc, struct xfs_dquot **O_dqpp) { - struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; @@ -915,7 +915,7 @@ xfs_qm_dqget( return error; restart: - dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); + dqp = xfs_qm_dqget_cache_lookup(mp, id, type); if (dqp) goto found; @@ -923,7 +923,7 @@ restart: if (error) return error; - error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + error = xfs_qm_dqget_cache_insert(mp, id, type, dqp); if (error) { xfs_qm_dqdestroy(dqp); if (error == -EEXIST) { @@ -996,8 +996,6 @@ xfs_qm_dqget_inode( struct xfs_dquot **dqpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; xfs_dqid_t id; int error; @@ -1016,7 +1014,7 @@ xfs_qm_dqget_inode( id = xfs_qm_id_for_quotatype(ip, type); restart: - dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); + dqp = xfs_qm_dqget_cache_lookup(mp, id, type); if (dqp) goto found; @@ -1052,7 +1050,7 @@ restart: return -ESRCH; } - error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + error = xfs_qm_dqget_cache_insert(mp, id, type, dqp); if (error) { xfs_qm_dqdestroy(dqp); if (error == -EEXIST) { From 6a7bb6ccd00580461f01e86f592c7d8c7bb54793 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Nov 2025 14:23:10 +0100 Subject: [PATCH 19/33] xfs: reduce ilock roundtrips in xfs_qm_vop_dqalloc xfs_qm_vop_dqalloc only needs the (exclusive) ilock for attaching dquots to the inode if not done so yet. All the other locks don't touch the inode and don't need the ilock - the i_rwsem / iolock protects against changes to the IDs while we are in a method, and the ilock would not help because dropping it for the dqget calls would be racy anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_qm.c | 32 +++----------------------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a81b8b7a4e4f..95be67ac6eb4 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1861,16 +1861,12 @@ xfs_qm_vop_dqalloc( struct xfs_dquot *gq = NULL; struct xfs_dquot *pq = NULL; int error; - uint lockflags; if (!XFS_IS_QUOTA_ON(mp)) return 0; ASSERT(!xfs_is_metadir_inode(ip)); - lockflags = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockflags); - if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) gid = inode->i_gid; @@ -1879,37 +1875,22 @@ xfs_qm_vop_dqalloc( * if necessary. The dquot(s) will not be locked. */ if (XFS_NOT_DQATTACHED(mp, ip)) { + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqattach_locked(ip, true); - if (error) { - xfs_iunlock(ip, lockflags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) return error; - } } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { ASSERT(O_udqpp); if (!uid_eq(inode->i_uid, uid)) { - /* - * What we need is the dquot that has this uid, and - * if we send the inode to dqget, the uid of the inode - * takes priority over what's sent in the uid argument. - * We must unlock inode here before calling dqget if - * we're not sending the inode, because otherwise - * we'll deadlock by doing trans_reserve while - * holding ilock. - */ - xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kuid(user_ns, uid), XFS_DQTYPE_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; } - /* - * Get the ilock in the right order. - */ - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); } else { /* * Take an extra reference, because we'll return @@ -1922,15 +1903,12 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { ASSERT(O_gdqpp); if (!gid_eq(inode->i_gid, gid)) { - xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), XFS_DQTYPE_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; } - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); gq = xfs_qm_dqhold(ip->i_gdquot); @@ -1939,15 +1917,12 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { ASSERT(O_pdqpp); if (ip->i_projid != prid) { - xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, prid, XFS_DQTYPE_PROJ, true, &pq); if (error) { ASSERT(error != -ENOENT); goto error_rele; } - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_pdquot); pq = xfs_qm_dqhold(ip->i_pdquot); @@ -1955,7 +1930,6 @@ xfs_qm_vop_dqalloc( } trace_xfs_dquot_dqalloc(ip); - xfs_iunlock(ip, lockflags); if (O_udqpp) *O_udqpp = uq; else From 74d975ed6c9f8ba44179502a8ad5a839b38e8630 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Oct 2025 08:05:48 +0100 Subject: [PATCH 20/33] xfs: add a XLOG_CYCLE_DATA_SIZE constant The XLOG_HEADER_CYCLE_SIZE / BBSIZE expression is used a lot in the log code, give it a symbolic name. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 5 +++-- fs/xfs/xfs_log.c | 18 +++++++++--------- fs/xfs/xfs_log_recover.c | 6 +++--- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 6c50cb2ece19..91a841ea5bb3 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -31,6 +31,7 @@ typedef uint32_t xlog_tid_t; #define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ #define XLOG_MAX_RECORD_BSIZE (256*1024) #define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ +#define XLOG_CYCLE_DATA_SIZE (XLOG_HEADER_CYCLE_SIZE / BBSIZE) #define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ #define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ #define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ @@ -135,7 +136,7 @@ typedef struct xlog_rec_header { __le32 h_crc; /* crc of log record : 4 */ __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ - __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; + __be32 h_cycle_data[XLOG_CYCLE_DATA_SIZE]; /* fields added by the Linux port: */ __be32 h_fmt; /* format of log record : 4 */ @@ -172,7 +173,7 @@ typedef struct xlog_rec_header { typedef struct xlog_rec_ext_header { __be32 xh_cycle; /* write cycle of log : 4 */ - __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ + __be32 xh_cycle_data[XLOG_CYCLE_DATA_SIZE]; /* : 256 */ } xlog_rec_ext_header_t; /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 603e85c1ab4c..e09e5f71ed8c 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1533,7 +1533,7 @@ xlog_pack_data( dp = iclog->ic_datap; for (i = 0; i < BTOBB(size); i++) { - if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) + if (i >= XLOG_CYCLE_DATA_SIZE) break; iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; *(__be32 *)dp = cycle_lsn; @@ -1544,8 +1544,8 @@ xlog_pack_data( xlog_in_core_2_t *xhdr = iclog->ic_data; for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + j = i / XLOG_CYCLE_DATA_SIZE; + k = i % XLOG_CYCLE_DATA_SIZE; xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; *(__be32 *)dp = cycle_lsn; dp += BBSIZE; @@ -3368,9 +3368,9 @@ xlog_verify_iclog( clientid = ophead->oh_clientid; } else { idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap); - if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { - j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + if (idx >= XLOG_CYCLE_DATA_SIZE) { + j = idx / XLOG_CYCLE_DATA_SIZE; + k = idx % XLOG_CYCLE_DATA_SIZE; clientid = xlog_get_client_id( xhdr[j].hic_xheader.xh_cycle_data[k]); } else { @@ -3392,9 +3392,9 @@ xlog_verify_iclog( op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap); - if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { - j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + if (idx >= XLOG_CYCLE_DATA_SIZE) { + j = idx / XLOG_CYCLE_DATA_SIZE; + k = idx % XLOG_CYCLE_DATA_SIZE; op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); } else { op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 549d60959aee..bb2b3f976deb 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2866,7 +2866,7 @@ xlog_unpack_data( int i, j, k; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + i < XLOG_CYCLE_DATA_SIZE; i++) { *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; dp += BBSIZE; } @@ -2874,8 +2874,8 @@ xlog_unpack_data( if (xfs_has_logv2(log->l_mp)) { xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + j = i / XLOG_CYCLE_DATA_SIZE; + k = i % XLOG_CYCLE_DATA_SIZE; *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; dp += BBSIZE; } From 899b7ee44baebcfb2b2366b2aff6e9aca4486c4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Oct 2025 08:05:49 +0100 Subject: [PATCH 21/33] xfs: add a on-disk log header cycle array accessor Accessing the cycle arrays in the original log record header vs the extended header is messy and duplicated in multiple places. Add a xlog_cycle_data helper to abstract it out. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_log.c | 63 ++++++++++------------------------------ fs/xfs/xfs_log_priv.h | 18 ++++++++++++ fs/xfs/xfs_log_recover.c | 17 ++--------- 3 files changed, 37 insertions(+), 61 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e09e5f71ed8c..a569a4320a3a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1524,18 +1524,13 @@ xlog_pack_data( struct xlog_in_core *iclog, int roundoff) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - char *dp; + struct xlog_rec_header *rhead = &iclog->ic_header; + __be32 cycle_lsn = CYCLE_LSN_DISK(rhead->h_lsn); + char *dp = iclog->ic_datap; + int i; - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); - - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size); i++) { - if (i >= XLOG_CYCLE_DATA_SIZE) - break; - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + for (i = 0; i < BTOBB(iclog->ic_offset + roundoff); i++) { + *xlog_cycle_data(rhead, i) = *(__be32 *)dp; *(__be32 *)dp = cycle_lsn; dp += BBSIZE; } @@ -1543,14 +1538,6 @@ xlog_pack_data( if (xfs_has_logv2(log->l_mp)) { xlog_in_core_2_t *xhdr = iclog->ic_data; - for ( ; i < BTOBB(size); i++) { - j = i / XLOG_CYCLE_DATA_SIZE; - k = i % XLOG_CYCLE_DATA_SIZE; - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - for (i = 1; i < log->l_iclog_heads; i++) xhdr[i].hic_xheader.xh_cycle = cycle_lsn; } @@ -3322,13 +3309,12 @@ xlog_verify_iclog( struct xlog_in_core *iclog, int count) { - struct xlog_op_header *ophead; + struct xlog_rec_header *rhead = &iclog->ic_header; xlog_in_core_t *icptr; - xlog_in_core_2_t *xhdr; - void *base_ptr, *ptr, *p; + void *base_ptr, *ptr; ptrdiff_t field_offset; uint8_t clientid; - int len, i, j, k, op_len; + int len, i, op_len; int idx; /* check validity of iclog pointers */ @@ -3342,11 +3328,10 @@ xlog_verify_iclog( spin_unlock(&log->l_icloglock); /* check log magic numbers */ - if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + if (rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); - base_ptr = ptr = &iclog->ic_header; - p = &iclog->ic_header; + base_ptr = ptr = rhead; for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: unexpected magic num", @@ -3354,29 +3339,19 @@ xlog_verify_iclog( } /* check fields */ - len = be32_to_cpu(iclog->ic_header.h_num_logops); + len = be32_to_cpu(rhead->h_num_logops); base_ptr = ptr = iclog->ic_datap; - ophead = ptr; - xhdr = iclog->ic_data; for (i = 0; i < len; i++) { - ophead = ptr; + struct xlog_op_header *ophead = ptr; + void *p = &ophead->oh_clientid; /* clientid is only 1 byte */ - p = &ophead->oh_clientid; field_offset = p - base_ptr; if (field_offset & 0x1ff) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap); - if (idx >= XLOG_CYCLE_DATA_SIZE) { - j = idx / XLOG_CYCLE_DATA_SIZE; - k = idx % XLOG_CYCLE_DATA_SIZE; - clientid = xlog_get_client_id( - xhdr[j].hic_xheader.xh_cycle_data[k]); - } else { - clientid = xlog_get_client_id( - iclog->ic_header.h_cycle_data[idx]); - } + clientid = xlog_get_client_id(*xlog_cycle_data(rhead, idx)); } if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) { xfs_warn(log->l_mp, @@ -3392,13 +3367,7 @@ xlog_verify_iclog( op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap); - if (idx >= XLOG_CYCLE_DATA_SIZE) { - j = idx / XLOG_CYCLE_DATA_SIZE; - k = idx % XLOG_CYCLE_DATA_SIZE; - op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); - } else { - op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); - } + op_len = be32_to_cpu(*xlog_cycle_data(rhead, idx)); } ptr += sizeof(struct xlog_op_header) + op_len; } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 0cfc654d8e87..d2f17691ecca 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -711,4 +711,22 @@ xlog_item_space( return round_up(nbytes, sizeof(uint64_t)); } +/* + * Cycles over XLOG_CYCLE_DATA_SIZE overflow into the extended header that was + * added for v2 logs. Addressing for the cycles array there is off by one, + * because the first batch of cycles is in the original header. + */ +static inline __be32 *xlog_cycle_data(struct xlog_rec_header *rhead, unsigned i) +{ + if (i >= XLOG_CYCLE_DATA_SIZE) { + xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; + unsigned j = i / XLOG_CYCLE_DATA_SIZE; + unsigned k = i % XLOG_CYCLE_DATA_SIZE; + + return &xhdr[j].hic_xheader.xh_cycle_data[k]; + } + + return &rhead->h_cycle_data[i]; +} + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index bb2b3f976deb..ef0f6efc4381 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2863,23 +2863,12 @@ xlog_unpack_data( char *dp, struct xlog *log) { - int i, j, k; + int i; - for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && - i < XLOG_CYCLE_DATA_SIZE; i++) { - *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { + *(__be32 *)dp = *xlog_cycle_data(rhead, i); dp += BBSIZE; } - - if (xfs_has_logv2(log->l_mp)) { - xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; - for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { - j = i / XLOG_CYCLE_DATA_SIZE; - k = i % XLOG_CYCLE_DATA_SIZE; - *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; - dp += BBSIZE; - } - } } /* From be665a4e27417227cf40cfe27e616838bb46548c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Oct 2025 08:05:50 +0100 Subject: [PATCH 22/33] xfs: don't use xlog_in_core_2_t in struct xlog_in_core Most accessed to the on-disk log record header are for the original xlog_rec_header. Make that the main structure, and case for the single remaining place using other union legs. This prepares for removing xlog_in_core_2_t entirely. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_log.c | 74 +++++++++++++++++++++---------------------- fs/xfs/xfs_log_cil.c | 6 ++-- fs/xfs/xfs_log_priv.h | 9 ++---- fs/xfs/xfs_trace.h | 2 +- 4 files changed, 44 insertions(+), 47 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a569a4320a3a..d9476124def6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -534,8 +534,8 @@ xlog_state_release_iclog( */ if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && - !iclog->ic_header.h_tail_lsn) { - iclog->ic_header.h_tail_lsn = + !iclog->ic_header->h_tail_lsn) { + iclog->ic_header->h_tail_lsn = cpu_to_be64(atomic64_read(&log->l_tail_lsn)); } @@ -1457,11 +1457,11 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = kvzalloc(log->l_iclog_size, + iclog->ic_header = kvzalloc(log->l_iclog_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); - if (!iclog->ic_data) + if (!iclog->ic_header) goto out_free_iclog; - head = &iclog->ic_header; + head = iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); head->h_version = cpu_to_be32( @@ -1476,7 +1476,7 @@ xlog_alloc_log( iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); INIT_LIST_HEAD(&iclog->ic_callbacks); - iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize; + iclog->ic_datap = (void *)iclog->ic_header + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1504,7 +1504,7 @@ out_destroy_workqueue: out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - kvfree(iclog->ic_data); + kvfree(iclog->ic_header); kfree(iclog); if (prev_iclog == log->l_iclog) break; @@ -1524,7 +1524,7 @@ xlog_pack_data( struct xlog_in_core *iclog, int roundoff) { - struct xlog_rec_header *rhead = &iclog->ic_header; + struct xlog_rec_header *rhead = iclog->ic_header; __be32 cycle_lsn = CYCLE_LSN_DISK(rhead->h_lsn); char *dp = iclog->ic_datap; int i; @@ -1536,7 +1536,7 @@ xlog_pack_data( } if (xfs_has_logv2(log->l_mp)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; + xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)iclog->ic_header; for (i = 1; i < log->l_iclog_heads; i++) xhdr[i].hic_xheader.xh_cycle = cycle_lsn; @@ -1658,11 +1658,11 @@ xlog_write_iclog( iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (is_vmalloc_addr(iclog->ic_data)) { - if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_data, count)) + if (is_vmalloc_addr(iclog->ic_header)) { + if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_header, count)) goto shutdown; } else { - bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_data, count); + bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_header, count); } /* @@ -1791,19 +1791,19 @@ xlog_sync( size = iclog->ic_offset; if (xfs_has_logv2(log->l_mp)) size += roundoff; - iclog->ic_header.h_len = cpu_to_be32(size); + iclog->ic_header->h_len = cpu_to_be32(size); XFS_STATS_INC(log->l_mp, xs_log_writes); XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count)); - bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); + bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header->h_lsn)); /* Do we need to split this write into 2 parts? */ if (bno + BTOBB(count) > log->l_logBBsize) - xlog_split_iclog(log, &iclog->ic_header, bno, count); + xlog_split_iclog(log, iclog->ic_header, bno, count); /* calculcate the checksum */ - iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_header->h_crc = xlog_cksum(log, iclog->ic_header, iclog->ic_datap, XLOG_REC_SIZE, size); /* * Intentionally corrupt the log record CRC based on the error injection @@ -1814,11 +1814,11 @@ xlog_sync( */ #ifdef DEBUG if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { - iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); + iclog->ic_header->h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_fail_crc = true; xfs_warn(log->l_mp, "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", - be64_to_cpu(iclog->ic_header.h_lsn)); + be64_to_cpu(iclog->ic_header->h_lsn)); } #endif xlog_verify_iclog(log, iclog, count); @@ -1845,7 +1845,7 @@ xlog_dealloc_log( iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next; - kvfree(iclog->ic_data); + kvfree(iclog->ic_header); kfree(iclog); iclog = next_iclog; } @@ -1867,7 +1867,7 @@ xlog_state_finish_copy( { lockdep_assert_held(&log->l_icloglock); - be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); + be32_add_cpu(&iclog->ic_header->h_num_logops, record_cnt); iclog->ic_offset += copy_bytes; } @@ -2290,7 +2290,7 @@ xlog_state_activate_iclog( * We don't need to cover the dummy. */ if (*iclogs_changed == 0 && - iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { + iclog->ic_header->h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { *iclogs_changed = 1; } else { /* @@ -2302,11 +2302,11 @@ xlog_state_activate_iclog( iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_offset = 0; - iclog->ic_header.h_num_logops = 0; - memset(iclog->ic_header.h_cycle_data, 0, - sizeof(iclog->ic_header.h_cycle_data)); - iclog->ic_header.h_lsn = 0; - iclog->ic_header.h_tail_lsn = 0; + iclog->ic_header->h_num_logops = 0; + memset(iclog->ic_header->h_cycle_data, 0, + sizeof(iclog->ic_header->h_cycle_data)); + iclog->ic_header->h_lsn = 0; + iclog->ic_header->h_tail_lsn = 0; } /* @@ -2398,7 +2398,7 @@ xlog_get_lowest_lsn( iclog->ic_state == XLOG_STATE_DIRTY) continue; - lsn = be64_to_cpu(iclog->ic_header.h_lsn); + lsn = be64_to_cpu(iclog->ic_header->h_lsn); if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0) lowest_lsn = lsn; } while ((iclog = iclog->ic_next) != log->l_iclog); @@ -2433,7 +2433,7 @@ xlog_state_iodone_process_iclog( * If this is not the lowest lsn iclog, then we will leave it * for another completion to process. */ - header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + header_lsn = be64_to_cpu(iclog->ic_header->h_lsn); lowest_lsn = xlog_get_lowest_lsn(log); if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) return false; @@ -2616,7 +2616,7 @@ restart: goto restart; } - head = &iclog->ic_header; + head = iclog->ic_header; atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; @@ -2781,7 +2781,7 @@ xlog_state_switch_iclogs( if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; - iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); + iclog->ic_header->h_prev_block = cpu_to_be32(log->l_prev_block); log->l_prev_block = log->l_curr_block; log->l_prev_cycle = log->l_curr_cycle; @@ -2825,7 +2825,7 @@ xlog_force_and_check_iclog( struct xlog_in_core *iclog, bool *completed) { - xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header->h_lsn); int error; *completed = false; @@ -2837,7 +2837,7 @@ xlog_force_and_check_iclog( * If the iclog has already been completed and reused the header LSN * will have been rewritten by completion */ - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) + if (be64_to_cpu(iclog->ic_header->h_lsn) != lsn) *completed = true; return 0; } @@ -2970,7 +2970,7 @@ xlog_force_lsn( goto out_error; iclog = log->l_iclog; - while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + while (be64_to_cpu(iclog->ic_header->h_lsn) != lsn) { trace_xlog_iclog_force_lsn(iclog, _RET_IP_); iclog = iclog->ic_next; if (iclog == log->l_iclog) @@ -3236,7 +3236,7 @@ xlog_verify_dump_tail( { xfs_alert(log->l_mp, "ran out of log space tail 0x%llx/0x%llx, head lsn 0x%llx, head 0x%x/0x%x, prev head 0x%x/0x%x", - iclog ? be64_to_cpu(iclog->ic_header.h_tail_lsn) : -1, + iclog ? be64_to_cpu(iclog->ic_header->h_tail_lsn) : -1, atomic64_read(&log->l_tail_lsn), log->l_ailp->ail_head_lsn, log->l_curr_cycle, log->l_curr_block, @@ -3255,7 +3255,7 @@ xlog_verify_tail_lsn( struct xlog *log, struct xlog_in_core *iclog) { - xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn); + xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header->h_tail_lsn); int blocks; if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { @@ -3309,7 +3309,7 @@ xlog_verify_iclog( struct xlog_in_core *iclog, int count) { - struct xlog_rec_header *rhead = &iclog->ic_header; + struct xlog_rec_header *rhead = iclog->ic_header; xlog_in_core_t *icptr; void *base_ptr, *ptr; ptrdiff_t field_offset; @@ -3507,7 +3507,7 @@ xlog_iclogs_empty( /* endianness does not matter here, zero is zero in * any language. */ - if (iclog->ic_header.h_num_logops) + if (iclog->ic_header->h_num_logops) return 0; iclog = iclog->ic_next; } while (iclog != log->l_iclog); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index f443757e93c2..778ac47adb8c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -940,7 +940,7 @@ xlog_cil_set_ctx_write_state( struct xlog_in_core *iclog) { struct xfs_cil *cil = ctx->cil; - xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header->h_lsn); ASSERT(!ctx->commit_lsn); if (!ctx->start_lsn) { @@ -1458,9 +1458,9 @@ xlog_cil_push_work( */ spin_lock(&log->l_icloglock); if (ctx->start_lsn != ctx->commit_lsn) { - xfs_lsn_t plsn; + xfs_lsn_t plsn = be64_to_cpu( + ctx->commit_iclog->ic_prev->ic_header->h_lsn); - plsn = be64_to_cpu(ctx->commit_iclog->ic_prev->ic_header.h_lsn); if (plsn && XFS_LSN_CMP(plsn, ctx->commit_lsn) < 0) { /* * Waiting on ic_force_wait orders the completion of diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index d2f17691ecca..f1aed6e8f747 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -158,10 +158,8 @@ struct xlog_ticket { }; /* - * - A log record header is 512 bytes. There is plenty of room to grow the - * xlog_rec_header_t into the reserved space. - * - ic_data follows, so a write to disk can start at the beginning of - * the iclog. + * In-core log structure. + * * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. * - ic_next is the pointer to the next iclog in the ring. * - ic_log is a pointer back to the global log structure. @@ -198,8 +196,7 @@ typedef struct xlog_in_core { /* reference counts need their own cacheline */ atomic_t ic_refcnt ____cacheline_aligned_in_smp; - xlog_in_core_2_t *ic_data; -#define ic_header ic_data->hic_header + struct xlog_rec_header *ic_header; #ifdef DEBUG bool ic_fail_crc : 1; #endif diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 90582ff7c2cf..f70afbf3cb19 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4932,7 +4932,7 @@ DECLARE_EVENT_CLASS(xlog_iclog_class, __entry->refcount = atomic_read(&iclog->ic_refcnt); __entry->offset = iclog->ic_offset; __entry->flags = iclog->ic_flags; - __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn); + __entry->lsn = be64_to_cpu(iclog->ic_header->h_lsn); __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS", From 16c18021e1f518e6ddd4ddf2b57aaca7a47a7124 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Oct 2025 08:05:51 +0100 Subject: [PATCH 23/33] xfs: cleanup xlog_alloc_log a bit Remove the separate head variable, move the ic_datap initialization up a bit where the context is more obvious and remove the duplicate memset right after a zeroing memory allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_log.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d9476124def6..3bd2f8787682 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1367,7 +1367,6 @@ xlog_alloc_log( int num_bblks) { struct xlog *log; - xlog_rec_header_t *head; xlog_in_core_t **iclogp; xlog_in_core_t *iclog, *prev_iclog=NULL; int i; @@ -1461,22 +1460,21 @@ xlog_alloc_log( GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog->ic_header) goto out_free_iclog; - head = iclog->ic_header; - memset(head, 0, sizeof(xlog_rec_header_t)); - head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); - head->h_version = cpu_to_be32( + iclog->ic_header->h_magicno = + cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + iclog->ic_header->h_version = cpu_to_be32( xfs_has_logv2(log->l_mp) ? 2 : 1); - head->h_size = cpu_to_be32(log->l_iclog_size); - /* new fields */ - head->h_fmt = cpu_to_be32(XLOG_FMT); - memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + iclog->ic_header->h_size = cpu_to_be32(log->l_iclog_size); + iclog->ic_header->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&iclog->ic_header->h_fs_uuid, &mp->m_sb.sb_uuid, + sizeof(iclog->ic_header->h_fs_uuid)); + iclog->ic_datap = (void *)iclog->ic_header + log->l_iclog_hsize; iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); INIT_LIST_HEAD(&iclog->ic_callbacks); - iclog->ic_datap = (void *)iclog->ic_header + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); From 9ed9df98fcd7203c0eeac21e6784bb7cc7a291d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Oct 2025 08:05:52 +0100 Subject: [PATCH 24/33] xfs: remove a very outdated comment from xlog_alloc_log The xlog_iclog definition has been pretty standard for a while, so drop this now rather misleading comment. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_log.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3bd2f8787682..acddab467b77 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1435,13 +1435,6 @@ xlog_alloc_log( init_waitqueue_head(&log->l_flush_wait); iclogp = &log->l_iclog; - /* - * The amount of memory to allocate for the iclog structure is - * rather funky due to the way the structure is defined. It is - * done this way so that we can use different sizes for machines - * with different amounts of memory. See the definition of - * xlog_in_core_t in xfs_log_priv.h for details. - */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * From fe985b910e03fd91193f399a1aca9d1ea22c2557 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Oct 2025 08:05:53 +0100 Subject: [PATCH 25/33] xfs: remove xlog_in_core_2_t xlog_in_core_2_t is a really odd type, not only is it grossly misnamed because it actually is an on-disk structure, but it also reprents the actual on-disk structure in a rather odd way. A v1 or small v2 log header look like: +-----------------------+ | xlog_record | +-----------------------+ while larger v2 log headers look like: +-----------------------+ | xlog_record | +-----------------------+ | xlog_rec_ext_header | +-------------------+---+ | ..... | +-----------------------+ | xlog_rec_ext_header | +-----------------------+ I.e., the ext headers are a variable sized array at the end of the header. So instead of declaring a union of xlog_rec_header, xlog_rec_ext_header and padding to BBSIZE, add the proper padding to struct struct xlog_rec_header and struct xlog_rec_ext_header, and add a variable sized array of the latter to the former. This also exposes the somewhat unusual scope of the log checksums, which is made explicitly now by adding proper padding and macro designating the actual payload length. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 31 +++++++++++++++---------------- fs/xfs/libxfs/xfs_ondisk.h | 6 ++++-- fs/xfs/xfs_log.c | 21 ++++++--------------- fs/xfs/xfs_log_priv.h | 3 +-- 4 files changed, 26 insertions(+), 35 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 91a841ea5bb3..4cb69bd285ca 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -126,6 +126,16 @@ struct xlog_op_header { #define XLOG_FMT XLOG_FMT_LINUX_LE #endif +struct xlog_rec_ext_header { + __be32 xh_cycle; /* write cycle of log */ + __be32 xh_cycle_data[XLOG_CYCLE_DATA_SIZE]; + __u8 xh_reserved[252]; +}; + +/* actual ext header payload size for checksumming */ +#define XLOG_REC_EXT_SIZE \ + offsetofend(struct xlog_rec_ext_header, xh_cycle_data) + typedef struct xlog_rec_header { __be32 h_magicno; /* log record (LR) identifier : 4 */ __be32 h_cycle; /* write cycle of log : 4 */ @@ -161,30 +171,19 @@ typedef struct xlog_rec_header { * (little-endian) architectures. */ __u32 h_pad0; + + __u8 h_reserved[184]; + struct xlog_rec_ext_header h_ext[]; } xlog_rec_header_t; #ifdef __i386__ #define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size) -#define XLOG_REC_SIZE_OTHER sizeof(struct xlog_rec_header) +#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_pad0) #else -#define XLOG_REC_SIZE sizeof(struct xlog_rec_header) +#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_pad0) #define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_size) #endif /* __i386__ */ -typedef struct xlog_rec_ext_header { - __be32 xh_cycle; /* write cycle of log : 4 */ - __be32 xh_cycle_data[XLOG_CYCLE_DATA_SIZE]; /* : 256 */ -} xlog_rec_ext_header_t; - -/* - * Quite misnamed, because this union lays out the actual on-disk log buffer. - */ -typedef union xlog_in_core2 { - xlog_rec_header_t hic_header; - xlog_rec_ext_header_t hic_xheader; - char hic_sector[XLOG_HEADER_SIZE]; -} xlog_in_core_2_t; - /* not an on-disk structure, but needed by log recovery in userspace */ struct xfs_log_iovec { void *i_addr; /* beginning address of region */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 7bfa3242e2c5..2e9715cc1641 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -174,9 +174,11 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); - XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 328); - XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 260); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 512); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 512); + XFS_CHECK_OFFSET(struct xlog_rec_header, h_reserved, 328); + XFS_CHECK_OFFSET(struct xlog_rec_ext_header, xh_reserved, 260); XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index acddab467b77..1fe3abbd3d36 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1526,12 +1526,8 @@ xlog_pack_data( dp += BBSIZE; } - if (xfs_has_logv2(log->l_mp)) { - xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)iclog->ic_header; - - for (i = 1; i < log->l_iclog_heads; i++) - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + for (i = 0; i < log->l_iclog_heads - 1; i++) + rhead->h_ext[i].xh_cycle = cycle_lsn; } /* @@ -1556,16 +1552,11 @@ xlog_cksum( /* ... then for additional cycle data for v2 logs ... */ if (xfs_has_logv2(log->l_mp)) { - union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; - int i; - int xheads; + int xheads, i; - xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE); - - for (i = 1; i < xheads; i++) { - crc = crc32c(crc, &xhdr[i].hic_xheader, - sizeof(struct xlog_rec_ext_header)); - } + xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE) - 1; + for (i = 0; i < xheads; i++) + crc = crc32c(crc, &rhead->h_ext[i], XLOG_REC_EXT_SIZE); } /* ... and finally for the payload */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index f1aed6e8f747..ac98ac71152d 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -716,11 +716,10 @@ xlog_item_space( static inline __be32 *xlog_cycle_data(struct xlog_rec_header *rhead, unsigned i) { if (i >= XLOG_CYCLE_DATA_SIZE) { - xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; unsigned j = i / XLOG_CYCLE_DATA_SIZE; unsigned k = i % XLOG_CYCLE_DATA_SIZE; - return &xhdr[j].hic_xheader.xh_cycle_data[k]; + return &rhead->h_ext[j - 1].xh_cycle_data[k]; } return &rhead->h_cycle_data[i]; From ef1e275638fe6f6d54c18a770c138e4d5972b280 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Oct 2025 08:05:54 +0100 Subject: [PATCH 26/33] xfs: remove the xlog_rec_header_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/xfs_log.c | 6 +++--- fs/xfs/xfs_log_recover.c | 28 ++++++++++++++-------------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 4cb69bd285ca..908e7060428c 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -136,7 +136,7 @@ struct xlog_rec_ext_header { #define XLOG_REC_EXT_SIZE \ offsetofend(struct xlog_rec_ext_header, xh_cycle_data) -typedef struct xlog_rec_header { +struct xlog_rec_header { __be32 h_magicno; /* log record (LR) identifier : 4 */ __be32 h_cycle; /* write cycle of log : 4 */ __be32 h_version; /* LR version : 4 */ @@ -174,7 +174,7 @@ typedef struct xlog_rec_header { __u8 h_reserved[184]; struct xlog_rec_ext_header h_ext[]; -} xlog_rec_header_t; +}; #ifdef __i386__ #define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1fe3abbd3d36..8b3b79699596 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2578,9 +2578,9 @@ xlog_state_get_iclog_space( struct xlog_ticket *ticket, int *logoffsetp) { - int log_offset; - xlog_rec_header_t *head; - xlog_in_core_t *iclog; + int log_offset; + struct xlog_rec_header *head; + struct xlog_in_core *iclog; restart: spin_lock(&log->l_icloglock); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ef0f6efc4381..03e42c7dab56 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -190,8 +190,8 @@ xlog_bwrite( */ STATIC void xlog_header_check_dump( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", __func__, &mp->m_sb.sb_uuid, XLOG_FMT); @@ -207,8 +207,8 @@ xlog_header_check_dump( */ STATIC int xlog_header_check_recover( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); @@ -238,8 +238,8 @@ xlog_header_check_recover( */ STATIC int xlog_header_check_mount( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); @@ -400,7 +400,7 @@ xlog_find_verify_log_record( xfs_daddr_t i; char *buffer; char *offset = NULL; - xlog_rec_header_t *head = NULL; + struct xlog_rec_header *head = NULL; int error = 0; int smallmem = 0; int num_blks = *last_blk - start_blk; @@ -437,7 +437,7 @@ xlog_find_verify_log_record( goto out; } - head = (xlog_rec_header_t *)offset; + head = (struct xlog_rec_header *)offset; if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) break; @@ -1237,7 +1237,7 @@ xlog_find_tail( xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { - xlog_rec_header_t *rhead; + struct xlog_rec_header *rhead; char *offset = NULL; char *buffer; int error; @@ -1487,7 +1487,7 @@ xlog_add_record( int tail_cycle, int tail_block) { - xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; + struct xlog_rec_header *recp = (struct xlog_rec_header *)buf; memset(buf, 0, BBSIZE); recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); @@ -2997,7 +2997,7 @@ xlog_do_recovery_pass( int pass, xfs_daddr_t *first_bad) /* out: first bad log rec */ { - xlog_rec_header_t *rhead; + struct xlog_rec_header *rhead; xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; @@ -3034,7 +3034,7 @@ xlog_do_recovery_pass( if (error) goto bread_err1; - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; /* * xfsprogs has a bug where record length is based on lsunit but @@ -3141,7 +3141,7 @@ xlog_do_recovery_pass( if (error) goto bread_err2; } - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; error = xlog_valid_rec_header(log, rhead, split_hblks ? blk_no : 0, h_size); if (error) @@ -3223,7 +3223,7 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; error = xlog_valid_rec_header(log, rhead, blk_no, h_size); if (error) goto bread_err2; From bc2dd9f2ba004cb4cce671dbe62f5193f58e4abc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Oct 2025 08:05:55 +0100 Subject: [PATCH 27/33] xfs: remove l_iclog_heads l_iclog_heads is only used in one place and can be trivially derived from l_iclog_hsize by a single shift operation. Remove it, and switch the initialization of l_iclog_hsize to use struct_size so that it is directly derived from the on-disk format definition. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_log.c | 11 ++++++----- fs/xfs/xfs_log_priv.h | 1 - 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8b3b79699596..47a8e74c8c5c 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1279,11 +1279,12 @@ xlog_get_iclog_buffer_size( log->l_iclog_size = mp->m_logbsize; /* - * # headers = size / 32k - one header holds cycles from 32k of data. + * Combined size of the log record headers. The first 32k cycles + * are stored directly in the xlog_rec_header, the rest in the + * variable number of xlog_rec_ext_headers at its end. */ - log->l_iclog_heads = - DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE); - log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT; + log->l_iclog_hsize = struct_size(log->l_iclog->ic_header, h_ext, + DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE) - 1); } void @@ -1526,7 +1527,7 @@ xlog_pack_data( dp += BBSIZE; } - for (i = 0; i < log->l_iclog_heads - 1; i++) + for (i = 0; i < (log->l_iclog_hsize >> BBSHIFT) - 1; i++) rhead->h_ext[i].xh_cycle = cycle_lsn; } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ac98ac71152d..17733ba7f251 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -406,7 +406,6 @@ struct xlog { struct list_head *l_buf_cancel_table; struct list_head r_dfops; /* recovered log intent items */ int l_iclog_hsize; /* size of iclog header */ - int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ int l_iclog_size; /* size of log in bytes */ int l_iclog_bufs; /* number of iclog buffers */ From 6731f85d38aa476275183ccdd73527cd6d7f3297 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Oct 2025 08:05:56 +0100 Subject: [PATCH 28/33] xfs: remove the xlog_in_core_t typedef Switch the few remaining users to use the underlying struct directly. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_log.c | 18 +++++++++--------- fs/xfs/xfs_log_priv.h | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 47a8e74c8c5c..a311385b23d8 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1368,8 +1368,8 @@ xlog_alloc_log( int num_bblks) { struct xlog *log; - xlog_in_core_t **iclogp; - xlog_in_core_t *iclog, *prev_iclog=NULL; + struct xlog_in_core **iclogp; + struct xlog_in_core *iclog, *prev_iclog = NULL; int i; int error = -ENOMEM; uint log2_size = 0; @@ -1813,10 +1813,10 @@ xlog_sync( */ STATIC void xlog_dealloc_log( - struct xlog *log) + struct xlog *log) { - xlog_in_core_t *iclog, *next_iclog; - int i; + struct xlog_in_core *iclog, *next_iclog; + int i; /* * Destroy the CIL after waiting for iclog IO completion because an @@ -3293,7 +3293,7 @@ xlog_verify_iclog( int count) { struct xlog_rec_header *rhead = iclog->ic_header; - xlog_in_core_t *icptr; + struct xlog_in_core *icptr; void *base_ptr, *ptr; ptrdiff_t field_offset; uint8_t clientid; @@ -3481,11 +3481,10 @@ xlog_force_shutdown( STATIC int xlog_iclogs_empty( - struct xlog *log) + struct xlog *log) { - xlog_in_core_t *iclog; + struct xlog_in_core *iclog = log->l_iclog; - iclog = log->l_iclog; do { /* endianness does not matter here, zero is zero in * any language. @@ -3494,6 +3493,7 @@ xlog_iclogs_empty( return 0; iclog = iclog->ic_next; } while (iclog != log->l_iclog); + return 1; } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 17733ba7f251..0fe59f0525aa 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -181,7 +181,7 @@ struct xlog_ticket { * We'll put all the read-only and l_icloglock fields in the first cacheline, * and move everything else out to subsequent cachelines. */ -typedef struct xlog_in_core { +struct xlog_in_core { wait_queue_head_t ic_force_wait; wait_queue_head_t ic_write_wait; struct xlog_in_core *ic_next; @@ -204,7 +204,7 @@ typedef struct xlog_in_core { struct work_struct ic_end_io_work; struct bio ic_bio; struct bio_vec ic_bvec[]; -} xlog_in_core_t; +}; /* * The CIL context is used to aggregate per-transaction details as well be @@ -418,7 +418,7 @@ struct xlog { /* waiting for iclog flush */ int l_covered_state;/* state of "covering disk * log entries" */ - xlog_in_core_t *l_iclog; /* head log queue */ + struct xlog_in_core *l_iclog; /* head log queue */ spinlock_t l_icloglock; /* grab to change iclog state */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last From bf3b8e915215ef78319b896c0ccc14dc57dac80f Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 31 Oct 2025 09:29:48 +0100 Subject: [PATCH 29/33] xfs: remove xarray mark for reclaimable zones We can easily check if there are any reclaimble zones by just looking at the used counters in the reclaim buckets, so do that to free up the xarray mark we currently use for this purpose. Signed-off-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_rtgroup.h | 6 ------ fs/xfs/xfs_zone_alloc.c | 26 ++++++++++++++++++++++---- fs/xfs/xfs_zone_gc.c | 2 +- fs/xfs/xfs_zone_priv.h | 1 + fs/xfs/xfs_zone_space_resv.c | 2 +- 5 files changed, 25 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index a94e925ae67c..03f1e2493334 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -64,12 +64,6 @@ struct xfs_rtgroup { */ #define XFS_RTG_FREE XA_MARK_0 -/* - * For zoned RT devices this is set on groups that are fully written and that - * have unused blocks. Used by the garbage collection to pick targets. - */ -#define XFS_RTG_RECLAIMABLE XA_MARK_1 - static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) { return container_of(xg, struct xfs_rtgroup, rtg_group); diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index ef7a931ebde5..0a118376c57c 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -103,9 +103,6 @@ xfs_zone_account_reclaimable( */ trace_xfs_zone_emptied(rtg); - if (!was_full) - xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE); - spin_lock(&zi->zi_used_buckets_lock); if (!was_full) xfs_zone_remove_from_bucket(zi, rgno, from_bucket); @@ -127,7 +124,6 @@ xfs_zone_account_reclaimable( xfs_zone_add_to_bucket(zi, rgno, to_bucket); spin_unlock(&zi->zi_used_buckets_lock); - xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE); if (zi->zi_gc_thread && xfs_zoned_need_gc(mp)) wake_up_process(zi->zi_gc_thread); } else if (to_bucket != from_bucket) { @@ -142,6 +138,28 @@ xfs_zone_account_reclaimable( } } +/* + * Check if we have any zones that can be reclaimed by looking at the entry + * counters for the zone buckets. + */ +bool +xfs_zoned_have_reclaimable( + struct xfs_zone_info *zi) +{ + int i; + + spin_lock(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + if (zi->zi_used_bucket_entries[i]) { + spin_unlock(&zi->zi_used_buckets_lock); + return true; + } + } + spin_unlock(&zi->zi_used_buckets_lock); + + return false; +} + static void xfs_open_zone_mark_full( struct xfs_open_zone *oz) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index a98939aba7b9..d786162ada1c 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -175,7 +175,7 @@ xfs_zoned_need_gc( s64 available, free, threshold; s32 remainder; - if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) + if (!xfs_zoned_have_reclaimable(mp->m_zone_info)) return false; available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h index 4322e26dd99a..ce7f0e2f4598 100644 --- a/fs/xfs/xfs_zone_priv.h +++ b/fs/xfs/xfs_zone_priv.h @@ -113,6 +113,7 @@ struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg); bool xfs_zoned_need_gc(struct xfs_mount *mp); +bool xfs_zoned_have_reclaimable(struct xfs_zone_info *zi); int xfs_zone_gc_mount(struct xfs_mount *mp); void xfs_zone_gc_unmount(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c index 0e54e557a585..fc1a4d1ce10c 100644 --- a/fs/xfs/xfs_zone_space_resv.c +++ b/fs/xfs/xfs_zone_space_resv.c @@ -172,7 +172,7 @@ xfs_zoned_reserve_available( * processing a pending GC request give up as we're fully out * of space. */ - if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) && + if (!xfs_zoned_have_reclaimable(mp->m_zone_info) && !xfs_is_zonegc_running(mp)) break; From 9b0305968d60a7672a7db29c07cfbe03bc5ae3ab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Oct 2025 08:01:50 +0100 Subject: [PATCH 30/33] xfs: remove the unused bv field in struct xfs_gc_bio Signed-off-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Hans Holmberg Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_gc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index d786162ada1c..b6a2ecca2e49 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -117,7 +117,6 @@ struct xfs_gc_bio { struct xfs_rtgroup *victim_rtg; /* Bio used for reads and writes, including the bvec used by it */ - struct bio_vec bv; struct bio bio; /* must be last */ }; From 1cfe3795c152c7415a9f49fc1e7f623c855d14ab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Nov 2025 07:49:42 +0100 Subject: [PATCH 31/33] xfs: use zi more in xfs_zone_gc_mount Use the local variable instead of the extra pointer dereference when starting the GC thread. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_gc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index b6a2ecca2e49..3c52cc1497d4 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -1182,16 +1182,16 @@ xfs_zone_gc_mount( goto out_put_gc_zone; } - mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, + zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, "xfs-zone-gc/%s", mp->m_super->s_id); - if (IS_ERR(mp->m_zone_info->zi_gc_thread)) { + if (IS_ERR(zi->zi_gc_thread)) { xfs_warn(mp, "unable to create zone gc thread"); - error = PTR_ERR(mp->m_zone_info->zi_gc_thread); + error = PTR_ERR(zi->zi_gc_thread); goto out_free_gc_data; } /* xfs_zone_gc_start will unpark for rw mounts */ - kthread_park(mp->m_zone_info->zi_gc_thread); + kthread_park(zi->zi_gc_thread); return 0; out_free_gc_data: From dcfa98bb5f7816aec787e74871982b06607ba9cb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Nov 2025 14:54:14 +0100 Subject: [PATCH 32/33] xfs: move some code out of xfs_iget_recycle Having a function drop locks, reacquire them and release them again seems to confuse the clang lock analysis even more than it confuses humans. Keep the humans and machines sanity by moving a chunk of code into the caller to simplify the lock tracking. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_icache.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e44040206851..546efa6cec72 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -358,7 +358,7 @@ xfs_reinit_inode( static int xfs_iget_recycle( struct xfs_perag *pag, - struct xfs_inode *ip) __releases(&ip->i_flags_lock) + struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); @@ -366,20 +366,6 @@ xfs_iget_recycle( trace_xfs_iget_recycle(ip); - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) - return -EAGAIN; - - /* - * We need to make it look like the inode is being reclaimed to prevent - * the actual reclaim workers from stomping over us while we recycle - * the inode. We can't clear the radix tree tag yet as it requires - * pag_ici_lock to be held exclusive. - */ - ip->i_flags |= XFS_IRECLAIM; - - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -576,10 +562,19 @@ xfs_iget_cache_hit( /* The inode fits the selection criteria; process it. */ if (ip->i_flags & XFS_IRECLAIMABLE) { - /* Drops i_flags_lock and RCU read lock. */ - error = xfs_iget_recycle(pag, ip); - if (error == -EAGAIN) + /* + * We need to make it look like the inode is being reclaimed to + * prevent the actual reclaim workers from stomping over us + * while we recycle the inode. We can't clear the radix tree + * tag yet as it requires pag_ici_lock to be held exclusive. + */ + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) goto out_skip; + ip->i_flags |= XFS_IRECLAIM; + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = xfs_iget_recycle(pag, ip); if (error) return error; } else { From 69ceb8a2d6665625d816fcf8ccd01965cddb233e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 4 Nov 2025 16:48:40 -0800 Subject: [PATCH 33/33] docs: remove obsolete links in the xfs online repair documentation Online repair is now merged in upstream, no need to point to patchset links anymore. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- .../xfs/xfs-online-fsck-design.rst | 236 +----------------- 1 file changed, 6 insertions(+), 230 deletions(-) diff --git a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst index 8cbcd3c26434..189d1f5f4078 100644 --- a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -105,10 +105,8 @@ occur; this capability aids both strategies. TLDR; Show Me the Code! ----------------------- -Code is posted to the kernel.org git trees as follows: -`kernel changes `_, -`userspace changes `_, and -`QA test changes `_. +Kernel and userspace code has been fully merged as of October 2025. + Each kernel patchset adding an online repair function will use the same branch name across the kernel, xfsprogs, and fstests git repos. @@ -764,12 +762,8 @@ allow the online fsck developers to compare online fsck against offline fsck, and they enable XFS developers to find deficiencies in the code base. Proposed patchsets include -`general fuzzer improvements -`_, `fuzzing baselines -`_, -and `improvements in fuzz testing comprehensiveness -`_. +`_. Stress Testing -------------- @@ -801,11 +795,6 @@ Success is defined by the ability to run all of these tests without observing any unexpected filesystem shutdowns due to corrupted metadata, kernel hang check warnings, or any other sort of mischief. -Proposed patchsets include `general stress testing -`_ -and the `evolution of existing per-function stress testing -`_. - 4. User Interface ================= @@ -886,10 +875,6 @@ apply as nice of a priority to IO and CPU scheduling as possible. This measure was taken to minimize delays in the rest of the filesystem. No such hardening has been performed for the cron job. -Proposed patchset: -`Enabling the xfs_scrub background service -`_. - Health Reporting ---------------- @@ -912,13 +897,6 @@ notifications and initiate a repair? *Answer*: These questions remain unanswered, but should be a part of the conversation with early adopters and potential downstream users of XFS. -Proposed patchsets include -`wiring up health reports to correction returns -`_ -and -`preservation of sickness info during memory reclaim -`_. - 5. Kernel Algorithms and Data Structures ======================================== @@ -1310,21 +1288,6 @@ Space allocation records are cross-referenced as follows: are there the same number of reverse mapping records for each block as the reference count record claims? -Proposed patchsets are the series to find gaps in -`refcount btree -`_, -`inode btree -`_, and -`rmap btree -`_ records; -to find -`mergeable records -`_; -and to -`improve cross referencing with rmap -`_ -before starting a repair. - Checking Extended Attributes ```````````````````````````` @@ -1756,10 +1719,6 @@ For scrub, the drain works as follows: To avoid polling in step 4, the drain provides a waitqueue for scrub threads to be woken up whenever the intent count drops to zero. -The proposed patchset is the -`scrub intent drain series -`_. - .. _jump_labels: Static Keys (aka Jump Label Patching) @@ -2036,10 +1995,6 @@ The ``xfarray_store_anywhere`` function is used to insert a record in any null record slot in the bag; and the ``xfarray_unset`` function removes a record from the bag. -The proposed patchset is the -`big in-memory array -`_. - Iterating Array Elements ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -2172,10 +2127,6 @@ However, it should be noted that these repair functions only use blob storage to cache a small number of entries before adding them to a temporary ondisk file, which is why compaction is not required. -The proposed patchset is at the start of the -`extended attribute repair -`_ series. - .. _xfbtree: In-Memory B+Trees @@ -2214,11 +2165,6 @@ xfiles enables reuse of the entire btree library. Btrees built atop an xfile are collectively known as ``xfbtrees``. The next few sections describe how they actually work. -The proposed patchset is the -`in-memory btree -`_ -series. - Using xfiles as a Buffer Cache Target ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -2459,14 +2405,6 @@ This enables the log to release the old EFI to keep the log moving forwards. EFIs have a role to play during the commit and reaping phases; please see the next section and the section about :ref:`reaping` for more details. -Proposed patchsets are the -`bitmap rework -`_ -and the -`preparation for bulk loading btrees -`_. - - Writing the New Tree ```````````````````` @@ -2623,11 +2561,6 @@ The number of records for the inode btree is the number of xfarray records, but the record count for the free inode btree has to be computed as inode chunk records are stored in the xfarray. -The proposed patchset is the -`AG btree repair -`_ -series. - Case Study: Rebuilding the Space Reference Counts ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -2716,11 +2649,6 @@ Reverse mappings are added to the bag using ``xfarray_store_anywhere`` and removed via ``xfarray_unset``. Bag members are examined through ``xfarray_iter`` loops. -The proposed patchset is the -`AG btree repair -`_ -series. - Case Study: Rebuilding File Fork Mapping Indices ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -2757,11 +2685,6 @@ EXTENTS format instead of BMBT, which may require a conversion. Third, the incore extent map must be reloaded carefully to avoid disturbing any delayed allocation extents. -The proposed patchset is the -`file mapping repair -`_ -series. - .. _reaping: Reaping Old Metadata Blocks @@ -2843,11 +2766,6 @@ blocks. As stated earlier, online repair functions use very large transactions to minimize the chances of this occurring. -The proposed patchset is the -`preparation for bulk loading btrees -`_ -series. - Case Study: Reaping After a Regular Btree Repair ```````````````````````````````````````````````` @@ -2943,11 +2861,6 @@ When the walk is complete, the bitmap disunion operation ``(ag_owner_bitmap & btrees. These blocks can then be reaped using the methods outlined above. -The proposed patchset is the -`AG btree repair -`_ -series. - .. _rmap_reap: Case Study: Reaping After Repairing Reverse Mapping Btrees @@ -2972,11 +2885,6 @@ methods outlined above. The rest of the process of rebuildng the reverse mapping btree is discussed in a separate :ref:`case study`. -The proposed patchset is the -`AG btree repair -`_ -series. - Case Study: Rebuilding the AGFL ``````````````````````````````` @@ -3024,11 +2932,6 @@ more complicated, because computing the correct value requires traversing the forks, or if that fails, leaving the fields invalid and waiting for the fork fsck functions to run. -The proposed patchset is the -`inode -`_ -repair series. - Quota Record Repairs -------------------- @@ -3045,11 +2948,6 @@ checking are obviously bad limits and timer values. Quota usage counters are checked, repaired, and discussed separately in the section about :ref:`live quotacheck `. -The proposed patchset is the -`quota -`_ -repair series. - .. _fscounters: Freezing to Fix Summary Counters @@ -3145,11 +3043,6 @@ long enough to check and correct the summary counters. | This bug was fixed in Linux 5.17. | +--------------------------------------------------------------------------+ -The proposed patchset is the -`summary counter cleanup -`_ -series. - Full Filesystem Scans --------------------- @@ -3277,15 +3170,6 @@ Second, if the incore inode is stuck in some intermediate state, the scan coordinator must release the AGI and push the main filesystem to get the inode back into a loadable state. -The proposed patches are the -`inode scanner -`_ -series. -The first user of the new functionality is the -`online quotacheck -`_ -series. - Inode Management ```````````````` @@ -3381,12 +3265,6 @@ To capture these nuances, the online fsck code has a separate ``xchk_irele`` function to set or clear the ``DONTCACHE`` flag to get the required release behavior. -Proposed patchsets include fixing -`scrub iget usage -`_ and -`dir iget usage -`_. - .. _ilocking: Locking Inodes @@ -3443,11 +3321,6 @@ If the dotdot entry changes while the directory is unlocked, then a move or rename operation must have changed the child's parentage, and the scan can exit early. -The proposed patchset is the -`directory repair -`_ -series. - .. _fshooks: Filesystem Hooks @@ -3594,11 +3467,6 @@ The inode scan APIs are pretty simple: - ``xchk_iscan_teardown`` to finish the scan -This functionality is also a part of the -`inode scanner -`_ -series. - .. _quotacheck: Case Study: Quota Counter Checking @@ -3686,11 +3554,6 @@ needing to hold any locks for a long duration. If repairs are desired, the real and shadow dquots are locked and their resource counts are set to the values in the shadow dquot. -The proposed patchset is the -`online quotacheck -`_ -series. - .. _nlinks: Case Study: File Link Count Checking @@ -3744,11 +3607,6 @@ shadow information. If no parents are found, the file must be :ref:`reparented ` to the orphanage to prevent the file from being lost forever. -The proposed patchset is the -`file link count repair -`_ -series. - .. _rmap_repair: Case Study: Rebuilding Reverse Mapping Records @@ -3828,11 +3686,6 @@ scan for reverse mapping records. 12. Free the xfbtree now that it not needed. -The proposed patchset is the -`rmap repair -`_ -series. - Staging Repairs with Temporary Files on Disk -------------------------------------------- @@ -3971,11 +3824,6 @@ Once a good copy of a data file has been constructed in a temporary file, it must be conveyed to the file being repaired, which is the topic of the next section. -The proposed patches are in the -`repair temporary files -`_ -series. - Logged File Content Exchanges ----------------------------- @@ -4025,11 +3873,6 @@ The new ``XFS_SB_FEAT_INCOMPAT_EXCHRANGE`` incompatible feature flag in the superblock protects these new log item records from being replayed on old kernels. -The proposed patchset is the -`file contents exchange -`_ -series. - +--------------------------------------------------------------------------+ | **Sidebar: Using Log-Incompatible Feature Flags** | +--------------------------------------------------------------------------+ @@ -4323,11 +4166,6 @@ To repair the summary file, write the xfile contents into the temporary file and use atomic mapping exchange to commit the new contents. The temporary file is then reaped. -The proposed patchset is the -`realtime summary repair -`_ -series. - Case Study: Salvaging Extended Attributes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -4369,11 +4207,6 @@ Salvaging extended attributes is done as follows: 4. Reap the temporary file. -The proposed patchset is the -`extended attribute repair -`_ -series. - Fixing Directories ------------------ @@ -4448,11 +4281,6 @@ Unfortunately, the current dentry cache design doesn't provide a means to walk every child dentry of a specific directory, which makes this a hard problem. There is no known solution. -The proposed patchset is the -`directory repair -`_ -series. - Parent Pointers ``````````````` @@ -4612,11 +4440,6 @@ a :ref:`directory entry live update hook ` as follows: 7. Reap the temporary directory. -The proposed patchset is the -`parent pointers directory repair -`_ -series. - Case Study: Repairing Parent Pointers ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -4662,11 +4485,6 @@ directory reconstruction: 8. Reap the temporary file. -The proposed patchset is the -`parent pointers repair -`_ -series. - Digression: Offline Checking of Parent Pointers ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -4755,11 +4573,6 @@ connectivity checks: 4. Move on to examining link counts, as we do today. -The proposed patchset is the -`offline parent pointers repair -`_ -series. - Rebuilding directories from parent pointers in offline repair would be very challenging because xfs_repair currently uses two single-pass scans of the filesystem during phases 3 and 4 to decide which files are corrupt enough to be @@ -4903,12 +4716,6 @@ Repairing the directory tree works as follows: 6. If the subdirectory has zero paths, attach it to the lost and found. -The proposed patches are in the -`directory tree repair -`_ -series. - - .. _orphanage: The Orphanage @@ -4973,11 +4780,6 @@ Orphaned files are adopted by the orphanage as follows: 7. If a runtime error happens, call ``xrep_adoption_cancel`` to release all resources. -The proposed patches are in the -`orphanage adoption -`_ -series. - 6. Userspace Algorithms and Data Structures =========================================== @@ -5091,14 +4893,6 @@ first workqueue's workers until the backlog eases. This doesn't completely solve the balancing problem, but reduces it enough to move on to more pressing issues. -The proposed patchsets are the scrub -`performance tweaks -`_ -and the -`inode scan rebalance -`_ -series. - .. _scrubrepair: Scheduling Repairs @@ -5179,20 +4973,6 @@ immediately. Corrupt file data blocks reported by phase 6 cannot be recovered by the filesystem. -The proposed patchsets are the -`repair warning improvements -`_, -refactoring of the -`repair data dependency -`_ -and -`object tracking -`_, -and the -`repair scheduling -`_ -improvement series. - Checking Names for Confusable Unicode Sequences ----------------------------------------------- @@ -5372,6 +5152,8 @@ The extra flexibility enables several new use cases: This emulates an atomic device write in software, and can support arbitrary scattered writes. +(This functionality was merged into mainline as of 2025) + Vectorized Scrub ---------------- @@ -5393,13 +5175,7 @@ It is hoped that ``io_uring`` will pick up enough of this functionality that online fsck can use that instead of adding a separate vectored scrub system call to XFS. -The relevant patchsets are the -`kernel vectorized scrub -`_ -and -`userspace vectorized scrub -`_ -series. +(This functionality was merged into mainline as of 2025) Quality of Service Targets for Scrub ------------------------------------